From 333ab7d08b320e9b41e837a1be1602d8ea5a0df2 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 13 Nov 2018 07:47:52 +0000 Subject: [PATCH] [X86] Add more tests for -x86-experimental-vector-widening-legalization I'm looking into whether we can make this the default legalization strategy. Adding these tests to help cover the changes that will be necessary. This patch adds copies of some tests with the command line switch enabled. By making copies, it's easier to compare the two legalization strategies. I've also removed RUN lines from some of these tests that already had -x86-experimental-vector-widening-legalization. llvm-svn: 346745 --- llvm/test/CodeGen/X86/avx512-cvt-widen.ll | 2494 ++++ llvm/test/CodeGen/X86/avx512-cvt.ll | 1124 ---- llvm/test/CodeGen/X86/shrink_vmul-widen.ll | 2730 ++++ llvm/test/CodeGen/X86/vec_fp_to_int-widen.ll | 2662 ++++ llvm/test/CodeGen/X86/vec_fp_to_int.ll | 586 -- llvm/test/CodeGen/X86/vec_int_to_fp-widen.ll | 5565 ++++++++++ llvm/test/CodeGen/X86/vector-sext-widen.ll | 5095 +++++++++ .../CodeGen/X86/vector-trunc-math-widen.ll | 5690 +++++++++++ .../CodeGen/X86/vector-trunc-packus-widen.ll | 3306 ++++ .../CodeGen/X86/vector-trunc-ssat-widen.ll | 3243 ++++ .../CodeGen/X86/vector-trunc-usat-widen.ll | 2587 ++++ llvm/test/CodeGen/X86/vector-trunc-widen.ll | 2223 +++ llvm/test/CodeGen/X86/vector-zext-widen.ll | 2294 +++ 13 files changed, 37889 insertions(+), 1710 deletions(-) create mode 100644 llvm/test/CodeGen/X86/avx512-cvt-widen.ll create mode 100644 llvm/test/CodeGen/X86/shrink_vmul-widen.ll create mode 100644 llvm/test/CodeGen/X86/vec_fp_to_int-widen.ll create mode 100644 llvm/test/CodeGen/X86/vec_int_to_fp-widen.ll create mode 100644 llvm/test/CodeGen/X86/vector-sext-widen.ll create mode 100644 llvm/test/CodeGen/X86/vector-trunc-math-widen.ll create mode 100644 llvm/test/CodeGen/X86/vector-trunc-packus-widen.ll create mode 100644 llvm/test/CodeGen/X86/vector-trunc-ssat-widen.ll create mode 100644 llvm/test/CodeGen/X86/vector-trunc-usat-widen.ll create mode 100644 llvm/test/CodeGen/X86/vector-trunc-widen.ll create mode 100644 llvm/test/CodeGen/X86/vector-zext-widen.ll diff --git a/llvm/test/CodeGen/X86/avx512-cvt-widen.ll b/llvm/test/CodeGen/X86/avx512-cvt-widen.ll new file mode 100644 index 000000000000..0ea613184ab3 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512-cvt-widen.ll @@ -0,0 +1,2494 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=NODQ --check-prefix=NOVLDQ --check-prefix=KNL +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=VL --check-prefix=VLDQ --check-prefix=VLBW --check-prefix=SKX +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=NODQ --check-prefix=VL --check-prefix=VLNODQ --check-prefix=VLNOBW --check-prefix=AVX512VL +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=DQNOVL --check-prefix=AVX512DQ +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown 
-mattr=avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=NODQ --check-prefix=NOVLDQ --check-prefix=AVX512BW +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=avx512vl,avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=VL --check-prefix=VLDQ --check-prefix=VLNOBW --check-prefix=AVX512VLDQ +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=avx512vl,avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=NODQ --check-prefix=VL --check-prefix=VLNODQ --check-prefix=VLBW --check-prefix=AVX512VLBW + + +define <16 x float> @sitof32(<16 x i32> %a) nounwind { +; ALL-LABEL: sitof32: +; ALL: # %bb.0: +; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0 +; ALL-NEXT: retq + %b = sitofp <16 x i32> %a to <16 x float> + ret <16 x float> %b +} + +define <8 x double> @sltof864(<8 x i64> %a) { +; NODQ-LABEL: sltof864: +; NODQ: # %bb.0: +; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm1 +; NODQ-NEXT: vpextrq $1, %xmm1, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 +; NODQ-NEXT: vmovq %xmm1, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 +; NODQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm2 +; NODQ-NEXT: vpextrq $1, %xmm2, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm3 +; NODQ-NEXT: vmovq %xmm2, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2 +; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm2 +; NODQ-NEXT: vpextrq $1, %xmm2, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm3 +; NODQ-NEXT: vmovq %xmm2, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2 +; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm3 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm0 +; NODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; NODQ-NEXT: retq +; +; VLDQ-LABEL: sltof864: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvtqq2pd %zmm0, %zmm0 +; VLDQ-NEXT: retq +; +; DQNOVL-LABEL: sltof864: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: vcvtqq2pd %zmm0, %zmm0 +; DQNOVL-NEXT: retq + %b = sitofp <8 x i64> %a to <8 x double> + ret <8 x double> %b +} + +define <4 x double> @slto4f64(<4 x i64> %a) { +; NODQ-LABEL: slto4f64: +; NODQ: # %bb.0: +; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NODQ-NEXT: vpextrq $1, %xmm1, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 +; NODQ-NEXT: vmovq %xmm1, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 +; NODQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0 +; NODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; NODQ-NEXT: retq +; +; VLDQ-LABEL: slto4f64: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvtqq2pd %ymm0, %ymm0 +; VLDQ-NEXT: retq +; +; DQNOVL-LABEL: slto4f64: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; DQNOVL-NEXT: vcvtqq2pd %zmm0, %zmm0 +; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; DQNOVL-NEXT: retq + %b = sitofp <4 x i64> %a to <4 x double> + ret <4 x double> %b +} + +define <2 x double> @slto2f64(<2 x i64> %a) { +; NODQ-LABEL: slto2f64: +; NODQ: # %bb.0: +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; 
NODQ-NEXT: vcvtsi2sdq %rax, %xmm1, %xmm1 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm0 +; NODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; NODQ-NEXT: retq +; +; VLDQ-LABEL: slto2f64: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvtqq2pd %xmm0, %xmm0 +; VLDQ-NEXT: retq +; +; DQNOVL-LABEL: slto2f64: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; DQNOVL-NEXT: vcvtqq2pd %zmm0, %zmm0 +; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; DQNOVL-NEXT: vzeroupper +; DQNOVL-NEXT: retq + %b = sitofp <2 x i64> %a to <2 x double> + ret <2 x double> %b +} + +define <2 x float> @sltof2f32(<2 x i64> %a) { +; NODQ-LABEL: sltof2f32: +; NODQ: # %bb.0: +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 +; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1 +; NODQ-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0] +; NODQ-NEXT: retq +; +; VLDQ-LABEL: sltof2f32: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvtqq2ps %xmm0, %xmm0 +; VLDQ-NEXT: retq +; +; DQNOVL-LABEL: sltof2f32: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; DQNOVL-NEXT: vcvtqq2ps %zmm0, %ymm0 +; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; DQNOVL-NEXT: vzeroupper +; DQNOVL-NEXT: retq + %b = sitofp <2 x i64> %a to <2 x float> + ret <2 x float>%b +} + +define <4 x float> @slto4f32_mem(<4 x i64>* %a) { +; NODQ-LABEL: slto4f32_mem: +; NODQ: # %bb.0: +; NODQ-NEXT: vmovdqu (%rdi), %xmm0 +; NODQ-NEXT: vmovdqu 16(%rdi), %xmm1 +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 +; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] +; NODQ-NEXT: vmovq %xmm1, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 +; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; NODQ-NEXT: vpextrq $1, %xmm1, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1 +; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; NODQ-NEXT: retq +; +; VLDQ-LABEL: slto4f32_mem: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvtqq2psy (%rdi), %xmm0 +; VLDQ-NEXT: retq +; +; DQNOVL-LABEL: slto4f32_mem: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: vmovups (%rdi), %ymm0 +; DQNOVL-NEXT: vcvtqq2ps %zmm0, %ymm0 +; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; DQNOVL-NEXT: vzeroupper +; DQNOVL-NEXT: retq + %a1 = load <4 x i64>, <4 x i64>* %a, align 8 + %b = sitofp <4 x i64> %a1 to <4 x float> + ret <4 x float>%b +} + +define <4 x i64> @f64to4sl(<4 x double> %a) { +; NODQ-LABEL: f64to4sl: +; NODQ: # %bb.0: +; NODQ-NEXT: vextractf128 $1, %ymm0, %xmm1 +; NODQ-NEXT: vcvttsd2si %xmm1, %rax +; NODQ-NEXT: vmovq %rax, %xmm2 +; NODQ-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; NODQ-NEXT: vcvttsd2si %xmm1, %rax +; NODQ-NEXT: vmovq %rax, %xmm1 +; NODQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; NODQ-NEXT: vcvttsd2si %xmm0, %rax +; NODQ-NEXT: vmovq %rax, %xmm2 +; NODQ-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; NODQ-NEXT: vcvttsd2si %xmm0, %rax +; NODQ-NEXT: vmovq %rax, %xmm0 +; NODQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; NODQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; NODQ-NEXT: retq +; +; VLDQ-LABEL: f64to4sl: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvttpd2qq %ymm0, %ymm0 +; VLDQ-NEXT: retq +; +; DQNOVL-LABEL: f64to4sl: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; 
DQNOVL-NEXT: vcvttpd2qq %zmm0, %zmm0 +; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; DQNOVL-NEXT: retq + %b = fptosi <4 x double> %a to <4 x i64> + ret <4 x i64> %b +} + +define <4 x i64> @f32to4sl(<4 x float> %a) { +; NODQ-LABEL: f32to4sl: +; NODQ: # %bb.0: +; NODQ-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] +; NODQ-NEXT: vcvttss2si %xmm1, %rax +; NODQ-NEXT: vmovq %rax, %xmm1 +; NODQ-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; NODQ-NEXT: vcvttss2si %xmm2, %rax +; NODQ-NEXT: vmovq %rax, %xmm2 +; NODQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; NODQ-NEXT: vcvttss2si %xmm0, %rax +; NODQ-NEXT: vmovq %rax, %xmm2 +; NODQ-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; NODQ-NEXT: vcvttss2si %xmm0, %rax +; NODQ-NEXT: vmovq %rax, %xmm0 +; NODQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; NODQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; NODQ-NEXT: retq +; +; VLDQ-LABEL: f32to4sl: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvttps2qq %xmm0, %ymm0 +; VLDQ-NEXT: retq +; +; DQNOVL-LABEL: f32to4sl: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; DQNOVL-NEXT: vcvttps2qq %ymm0, %zmm0 +; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; DQNOVL-NEXT: retq + %b = fptosi <4 x float> %a to <4 x i64> + ret <4 x i64> %b +} + +define <4 x float> @slto4f32(<4 x i64> %a) { +; NODQ-LABEL: slto4f32: +; NODQ: # %bb.0: +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 +; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 +; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 +; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; NODQ-NEXT: vzeroupper +; NODQ-NEXT: retq +; +; VLDQ-LABEL: slto4f32: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvtqq2ps %ymm0, %xmm0 +; VLDQ-NEXT: vzeroupper +; VLDQ-NEXT: retq +; +; DQNOVL-LABEL: slto4f32: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; DQNOVL-NEXT: vcvtqq2ps %zmm0, %ymm0 +; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; DQNOVL-NEXT: vzeroupper +; DQNOVL-NEXT: retq + %b = sitofp <4 x i64> %a to <4 x float> + ret <4 x float> %b +} + +define <4 x float> @ulto4f32(<4 x i64> %a) { +; NODQ-LABEL: ulto4f32: +; NODQ: # %bb.0: +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2 +; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2 +; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0 +; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; NODQ-NEXT: vzeroupper +; NODQ-NEXT: retq +; +; VLDQ-LABEL: ulto4f32: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvtuqq2ps %ymm0, %xmm0 +; VLDQ-NEXT: vzeroupper +; VLDQ-NEXT: retq +; +; DQNOVL-LABEL: ulto4f32: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; DQNOVL-NEXT: vcvtuqq2ps %zmm0, %ymm0 +; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; DQNOVL-NEXT: vzeroupper +; DQNOVL-NEXT: retq + %b = uitofp <4 x i64> %a to <4 x float> + ret <4 x 
float> %b +} + +define <8 x double> @ulto8f64(<8 x i64> %a) { +; NODQ-LABEL: ulto8f64: +; NODQ: # %bb.0: +; NODQ-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm1 +; NODQ-NEXT: vporq {{.*}}(%rip){1to8}, %zmm1, %zmm1 +; NODQ-NEXT: vpsrlq $32, %zmm0, %zmm0 +; NODQ-NEXT: vporq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; NODQ-NEXT: vsubpd {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; NODQ-NEXT: vaddpd %zmm0, %zmm1, %zmm0 +; NODQ-NEXT: retq +; +; VLDQ-LABEL: ulto8f64: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvtuqq2pd %zmm0, %zmm0 +; VLDQ-NEXT: retq +; +; DQNOVL-LABEL: ulto8f64: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: vcvtuqq2pd %zmm0, %zmm0 +; DQNOVL-NEXT: retq + %b = uitofp <8 x i64> %a to <8 x double> + ret <8 x double> %b +} + +define <16 x double> @ulto16f64(<16 x i64> %a) { +; NODQ-LABEL: ulto16f64: +; NODQ: # %bb.0: +; NODQ-NEXT: vpbroadcastq {{.*#+}} zmm2 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295] +; NODQ-NEXT: vpandq %zmm2, %zmm0, %zmm3 +; NODQ-NEXT: vpbroadcastq {{.*#+}} zmm4 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200] +; NODQ-NEXT: vporq %zmm4, %zmm3, %zmm3 +; NODQ-NEXT: vpsrlq $32, %zmm0, %zmm0 +; NODQ-NEXT: vpbroadcastq {{.*#+}} zmm5 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072] +; NODQ-NEXT: vporq %zmm5, %zmm0, %zmm0 +; NODQ-NEXT: vbroadcastsd {{.*#+}} zmm6 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25] +; NODQ-NEXT: vsubpd %zmm6, %zmm0, %zmm0 +; NODQ-NEXT: vaddpd %zmm0, %zmm3, %zmm0 +; NODQ-NEXT: vpandq %zmm2, %zmm1, %zmm2 +; NODQ-NEXT: vporq %zmm4, %zmm2, %zmm2 +; NODQ-NEXT: vpsrlq $32, %zmm1, %zmm1 +; NODQ-NEXT: vporq %zmm5, %zmm1, %zmm1 +; NODQ-NEXT: vsubpd %zmm6, %zmm1, %zmm1 +; NODQ-NEXT: vaddpd %zmm1, %zmm2, %zmm1 +; NODQ-NEXT: retq +; +; VLDQ-LABEL: ulto16f64: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvtuqq2pd %zmm0, %zmm0 +; VLDQ-NEXT: vcvtuqq2pd %zmm1, %zmm1 +; VLDQ-NEXT: retq +; +; DQNOVL-LABEL: ulto16f64: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: vcvtuqq2pd %zmm0, %zmm0 +; DQNOVL-NEXT: vcvtuqq2pd %zmm1, %zmm1 +; DQNOVL-NEXT: retq + %b = uitofp <16 x i64> %a to <16 x double> + ret <16 x double> %b +} + +define <16 x i32> @f64to16si(<16 x float> %a) nounwind { +; ALL-LABEL: f64to16si: +; ALL: # %bb.0: +; ALL-NEXT: vcvttps2dq %zmm0, %zmm0 +; ALL-NEXT: retq + %b = fptosi <16 x float> %a to <16 x i32> + ret <16 x i32> %b +} + +define <16 x i8> @f32to16sc(<16 x float> %f) { +; ALL-LABEL: f32to16sc: +; ALL: # %bb.0: +; ALL-NEXT: vcvttps2dq %zmm0, %zmm0 +; ALL-NEXT: vpmovdb %zmm0, %xmm0 +; ALL-NEXT: vzeroupper +; ALL-NEXT: retq + %res = fptosi <16 x float> %f to <16 x i8> + ret <16 x i8> %res +} + +define <16 x i16> @f32to16ss(<16 x float> %f) { +; ALL-LABEL: f32to16ss: +; ALL: # %bb.0: +; ALL-NEXT: vcvttps2dq %zmm0, %zmm0 +; ALL-NEXT: vpmovdw %zmm0, %ymm0 +; ALL-NEXT: retq + %res = fptosi <16 x float> %f to <16 x i16> + ret <16 x i16> %res +} + +define <16 x i32> @f32to16ui(<16 x float> %a) nounwind { +; ALL-LABEL: f32to16ui: +; ALL: # %bb.0: +; ALL-NEXT: vcvttps2udq %zmm0, %zmm0 +; ALL-NEXT: retq + %b = fptoui <16 x float> %a to <16 x i32> + ret <16 x i32> %b +} + +define <16 x i8> @f32to16uc(<16 x float> %f) { +; ALL-LABEL: f32to16uc: +; ALL: # %bb.0: +; ALL-NEXT: vcvttps2dq %zmm0, %zmm0 +; ALL-NEXT: 
vpmovdb %zmm0, %xmm0 +; ALL-NEXT: vzeroupper +; ALL-NEXT: retq + %res = fptoui <16 x float> %f to <16 x i8> + ret <16 x i8> %res +} + +define <16 x i16> @f32to16us(<16 x float> %f) { +; ALL-LABEL: f32to16us: +; ALL: # %bb.0: +; ALL-NEXT: vcvttps2dq %zmm0, %zmm0 +; ALL-NEXT: vpmovdw %zmm0, %ymm0 +; ALL-NEXT: retq + %res = fptoui <16 x float> %f to <16 x i16> + ret <16 x i16> %res +} + +define <8 x i32> @f32to8ui(<8 x float> %a) nounwind { +; NOVL-LABEL: f32to8ui: +; NOVL: # %bb.0: +; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; NOVL-NEXT: vcvttps2udq %zmm0, %zmm0 +; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; NOVL-NEXT: retq +; +; VL-LABEL: f32to8ui: +; VL: # %bb.0: +; VL-NEXT: vcvttps2udq %ymm0, %ymm0 +; VL-NEXT: retq + %b = fptoui <8 x float> %a to <8 x i32> + ret <8 x i32> %b +} + +define <4 x i32> @f32to4ui(<4 x float> %a) nounwind { +; NOVL-LABEL: f32to4ui: +; NOVL: # %bb.0: +; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; NOVL-NEXT: vcvttps2udq %zmm0, %zmm0 +; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; NOVL-NEXT: vzeroupper +; NOVL-NEXT: retq +; +; VL-LABEL: f32to4ui: +; VL: # %bb.0: +; VL-NEXT: vcvttps2udq %xmm0, %xmm0 +; VL-NEXT: retq + %b = fptoui <4 x float> %a to <4 x i32> + ret <4 x i32> %b +} + +define <8 x i32> @f64to8ui(<8 x double> %a) nounwind { +; ALL-LABEL: f64to8ui: +; ALL: # %bb.0: +; ALL-NEXT: vcvttpd2udq %zmm0, %ymm0 +; ALL-NEXT: retq + %b = fptoui <8 x double> %a to <8 x i32> + ret <8 x i32> %b +} + +define <8 x i16> @f64to8us(<8 x double> %f) { +; NOVL-LABEL: f64to8us: +; NOVL: # %bb.0: +; NOVL-NEXT: vcvttpd2dq %zmm0, %ymm0 +; NOVL-NEXT: vpmovdw %zmm0, %ymm0 +; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; NOVL-NEXT: vzeroupper +; NOVL-NEXT: retq +; +; VL-LABEL: f64to8us: +; VL: # %bb.0: +; VL-NEXT: vcvttpd2dq %zmm0, %ymm0 +; VL-NEXT: vpmovdw %ymm0, %xmm0 +; VL-NEXT: vzeroupper +; VL-NEXT: retq + %res = fptoui <8 x double> %f to <8 x i16> + ret <8 x i16> %res +} + +define <8 x i8> @f64to8uc(<8 x double> %f) { +; ALL-LABEL: f64to8uc: +; ALL: # %bb.0: +; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; ALL-NEXT: vcvttsd2si %xmm1, %eax +; ALL-NEXT: vcvttsd2si %xmm0, %ecx +; ALL-NEXT: vmovd %ecx, %xmm1 +; ALL-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; ALL-NEXT: vextractf128 $1, %ymm0, %xmm2 +; ALL-NEXT: vcvttsd2si %xmm2, %eax +; ALL-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; ALL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; ALL-NEXT: vcvttsd2si %xmm2, %eax +; ALL-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; ALL-NEXT: vextractf32x4 $2, %zmm0, %xmm2 +; ALL-NEXT: vcvttsd2si %xmm2, %eax +; ALL-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; ALL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; ALL-NEXT: vcvttsd2si %xmm2, %eax +; ALL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; ALL-NEXT: vextractf32x4 $3, %zmm0, %xmm0 +; ALL-NEXT: vcvttsd2si %xmm0, %eax +; ALL-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; ALL-NEXT: vcvttsd2si %xmm0, %eax +; ALL-NEXT: vpinsrb $7, %eax, %xmm1, %xmm0 +; ALL-NEXT: vzeroupper +; ALL-NEXT: retq + %res = fptoui <8 x double> %f to <8 x i8> + ret <8 x i8> %res +} + +define <4 x i32> @f64to4ui(<4 x double> %a) nounwind { +; NOVL-LABEL: f64to4ui: +; NOVL: # %bb.0: +; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; NOVL-NEXT: vcvttpd2udq %zmm0, %ymm0 +; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; NOVL-NEXT: vzeroupper +; NOVL-NEXT: retq +; +; VL-LABEL: f64to4ui: +; VL: # %bb.0: +; VL-NEXT: vcvttpd2udq %ymm0, %xmm0 +; VL-NEXT: vzeroupper +; VL-NEXT: retq + %b = 
fptoui <4 x double> %a to <4 x i32> + ret <4 x i32> %b +} + +define <8 x double> @sito8f64(<8 x i32> %a) { +; ALL-LABEL: sito8f64: +; ALL: # %bb.0: +; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0 +; ALL-NEXT: retq + %b = sitofp <8 x i32> %a to <8 x double> + ret <8 x double> %b +} +define <8 x double> @i32to8f64_mask(<8 x double> %a, <8 x i32> %b, i8 %c) nounwind { +; KNL-LABEL: i32to8f64_mask: +; KNL: # %bb.0: +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1} +; KNL-NEXT: retq +; +; VLBW-LABEL: i32to8f64_mask: +; VLBW: # %bb.0: +; VLBW-NEXT: kmovd %edi, %k1 +; VLBW-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1} +; VLBW-NEXT: retq +; +; VLNOBW-LABEL: i32to8f64_mask: +; VLNOBW: # %bb.0: +; VLNOBW-NEXT: kmovw %edi, %k1 +; VLNOBW-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1} +; VLNOBW-NEXT: retq +; +; DQNOVL-LABEL: i32to8f64_mask: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: kmovw %edi, %k1 +; DQNOVL-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1} +; DQNOVL-NEXT: retq +; +; AVX512BW-LABEL: i32to8f64_mask: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1} +; AVX512BW-NEXT: retq + %1 = bitcast i8 %c to <8 x i1> + %2 = sitofp <8 x i32> %b to <8 x double> + %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> %a + ret <8 x double> %3 +} +define <8 x double> @sito8f64_maskz(<8 x i32> %a, i8 %b) nounwind { +; KNL-LABEL: sito8f64_maskz: +; KNL: # %bb.0: +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z} +; KNL-NEXT: retq +; +; VLBW-LABEL: sito8f64_maskz: +; VLBW: # %bb.0: +; VLBW-NEXT: kmovd %edi, %k1 +; VLBW-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z} +; VLBW-NEXT: retq +; +; VLNOBW-LABEL: sito8f64_maskz: +; VLNOBW: # %bb.0: +; VLNOBW-NEXT: kmovw %edi, %k1 +; VLNOBW-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z} +; VLNOBW-NEXT: retq +; +; DQNOVL-LABEL: sito8f64_maskz: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: kmovw %edi, %k1 +; DQNOVL-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z} +; DQNOVL-NEXT: retq +; +; AVX512BW-LABEL: sito8f64_maskz: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq + %1 = bitcast i8 %b to <8 x i1> + %2 = sitofp <8 x i32> %a to <8 x double> + %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> zeroinitializer + ret <8 x double> %3 +} + +define <8 x i32> @f64to8si(<8 x double> %a) { +; ALL-LABEL: f64to8si: +; ALL: # %bb.0: +; ALL-NEXT: vcvttpd2dq %zmm0, %ymm0 +; ALL-NEXT: retq + %b = fptosi <8 x double> %a to <8 x i32> + ret <8 x i32> %b +} + +define <4 x i32> @f64to4si(<4 x double> %a) { +; ALL-LABEL: f64to4si: +; ALL: # %bb.0: +; ALL-NEXT: vcvttpd2dq %ymm0, %xmm0 +; ALL-NEXT: vzeroupper +; ALL-NEXT: retq + %b = fptosi <4 x double> %a to <4 x i32> + ret <4 x i32> %b +} + +define <16 x float> @f64to16f32(<16 x double> %b) nounwind { +; ALL-LABEL: f64to16f32: +; ALL: # %bb.0: +; ALL-NEXT: vcvtpd2ps %zmm0, %ymm0 +; ALL-NEXT: vcvtpd2ps %zmm1, %ymm1 +; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; ALL-NEXT: retq + %a = fptrunc <16 x double> %b to <16 x float> + ret <16 x float> %a +} + +define <4 x float> @f64to4f32(<4 x double> %b) { +; ALL-LABEL: f64to4f32: +; ALL: # %bb.0: +; ALL-NEXT: vcvtpd2ps %ymm0, %xmm0 +; ALL-NEXT: vzeroupper +; ALL-NEXT: retq + %a = fptrunc <4 x double> %b to <4 x float> + ret <4 x float> %a +} + +define <4 x float> @f64to4f32_mask(<4 x double> %b, <4 x i1> %mask) { +; NOVLDQ-LABEL: f64to4f32_mask: +; NOVLDQ: # %bb.0: +; NOVLDQ-NEXT: vpslld $31, %xmm1, %xmm1 +; NOVLDQ-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NOVLDQ-NEXT: vcvtpd2ps %ymm0, %xmm0 +; NOVLDQ-NEXT: 
vmovaps %zmm0, %zmm0 {%k1} {z} +; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; NOVLDQ-NEXT: vzeroupper +; NOVLDQ-NEXT: retq +; +; VLDQ-LABEL: f64to4f32_mask: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vpslld $31, %xmm1, %xmm1 +; VLDQ-NEXT: vpmovd2m %xmm1, %k1 +; VLDQ-NEXT: vcvtpd2ps %ymm0, %xmm0 {%k1} {z} +; VLDQ-NEXT: vzeroupper +; VLDQ-NEXT: retq +; +; VLNODQ-LABEL: f64to4f32_mask: +; VLNODQ: # %bb.0: +; VLNODQ-NEXT: vpslld $31, %xmm1, %xmm1 +; VLNODQ-NEXT: vptestmd %xmm1, %xmm1, %k1 +; VLNODQ-NEXT: vcvtpd2ps %ymm0, %xmm0 {%k1} {z} +; VLNODQ-NEXT: vzeroupper +; VLNODQ-NEXT: retq +; +; DQNOVL-LABEL: f64to4f32_mask: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: vpslld $31, %xmm1, %xmm1 +; DQNOVL-NEXT: vpmovd2m %zmm1, %k1 +; DQNOVL-NEXT: vcvtpd2ps %ymm0, %xmm0 +; DQNOVL-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z} +; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; DQNOVL-NEXT: vzeroupper +; DQNOVL-NEXT: retq + %a = fptrunc <4 x double> %b to <4 x float> + %c = select <4 x i1>%mask, <4 x float>%a, <4 x float> zeroinitializer + ret <4 x float> %c +} + +define <4 x float> @f64tof32_inreg(<2 x double> %a0, <4 x float> %a1) nounwind { +; ALL-LABEL: f64tof32_inreg: +; ALL: # %bb.0: +; ALL-NEXT: vcvtsd2ss %xmm0, %xmm1, %xmm0 +; ALL-NEXT: retq + %ext = extractelement <2 x double> %a0, i32 0 + %cvt = fptrunc double %ext to float + %res = insertelement <4 x float> %a1, float %cvt, i32 0 + ret <4 x float> %res +} + +define <8 x double> @f32to8f64(<8 x float> %b) nounwind { +; ALL-LABEL: f32to8f64: +; ALL: # %bb.0: +; ALL-NEXT: vcvtps2pd %ymm0, %zmm0 +; ALL-NEXT: retq + %a = fpext <8 x float> %b to <8 x double> + ret <8 x double> %a +} + +define <4 x double> @f32to4f64_mask(<4 x float> %b, <4 x double> %b1, <4 x double> %a1) { +; NOVL-LABEL: f32to4f64_mask: +; NOVL: # %bb.0: +; NOVL-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 +; NOVL-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; NOVL-NEXT: vcvtps2pd %xmm0, %ymm0 +; NOVL-NEXT: vcmpltpd %zmm2, %zmm1, %k1 +; NOVL-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z} +; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; NOVL-NEXT: retq +; +; VL-LABEL: f32to4f64_mask: +; VL: # %bb.0: +; VL-NEXT: vcmpltpd %ymm2, %ymm1, %k1 +; VL-NEXT: vcvtps2pd %xmm0, %ymm0 {%k1} {z} +; VL-NEXT: retq + %a = fpext <4 x float> %b to <4 x double> + %mask = fcmp ogt <4 x double> %a1, %b1 + %c = select <4 x i1> %mask, <4 x double> %a, <4 x double> zeroinitializer + ret <4 x double> %c +} + +define <2 x double> @f32tof64_inreg(<2 x double> %a0, <4 x float> %a1) nounwind { +; ALL-LABEL: f32tof64_inreg: +; ALL: # %bb.0: +; ALL-NEXT: vcvtss2sd %xmm1, %xmm0, %xmm0 +; ALL-NEXT: retq + %ext = extractelement <4 x float> %a1, i32 0 + %cvt = fpext float %ext to double + %res = insertelement <2 x double> %a0, double %cvt, i32 0 + ret <2 x double> %res +} + +define double @sltof64_load(i64* nocapture %e) { +; ALL-LABEL: sltof64_load: +; ALL: # %bb.0: # %entry +; ALL-NEXT: vcvtsi2sdq (%rdi), %xmm0, %xmm0 +; ALL-NEXT: retq +entry: + %tmp1 = load i64, i64* %e, align 8 + %conv = sitofp i64 %tmp1 to double + ret double %conv +} + +define double @sitof64_load(i32* %e) { +; ALL-LABEL: sitof64_load: +; ALL: # %bb.0: # %entry +; ALL-NEXT: vcvtsi2sdl (%rdi), %xmm0, %xmm0 +; ALL-NEXT: retq +entry: + %tmp1 = load i32, i32* %e, align 4 + %conv = sitofp i32 %tmp1 to double + ret double %conv +} + +define float @sitof32_load(i32* %e) { +; ALL-LABEL: sitof32_load: +; ALL: # %bb.0: # %entry +; ALL-NEXT: vcvtsi2ssl (%rdi), %xmm0, %xmm0 +; ALL-NEXT: retq +entry: + %tmp1 = load i32, i32* %e, align 4 + %conv = sitofp 
i32 %tmp1 to float + ret float %conv +} + +define float @sltof32_load(i64* %e) { +; ALL-LABEL: sltof32_load: +; ALL: # %bb.0: # %entry +; ALL-NEXT: vcvtsi2ssq (%rdi), %xmm0, %xmm0 +; ALL-NEXT: retq +entry: + %tmp1 = load i64, i64* %e, align 8 + %conv = sitofp i64 %tmp1 to float + ret float %conv +} + +define void @f32tof64_loadstore() { +; ALL-LABEL: f32tof64_loadstore: +; ALL: # %bb.0: # %entry +; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 +; ALL-NEXT: vmovsd %xmm0, -{{[0-9]+}}(%rsp) +; ALL-NEXT: retq +entry: + %f = alloca float, align 4 + %d = alloca double, align 8 + %tmp = load float, float* %f, align 4 + %conv = fpext float %tmp to double + store double %conv, double* %d, align 8 + ret void +} + +define void @f64tof32_loadstore() nounwind uwtable { +; ALL-LABEL: f64tof32_loadstore: +; ALL: # %bb.0: # %entry +; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; ALL-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 +; ALL-NEXT: vmovss %xmm0, -{{[0-9]+}}(%rsp) +; ALL-NEXT: retq +entry: + %f = alloca float, align 4 + %d = alloca double, align 8 + %tmp = load double, double* %d, align 8 + %conv = fptrunc double %tmp to float + store float %conv, float* %f, align 4 + ret void +} + +define double @long_to_double(i64 %x) { +; ALL-LABEL: long_to_double: +; ALL: # %bb.0: +; ALL-NEXT: vmovq %rdi, %xmm0 +; ALL-NEXT: retq + %res = bitcast i64 %x to double + ret double %res +} + +define i64 @double_to_long(double %x) { +; ALL-LABEL: double_to_long: +; ALL: # %bb.0: +; ALL-NEXT: vmovq %xmm0, %rax +; ALL-NEXT: retq + %res = bitcast double %x to i64 + ret i64 %res +} + +define float @int_to_float(i32 %x) { +; ALL-LABEL: int_to_float: +; ALL: # %bb.0: +; ALL-NEXT: vmovd %edi, %xmm0 +; ALL-NEXT: retq + %res = bitcast i32 %x to float + ret float %res +} + +define i32 @float_to_int(float %x) { +; ALL-LABEL: float_to_int: +; ALL: # %bb.0: +; ALL-NEXT: vmovd %xmm0, %eax +; ALL-NEXT: retq + %res = bitcast float %x to i32 + ret i32 %res +} + +define <16 x double> @uito16f64(<16 x i32> %a) nounwind { +; ALL-LABEL: uito16f64: +; ALL: # %bb.0: +; ALL-NEXT: vcvtudq2pd %ymm0, %zmm2 +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vcvtudq2pd %ymm0, %zmm1 +; ALL-NEXT: vmovaps %zmm2, %zmm0 +; ALL-NEXT: retq + %b = uitofp <16 x i32> %a to <16 x double> + ret <16 x double> %b +} + +define <8 x float> @slto8f32(<8 x i64> %a) { +; NODQ-LABEL: slto8f32: +; NODQ: # %bb.0: +; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm1 +; NODQ-NEXT: vpextrq $1, %xmm1, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 +; NODQ-NEXT: vmovq %xmm1, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1 +; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] +; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm2 +; NODQ-NEXT: vmovq %xmm2, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 +; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] +; NODQ-NEXT: vpextrq $1, %xmm2, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2 +; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3 +; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] +; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3 +; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm0 +; NODQ-NEXT: vinsertps {{.*#+}} xmm0 
= xmm2[0,1,2],xmm0[0] +; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; NODQ-NEXT: retq +; +; VLDQ-LABEL: slto8f32: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvtqq2ps %zmm0, %ymm0 +; VLDQ-NEXT: retq +; +; DQNOVL-LABEL: slto8f32: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: vcvtqq2ps %zmm0, %ymm0 +; DQNOVL-NEXT: retq + %b = sitofp <8 x i64> %a to <8 x float> + ret <8 x float> %b +} + +define <16 x float> @slto16f32(<16 x i64> %a) { +; NODQ-LABEL: slto16f32: +; NODQ: # %bb.0: +; NODQ-NEXT: vextracti32x4 $2, %zmm1, %xmm2 +; NODQ-NEXT: vpextrq $1, %xmm2, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 +; NODQ-NEXT: vmovq %xmm2, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2 +; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] +; NODQ-NEXT: vextracti32x4 $3, %zmm1, %xmm3 +; NODQ-NEXT: vmovq %xmm3, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4 +; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] +; NODQ-NEXT: vpextrq $1, %xmm3, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 +; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] +; NODQ-NEXT: vpextrq $1, %xmm1, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 +; NODQ-NEXT: vmovq %xmm1, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm4 +; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] +; NODQ-NEXT: vextracti128 $1, %ymm1, %xmm1 +; NODQ-NEXT: vmovq %xmm1, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm4 +; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] +; NODQ-NEXT: vpextrq $1, %xmm1, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm1 +; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0] +; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm2 +; NODQ-NEXT: vpextrq $1, %xmm2, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 +; NODQ-NEXT: vmovq %xmm2, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm2 +; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] +; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm3 +; NODQ-NEXT: vmovq %xmm3, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm4 +; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] +; NODQ-NEXT: vpextrq $1, %xmm3, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 +; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm4 +; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] +; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm4 +; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm0 +; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0] +; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; NODQ-NEXT: retq +; +; VLDQ-LABEL: slto16f32: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvtqq2ps %zmm0, %ymm0 +; VLDQ-NEXT: vcvtqq2ps %zmm1, %ymm1 +; VLDQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; VLDQ-NEXT: retq +; +; DQNOVL-LABEL: slto16f32: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: vcvtqq2ps %zmm0, %ymm0 +; DQNOVL-NEXT: vcvtqq2ps %zmm1, %ymm1 +; DQNOVL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; DQNOVL-NEXT: retq + %b = sitofp <16 x i64> %a to <16 x float> + ret <16 x float> %b +} + +define <8 x double> @slto8f64(<8 x i64> %a) { +; NODQ-LABEL: slto8f64: +; NODQ: # %bb.0: +; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm1 +; NODQ-NEXT: 
vpextrq $1, %xmm1, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 +; NODQ-NEXT: vmovq %xmm1, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 +; NODQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm2 +; NODQ-NEXT: vpextrq $1, %xmm2, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm3 +; NODQ-NEXT: vmovq %xmm2, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2 +; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm2 +; NODQ-NEXT: vpextrq $1, %xmm2, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm3 +; NODQ-NEXT: vmovq %xmm2, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2 +; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm3 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm0 +; NODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; NODQ-NEXT: retq +; +; VLDQ-LABEL: slto8f64: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvtqq2pd %zmm0, %zmm0 +; VLDQ-NEXT: retq +; +; DQNOVL-LABEL: slto8f64: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: vcvtqq2pd %zmm0, %zmm0 +; DQNOVL-NEXT: retq + %b = sitofp <8 x i64> %a to <8 x double> + ret <8 x double> %b +} + +define <16 x double> @slto16f64(<16 x i64> %a) { +; NODQ-LABEL: slto16f64: +; NODQ: # %bb.0: +; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm2 +; NODQ-NEXT: vpextrq $1, %xmm2, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm3 +; NODQ-NEXT: vmovq %xmm2, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2 +; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm3 +; NODQ-NEXT: vpextrq $1, %xmm3, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm4 +; NODQ-NEXT: vmovq %xmm3, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3 +; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm3 +; NODQ-NEXT: vpextrq $1, %xmm3, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4 +; NODQ-NEXT: vmovq %xmm3, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3 +; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm0 +; NODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; NODQ-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; NODQ-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; NODQ-NEXT: vextracti32x4 $3, %zmm1, %xmm2 +; NODQ-NEXT: vpextrq $1, %xmm2, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3 +; NODQ-NEXT: vmovq %xmm2, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm2 +; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; NODQ-NEXT: vextracti32x4 $2, %zmm1, %xmm3 +; NODQ-NEXT: vpextrq $1, %xmm3, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4 +; NODQ-NEXT: vmovq %xmm3, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3 +; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; NODQ-NEXT: vextracti128 $1, %ymm1, %xmm3 +; NODQ-NEXT: vpextrq $1, %xmm3, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4 +; NODQ-NEXT: vmovq %xmm3, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3 +; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; NODQ-NEXT: vpextrq $1, %xmm1, %rax +; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4 +; NODQ-NEXT: vmovq %xmm1, %rax +; 
NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm1 +; NODQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; NODQ-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; NODQ-NEXT: vinsertf64x4 $1, %ymm2, %zmm1, %zmm1 +; NODQ-NEXT: retq +; +; VLDQ-LABEL: slto16f64: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvtqq2pd %zmm0, %zmm0 +; VLDQ-NEXT: vcvtqq2pd %zmm1, %zmm1 +; VLDQ-NEXT: retq +; +; DQNOVL-LABEL: slto16f64: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: vcvtqq2pd %zmm0, %zmm0 +; DQNOVL-NEXT: vcvtqq2pd %zmm1, %zmm1 +; DQNOVL-NEXT: retq + %b = sitofp <16 x i64> %a to <16 x double> + ret <16 x double> %b +} + +define <8 x float> @ulto8f32(<8 x i64> %a) { +; NODQ-LABEL: ulto8f32: +; NODQ: # %bb.0: +; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm1 +; NODQ-NEXT: vpextrq $1, %xmm1, %rax +; NODQ-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2 +; NODQ-NEXT: vmovq %xmm1, %rax +; NODQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm1 +; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] +; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm2 +; NODQ-NEXT: vmovq %xmm2, %rax +; NODQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm3 +; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] +; NODQ-NEXT: vpextrq $1, %xmm2, %rax +; NODQ-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm2 +; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm2 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm3 +; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] +; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm3 +; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm0 +; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] +; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; NODQ-NEXT: retq +; +; VLDQ-LABEL: ulto8f32: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvtuqq2ps %zmm0, %ymm0 +; VLDQ-NEXT: retq +; +; DQNOVL-LABEL: ulto8f32: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: vcvtuqq2ps %zmm0, %ymm0 +; DQNOVL-NEXT: retq + %b = uitofp <8 x i64> %a to <8 x float> + ret <8 x float> %b +} + +define <16 x float> @ulto16f32(<16 x i64> %a) { +; NODQ-LABEL: ulto16f32: +; NODQ: # %bb.0: +; NODQ-NEXT: vextracti32x4 $2, %zmm1, %xmm2 +; NODQ-NEXT: vpextrq $1, %xmm2, %rax +; NODQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm3 +; NODQ-NEXT: vmovq %xmm2, %rax +; NODQ-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm2 +; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] +; NODQ-NEXT: vextracti32x4 $3, %zmm1, %xmm3 +; NODQ-NEXT: vmovq %xmm3, %rax +; NODQ-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm4 +; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] +; NODQ-NEXT: vpextrq $1, %xmm3, %rax +; NODQ-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm3 +; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] +; NODQ-NEXT: vpextrq $1, %xmm1, %rax +; NODQ-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm3 +; NODQ-NEXT: vmovq %xmm1, %rax +; NODQ-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm4 +; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] +; NODQ-NEXT: vextracti128 $1, %ymm1, %xmm1 +; NODQ-NEXT: vmovq %xmm1, %rax +; NODQ-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm4 +; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] +; NODQ-NEXT: vpextrq $1, %xmm1, %rax +; NODQ-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm1 +; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0] +; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm2 +; NODQ-NEXT: vpextrq $1, 
%xmm2, %rax +; NODQ-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm3 +; NODQ-NEXT: vmovq %xmm2, %rax +; NODQ-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm2 +; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] +; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm3 +; NODQ-NEXT: vmovq %xmm3, %rax +; NODQ-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm4 +; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] +; NODQ-NEXT: vpextrq $1, %xmm3, %rax +; NODQ-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm3 +; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm3 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm4 +; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] +; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm4 +; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm0 +; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0] +; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; NODQ-NEXT: retq +; +; VLDQ-LABEL: ulto16f32: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvtuqq2ps %zmm0, %ymm0 +; VLDQ-NEXT: vcvtuqq2ps %zmm1, %ymm1 +; VLDQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; VLDQ-NEXT: retq +; +; DQNOVL-LABEL: ulto16f32: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: vcvtuqq2ps %zmm0, %ymm0 +; DQNOVL-NEXT: vcvtuqq2ps %zmm1, %ymm1 +; DQNOVL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; DQNOVL-NEXT: retq + %b = uitofp <16 x i64> %a to <16 x float> + ret <16 x float> %b +} + +define <8 x double> @uito8f64_mask(<8 x double> %a, <8 x i32> %b, i8 %c) nounwind { +; KNL-LABEL: uito8f64_mask: +; KNL: # %bb.0: +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1} +; KNL-NEXT: retq +; +; VLBW-LABEL: uito8f64_mask: +; VLBW: # %bb.0: +; VLBW-NEXT: kmovd %edi, %k1 +; VLBW-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1} +; VLBW-NEXT: retq +; +; VLNOBW-LABEL: uito8f64_mask: +; VLNOBW: # %bb.0: +; VLNOBW-NEXT: kmovw %edi, %k1 +; VLNOBW-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1} +; VLNOBW-NEXT: retq +; +; DQNOVL-LABEL: uito8f64_mask: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: kmovw %edi, %k1 +; DQNOVL-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1} +; DQNOVL-NEXT: retq +; +; AVX512BW-LABEL: uito8f64_mask: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1} +; AVX512BW-NEXT: retq + %1 = bitcast i8 %c to <8 x i1> + %2 = uitofp <8 x i32> %b to <8 x double> + %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> %a + ret <8 x double> %3 +} +define <8 x double> @uito8f64_maskz(<8 x i32> %a, i8 %b) nounwind { +; KNL-LABEL: uito8f64_maskz: +; KNL: # %bb.0: +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z} +; KNL-NEXT: retq +; +; VLBW-LABEL: uito8f64_maskz: +; VLBW: # %bb.0: +; VLBW-NEXT: kmovd %edi, %k1 +; VLBW-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z} +; VLBW-NEXT: retq +; +; VLNOBW-LABEL: uito8f64_maskz: +; VLNOBW: # %bb.0: +; VLNOBW-NEXT: kmovw %edi, %k1 +; VLNOBW-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z} +; VLNOBW-NEXT: retq +; +; DQNOVL-LABEL: uito8f64_maskz: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: kmovw %edi, %k1 +; DQNOVL-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z} +; DQNOVL-NEXT: retq +; +; AVX512BW-LABEL: uito8f64_maskz: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq + %1 = bitcast i8 %b to <8 x i1> + 
%2 = uitofp <8 x i32> %a to <8 x double> + %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> zeroinitializer + ret <8 x double> %3 +} + +define <4 x double> @uito4f64(<4 x i32> %a) nounwind { +; NOVL-LABEL: uito4f64: +; NOVL: # %bb.0: +; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; NOVL-NEXT: vcvtudq2pd %ymm0, %zmm0 +; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; NOVL-NEXT: retq +; +; VL-LABEL: uito4f64: +; VL: # %bb.0: +; VL-NEXT: vcvtudq2pd %xmm0, %ymm0 +; VL-NEXT: retq + %b = uitofp <4 x i32> %a to <4 x double> + ret <4 x double> %b +} + +define <16 x float> @uito16f32(<16 x i32> %a) nounwind { +; ALL-LABEL: uito16f32: +; ALL: # %bb.0: +; ALL-NEXT: vcvtudq2ps %zmm0, %zmm0 +; ALL-NEXT: retq + %b = uitofp <16 x i32> %a to <16 x float> + ret <16 x float> %b +} + +define <8 x double> @uito8f64(<8 x i32> %a) { +; ALL-LABEL: uito8f64: +; ALL: # %bb.0: +; ALL-NEXT: vcvtudq2pd %ymm0, %zmm0 +; ALL-NEXT: retq + %b = uitofp <8 x i32> %a to <8 x double> + ret <8 x double> %b +} + +define <8 x float> @uito8f32(<8 x i32> %a) nounwind { +; NOVL-LABEL: uito8f32: +; NOVL: # %bb.0: +; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; NOVL-NEXT: vcvtudq2ps %zmm0, %zmm0 +; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; NOVL-NEXT: retq +; +; VL-LABEL: uito8f32: +; VL: # %bb.0: +; VL-NEXT: vcvtudq2ps %ymm0, %ymm0 +; VL-NEXT: retq + %b = uitofp <8 x i32> %a to <8 x float> + ret <8 x float> %b +} + +define <4 x float> @uito4f32(<4 x i32> %a) nounwind { +; NOVL-LABEL: uito4f32: +; NOVL: # %bb.0: +; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; NOVL-NEXT: vcvtudq2ps %zmm0, %zmm0 +; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; NOVL-NEXT: vzeroupper +; NOVL-NEXT: retq +; +; VL-LABEL: uito4f32: +; VL: # %bb.0: +; VL-NEXT: vcvtudq2ps %xmm0, %xmm0 +; VL-NEXT: retq + %b = uitofp <4 x i32> %a to <4 x float> + ret <4 x float> %b +} + +define i32 @fptosi(float %a) nounwind { +; ALL-LABEL: fptosi: +; ALL: # %bb.0: +; ALL-NEXT: vcvttss2si %xmm0, %eax +; ALL-NEXT: retq + %b = fptosi float %a to i32 + ret i32 %b +} + +define i32 @fptoui(float %a) nounwind { +; ALL-LABEL: fptoui: +; ALL: # %bb.0: +; ALL-NEXT: vcvttss2usi %xmm0, %eax +; ALL-NEXT: retq + %b = fptoui float %a to i32 + ret i32 %b +} + +define float @uitof32(i32 %a) nounwind { +; ALL-LABEL: uitof32: +; ALL: # %bb.0: +; ALL-NEXT: vcvtusi2ssl %edi, %xmm0, %xmm0 +; ALL-NEXT: retq + %b = uitofp i32 %a to float + ret float %b +} + +define double @uitof64(i32 %a) nounwind { +; ALL-LABEL: uitof64: +; ALL: # %bb.0: +; ALL-NEXT: vcvtusi2sdl %edi, %xmm0, %xmm0 +; ALL-NEXT: retq + %b = uitofp i32 %a to double + ret double %b +} + +define <16 x float> @sbto16f32(<16 x i32> %a) { +; NODQ-LABEL: sbto16f32: +; NODQ: # %bb.0: +; NODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; NODQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 +; NODQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NODQ-NEXT: vcvtdq2ps %zmm0, %zmm0 +; NODQ-NEXT: retq +; +; VLDQ-LABEL: sbto16f32: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vpmovd2m %zmm0, %k0 +; VLDQ-NEXT: vpmovm2d %k0, %zmm0 +; VLDQ-NEXT: vcvtdq2ps %zmm0, %zmm0 +; VLDQ-NEXT: retq +; +; DQNOVL-LABEL: sbto16f32: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: vpmovd2m %zmm0, %k0 +; DQNOVL-NEXT: vpmovm2d %k0, %zmm0 +; DQNOVL-NEXT: vcvtdq2ps %zmm0, %zmm0 +; DQNOVL-NEXT: retq + %mask = icmp slt <16 x i32> %a, zeroinitializer + %1 = sitofp <16 x i1> %mask to <16 x float> + ret <16 x float> %1 +} + +define <16 x float> @scto16f32(<16 x i8> %a) { +; ALL-LABEL: scto16f32: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbd %xmm0, %zmm0 +; 
ALL-NEXT: vcvtdq2ps %zmm0, %zmm0 +; ALL-NEXT: retq + %1 = sitofp <16 x i8> %a to <16 x float> + ret <16 x float> %1 +} + +define <16 x float> @ssto16f32(<16 x i16> %a) { +; ALL-LABEL: ssto16f32: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxwd %ymm0, %zmm0 +; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0 +; ALL-NEXT: retq + %1 = sitofp <16 x i16> %a to <16 x float> + ret <16 x float> %1 +} + +define <8 x double> @ssto16f64(<8 x i16> %a) { +; ALL-LABEL: ssto16f64: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxwd %xmm0, %ymm0 +; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0 +; ALL-NEXT: retq + %1 = sitofp <8 x i16> %a to <8 x double> + ret <8 x double> %1 +} + +define <8 x double> @scto8f64(<8 x i8> %a) { +; ALL-LABEL: scto8f64: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbd %xmm0, %ymm0 +; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0 +; ALL-NEXT: retq + %1 = sitofp <8 x i8> %a to <8 x double> + ret <8 x double> %1 +} + +define <16 x double> @scto16f64(<16 x i8> %a) { +; ALL-LABEL: scto16f64: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbd %xmm0, %zmm1 +; ALL-NEXT: vcvtdq2pd %ymm1, %zmm0 +; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; ALL-NEXT: vcvtdq2pd %ymm1, %zmm1 +; ALL-NEXT: retq + %b = sitofp <16 x i8> %a to <16 x double> + ret <16 x double> %b +} + +define <16 x double> @sbto16f64(<16 x double> %a) { +; NODQ-LABEL: sbto16f64: +; NODQ: # %bb.0: +; NODQ-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; NODQ-NEXT: vcmpltpd %zmm0, %zmm2, %k0 +; NODQ-NEXT: vcmpltpd %zmm1, %zmm2, %k1 +; NODQ-NEXT: kunpckbw %k0, %k1, %k1 +; NODQ-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NODQ-NEXT: vcvtdq2pd %ymm1, %zmm0 +; NODQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; NODQ-NEXT: vcvtdq2pd %ymm1, %zmm1 +; NODQ-NEXT: retq +; +; VLDQ-LABEL: sbto16f64: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; VLDQ-NEXT: vcmpltpd %zmm0, %zmm2, %k0 +; VLDQ-NEXT: vcmpltpd %zmm1, %zmm2, %k1 +; VLDQ-NEXT: kunpckbw %k0, %k1, %k0 +; VLDQ-NEXT: vpmovm2d %k0, %zmm1 +; VLDQ-NEXT: vcvtdq2pd %ymm1, %zmm0 +; VLDQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; VLDQ-NEXT: vcvtdq2pd %ymm1, %zmm1 +; VLDQ-NEXT: retq +; +; DQNOVL-LABEL: sbto16f64: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; DQNOVL-NEXT: vcmpltpd %zmm0, %zmm2, %k0 +; DQNOVL-NEXT: vcmpltpd %zmm1, %zmm2, %k1 +; DQNOVL-NEXT: kunpckbw %k0, %k1, %k0 +; DQNOVL-NEXT: vpmovm2d %k0, %zmm1 +; DQNOVL-NEXT: vcvtdq2pd %ymm1, %zmm0 +; DQNOVL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; DQNOVL-NEXT: vcvtdq2pd %ymm1, %zmm1 +; DQNOVL-NEXT: retq + %cmpres = fcmp ogt <16 x double> %a, zeroinitializer + %1 = sitofp <16 x i1> %cmpres to <16 x double> + ret <16 x double> %1 +} + +define <8 x double> @sbto8f64(<8 x double> %a) { +; NOVLDQ-LABEL: sbto8f64: +; NOVLDQ: # %bb.0: +; NOVLDQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; NOVLDQ-NEXT: vcmpltpd %zmm0, %zmm1, %k1 +; NOVLDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NOVLDQ-NEXT: vcvtdq2pd %ymm0, %zmm0 +; NOVLDQ-NEXT: retq +; +; VLDQ-LABEL: sbto8f64: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; VLDQ-NEXT: vcmpltpd %zmm0, %zmm1, %k0 +; VLDQ-NEXT: vpmovm2d %k0, %ymm0 +; VLDQ-NEXT: vcvtdq2pd %ymm0, %zmm0 +; VLDQ-NEXT: retq +; +; VLNODQ-LABEL: sbto8f64: +; VLNODQ: # %bb.0: +; VLNODQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; VLNODQ-NEXT: vcmpltpd %zmm0, %zmm1, %k1 +; VLNODQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; VLNODQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; VLNODQ-NEXT: vcvtdq2pd %ymm0, %zmm0 +; VLNODQ-NEXT: retq +; +; DQNOVL-LABEL: sbto8f64: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; DQNOVL-NEXT: vcmpltpd %zmm0, %zmm1, %k0 +; DQNOVL-NEXT: vpmovm2d %k0, 
%zmm0 +; DQNOVL-NEXT: vcvtdq2pd %ymm0, %zmm0 +; DQNOVL-NEXT: retq + %cmpres = fcmp ogt <8 x double> %a, zeroinitializer + %1 = sitofp <8 x i1> %cmpres to <8 x double> + ret <8 x double> %1 +} + +define <8 x float> @sbto8f32(<8 x float> %a) { +; ALL-LABEL: sbto8f32: +; ALL: # %bb.0: +; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; ALL-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 +; ALL-NEXT: vcvtdq2ps %ymm0, %ymm0 +; ALL-NEXT: retq + %cmpres = fcmp ogt <8 x float> %a, zeroinitializer + %1 = sitofp <8 x i1> %cmpres to <8 x float> + ret <8 x float> %1 +} + +define <4 x float> @sbto4f32(<4 x float> %a) { +; ALL-LABEL: sbto4f32: +; ALL: # %bb.0: +; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; ALL-NEXT: vcmpltps %xmm0, %xmm1, %xmm0 +; ALL-NEXT: vcvtdq2ps %xmm0, %xmm0 +; ALL-NEXT: retq + %cmpres = fcmp ogt <4 x float> %a, zeroinitializer + %1 = sitofp <4 x i1> %cmpres to <4 x float> + ret <4 x float> %1 +} + +define <4 x double> @sbto4f64(<4 x double> %a) { +; NOVL-LABEL: sbto4f64: +; NOVL: # %bb.0: +; NOVL-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; NOVL-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 +; NOVL-NEXT: vpmovqd %zmm0, %ymm0 +; NOVL-NEXT: vcvtdq2pd %xmm0, %ymm0 +; NOVL-NEXT: retq +; +; VLDQ-LABEL: sbto4f64: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; VLDQ-NEXT: vcmpltpd %ymm0, %ymm1, %k0 +; VLDQ-NEXT: vpmovm2d %k0, %xmm0 +; VLDQ-NEXT: vcvtdq2pd %xmm0, %ymm0 +; VLDQ-NEXT: retq +; +; VLNODQ-LABEL: sbto4f64: +; VLNODQ: # %bb.0: +; VLNODQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; VLNODQ-NEXT: vcmpltpd %ymm0, %ymm1, %k1 +; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; VLNODQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; VLNODQ-NEXT: vcvtdq2pd %xmm0, %ymm0 +; VLNODQ-NEXT: retq + %cmpres = fcmp ogt <4 x double> %a, zeroinitializer + %1 = sitofp <4 x i1> %cmpres to <4 x double> + ret <4 x double> %1 +} + +define <2 x float> @sbto2f32(<2 x float> %a) { +; ALL-LABEL: sbto2f32: +; ALL: # %bb.0: +; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; ALL-NEXT: vcmpltps %xmm0, %xmm1, %xmm0 +; ALL-NEXT: vcvtdq2ps %xmm0, %xmm0 +; ALL-NEXT: retq + %cmpres = fcmp ogt <2 x float> %a, zeroinitializer + %1 = sitofp <2 x i1> %cmpres to <2 x float> + ret <2 x float> %1 +} + +define <2 x double> @sbto2f64(<2 x double> %a) { +; NOVL-LABEL: sbto2f64: +; NOVL: # %bb.0: +; NOVL-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; NOVL-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 +; NOVL-NEXT: vpmovqd %zmm0, %ymm0 +; NOVL-NEXT: vcvtdq2pd %xmm0, %xmm0 +; NOVL-NEXT: vzeroupper +; NOVL-NEXT: retq +; +; VLDQ-LABEL: sbto2f64: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; VLDQ-NEXT: vcmpltpd %xmm0, %xmm1, %k0 +; VLDQ-NEXT: vpmovm2d %k0, %xmm0 +; VLDQ-NEXT: vcvtdq2pd %xmm0, %xmm0 +; VLDQ-NEXT: retq +; +; VLNODQ-LABEL: sbto2f64: +; VLNODQ: # %bb.0: +; VLNODQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; VLNODQ-NEXT: vcmpltpd %xmm0, %xmm1, %k1 +; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; VLNODQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; VLNODQ-NEXT: vcvtdq2pd %xmm0, %xmm0 +; VLNODQ-NEXT: retq + %cmpres = fcmp ogt <2 x double> %a, zeroinitializer + %1 = sitofp <2 x i1> %cmpres to <2 x double> + ret <2 x double> %1 +} + +define <16 x float> @ucto16f32(<16 x i8> %a) { +; ALL-LABEL: ucto16f32: +; ALL: # %bb.0: +; ALL-NEXT: vpmovzxbd {{.*#+}} zmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0 +; ALL-NEXT: retq + %b = uitofp <16 x i8> %a to <16 x float> + ret <16 x float>%b +} + +define <8 x double> @ucto8f64(<8 x i8> %a) { +; ALL-LABEL: ucto8f64: +; ALL: # %bb.0: +; ALL-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0 +; ALL-NEXT: retq + %b = uitofp <8 x i8> %a to <8 x double> + ret <8 x double> %b +} + +define <16 x float> @swto16f32(<16 x i16> %a) { +; ALL-LABEL: swto16f32: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxwd %ymm0, %zmm0 +; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0 +; ALL-NEXT: retq + %b = sitofp <16 x i16> %a to <16 x float> + ret <16 x float> %b +} + +define <8 x double> @swto8f64(<8 x i16> %a) { +; ALL-LABEL: swto8f64: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxwd %xmm0, %ymm0 +; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0 +; ALL-NEXT: retq + %b = sitofp <8 x i16> %a to <8 x double> + ret <8 x double> %b +} + +define <16 x double> @swto16f64(<16 x i16> %a) { +; ALL-LABEL: swto16f64: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxwd %ymm0, %zmm1 +; ALL-NEXT: vcvtdq2pd %ymm1, %zmm0 +; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; ALL-NEXT: vcvtdq2pd %ymm1, %zmm1 +; ALL-NEXT: retq + %b = sitofp <16 x i16> %a to <16 x double> + ret <16 x double> %b +} + +define <16 x double> @ucto16f64(<16 x i8> %a) { +; ALL-LABEL: ucto16f64: +; ALL: # %bb.0: +; ALL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; ALL-NEXT: vcvtdq2pd %ymm1, %zmm0 +; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; ALL-NEXT: vcvtdq2pd %ymm1, %zmm1 +; ALL-NEXT: retq + %b = uitofp <16 x i8> %a to <16 x double> + ret <16 x double> %b +} + +define <16 x float> @uwto16f32(<16 x i16> %a) { +; ALL-LABEL: uwto16f32: +; ALL: # %bb.0: +; ALL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0 +; ALL-NEXT: retq + %b = uitofp <16 x i16> %a to <16 x float> + ret <16 x float> %b +} + +define <8 x double> @uwto8f64(<8 x i16> %a) { +; ALL-LABEL: uwto8f64: +; ALL: # %bb.0: +; ALL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0 +; ALL-NEXT: retq + %b = uitofp <8 x i16> %a to <8 x double> + ret <8 x double> %b +} + +define <16 x double> @uwto16f64(<16 x i16> %a) { +; ALL-LABEL: uwto16f64: +; ALL: # %bb.0: +; ALL-NEXT: vpmovzxwd {{.*#+}} zmm1 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; ALL-NEXT: vcvtdq2pd %ymm1, %zmm0 +; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; ALL-NEXT: vcvtdq2pd %ymm1, %zmm1 +; ALL-NEXT: retq + %b = uitofp <16 x i16> %a to <16 x double> + ret <16 x double> %b +} + +define <16 x float> @sito16f32(<16 x i32> %a) { +; ALL-LABEL: sito16f32: +; ALL: # %bb.0: +; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0 +; ALL-NEXT: retq + %b = sitofp <16 x i32> %a to <16 x float> + ret <16 x float> %b +} + +define <16 x double> @sito16f64(<16 x i32> %a) { +; ALL-LABEL: sito16f64: +; ALL: # %bb.0: +; ALL-NEXT: vcvtdq2pd %ymm0, %zmm2 +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vcvtdq2pd %ymm0, %zmm1 +; ALL-NEXT: vmovaps %zmm2, %zmm0 +; ALL-NEXT: retq + %b = sitofp <16 x i32> %a to <16 x double> + ret <16 x double> %b +} + +define <16 x float> @usto16f32(<16 x i16> %a) { +; ALL-LABEL: usto16f32: +; ALL: # %bb.0: +; ALL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0 +; ALL-NEXT: retq + %b = uitofp <16 x i16> %a to <16 x float> + ret <16 x float> %b +} + +define <16 x float> @ubto16f32(<16 x i32> %a) { +; NODQ-LABEL: ubto16f32: +; NODQ: # %bb.0: +; NODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; NODQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 +; NODQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NODQ-NEXT: vpsrld $31, %zmm0, %zmm0 +; NODQ-NEXT: vcvtdq2ps %zmm0, %zmm0 +; NODQ-NEXT: retq +; +; VLDQ-LABEL: ubto16f32: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vpmovd2m %zmm0, %k0 +; VLDQ-NEXT: vpmovm2d %k0, %zmm0 +; VLDQ-NEXT: vpsrld $31, %zmm0, %zmm0 +; VLDQ-NEXT: vcvtdq2ps %zmm0, %zmm0 +; VLDQ-NEXT: retq +; +; DQNOVL-LABEL: ubto16f32: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: vpmovd2m %zmm0, %k0 +; DQNOVL-NEXT: vpmovm2d %k0, %zmm0 +; DQNOVL-NEXT: vpsrld $31, %zmm0, %zmm0 +; DQNOVL-NEXT: vcvtdq2ps %zmm0, %zmm0 +; DQNOVL-NEXT: retq + %mask = icmp slt <16 x i32> %a, zeroinitializer + %1 = uitofp <16 x i1> %mask to <16 x float> + ret <16 x float> %1 +} + +define <16 x double> @ubto16f64(<16 x i32> %a) { +; NODQ-LABEL: ubto16f64: +; NODQ: # %bb.0: +; NODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; NODQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 +; NODQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NODQ-NEXT: vpsrld $31, %zmm0, %zmm1 +; NODQ-NEXT: vcvtdq2pd %ymm1, %zmm0 +; NODQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; NODQ-NEXT: vcvtdq2pd %ymm1, %zmm1 +; NODQ-NEXT: retq +; +; VLDQ-LABEL: ubto16f64: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vpmovd2m %zmm0, %k0 +; VLDQ-NEXT: vpmovm2d %k0, %zmm0 +; VLDQ-NEXT: vpsrld $31, %zmm0, %zmm1 +; VLDQ-NEXT: vcvtdq2pd %ymm1, %zmm0 +; VLDQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; VLDQ-NEXT: vcvtdq2pd %ymm1, %zmm1 +; VLDQ-NEXT: retq +; +; DQNOVL-LABEL: ubto16f64: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: vpmovd2m %zmm0, %k0 +; DQNOVL-NEXT: vpmovm2d %k0, %zmm0 +; DQNOVL-NEXT: vpsrld $31, %zmm0, %zmm1 +; DQNOVL-NEXT: vcvtdq2pd %ymm1, %zmm0 +; DQNOVL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; DQNOVL-NEXT: vcvtdq2pd %ymm1, %zmm1 +; DQNOVL-NEXT: retq + %mask = icmp slt <16 x i32> %a, zeroinitializer + %1 = uitofp <16 x i1> %mask to <16 x double> + ret <16 x double> %1 +} + +define <8 x float> @ubto8f32(<8 x i32> %a) { +; NOVL-LABEL: ubto8f32: +; NOVL: # %bb.0: 
+; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; NOVL-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm0 +; NOVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1065353216,1065353216,1065353216,1065353216,1065353216,1065353216,1065353216,1065353216] +; NOVL-NEXT: vpand %ymm1, %ymm0, %ymm0 +; NOVL-NEXT: retq +; +; VL-LABEL: ubto8f32: +; VL: # %bb.0: +; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; VL-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm0 +; VL-NEXT: vpandd {{.*}}(%rip){1to8}, %ymm0, %ymm0 +; VL-NEXT: retq + %mask = icmp slt <8 x i32> %a, zeroinitializer + %1 = uitofp <8 x i1> %mask to <8 x float> + ret <8 x float> %1 +} + +define <8 x double> @ubto8f64(<8 x i32> %a) { +; ALL-LABEL: ubto8f64: +; ALL: # %bb.0: +; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; ALL-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm0 +; ALL-NEXT: vpsrld $31, %ymm0, %ymm0 +; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0 +; ALL-NEXT: retq + %mask = icmp slt <8 x i32> %a, zeroinitializer + %1 = uitofp <8 x i1> %mask to <8 x double> + ret <8 x double> %1 +} + +define <4 x float> @ubto4f32(<4 x i32> %a) { +; NOVL-LABEL: ubto4f32: +; NOVL: # %bb.0: +; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; NOVL-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NOVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1065353216,1065353216,1065353216,1065353216] +; NOVL-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NOVL-NEXT: retq +; +; VL-LABEL: ubto4f32: +; VL: # %bb.0: +; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; VL-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; VL-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm0, %xmm0 +; VL-NEXT: retq + %mask = icmp slt <4 x i32> %a, zeroinitializer + %1 = uitofp <4 x i1> %mask to <4 x float> + ret <4 x float> %1 +} + +define <4 x double> @ubto4f64(<4 x i32> %a) { +; ALL-LABEL: ubto4f64: +; ALL: # %bb.0: +; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; ALL-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; ALL-NEXT: vpsrld $31, %xmm0, %xmm0 +; ALL-NEXT: vcvtdq2pd %xmm0, %ymm0 +; ALL-NEXT: retq + %mask = icmp slt <4 x i32> %a, zeroinitializer + %1 = uitofp <4 x i1> %mask to <4 x double> + ret <4 x double> %1 +} + +define <2 x float> @ubto2f32(<2 x i32> %a) { +; NOVL-LABEL: ubto2f32: +; NOVL: # %bb.0: +; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; NOVL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NOVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1065353216,1065353216,1065353216,1065353216] +; NOVL-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NOVL-NEXT: retq +; +; VL-LABEL: ubto2f32: +; VL: # %bb.0: +; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; VL-NEXT: vpandnd {{.*}}(%rip){1to4}, %xmm0, %xmm0 +; VL-NEXT: retq + %mask = icmp ne <2 x i32> %a, zeroinitializer + %1 = uitofp <2 x i1> %mask to <2 x float> + ret <2 x float> %1 +} + +define <2 x double> @ubto2f64(<2 x i32> %a) { +; NOVL-LABEL: ubto2f64: +; NOVL: # %bb.0: +; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; NOVL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NOVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] +; NOVL-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NOVL-NEXT: vcvtdq2pd %xmm0, %xmm0 +; NOVL-NEXT: retq +; +; VL-LABEL: ubto2f64: +; VL: # %bb.0: +; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; VL-NEXT: vpandnd {{.*}}(%rip){1to4}, %xmm0, %xmm0 +; VL-NEXT: vcvtdq2pd %xmm0, %xmm0 +; VL-NEXT: retq + %mask = icmp ne <2 x i32> %a, zeroinitializer + %1 = uitofp <2 x i1> %mask to <2 x double> + ret <2 x double> %1 +} + +define <2 x i64> @test_2f64toub(<2 x double> %a, <2 x i64> %passthru) { +; NOVLDQ-LABEL: test_2f64toub: +; NOVLDQ: # %bb.0: +; NOVLDQ-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; NOVLDQ-NEXT: vcvttpd2udq %zmm0, 
%ymm0 +; NOVLDQ-NEXT: vpslld $31, %ymm0, %ymm0 +; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; NOVLDQ-NEXT: vzeroupper +; NOVLDQ-NEXT: retq +; +; VLDQ-LABEL: test_2f64toub: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvttpd2udq %xmm0, %xmm0 +; VLDQ-NEXT: vpslld $31, %xmm0, %xmm0 +; VLDQ-NEXT: vpmovd2m %xmm0, %k1 +; VLDQ-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} +; VLDQ-NEXT: retq +; +; VLNODQ-LABEL: test_2f64toub: +; VLNODQ: # %bb.0: +; VLNODQ-NEXT: vcvttpd2udq %xmm0, %xmm0 +; VLNODQ-NEXT: vpslld $31, %xmm0, %xmm0 +; VLNODQ-NEXT: vptestmd %xmm0, %xmm0, %k1 +; VLNODQ-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} +; VLNODQ-NEXT: retq +; +; DQNOVL-LABEL: test_2f64toub: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; DQNOVL-NEXT: vcvttpd2udq %zmm0, %ymm0 +; DQNOVL-NEXT: vpslld $31, %ymm0, %ymm0 +; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 +; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; DQNOVL-NEXT: vzeroupper +; DQNOVL-NEXT: retq + %mask = fptoui <2 x double> %a to <2 x i1> + %select = select <2 x i1> %mask, <2 x i64> %passthru, <2 x i64> zeroinitializer + ret <2 x i64> %select +} + +define <4 x i64> @test_4f64toub(<4 x double> %a, <4 x i64> %passthru) { +; NOVLDQ-LABEL: test_4f64toub: +; NOVLDQ: # %bb.0: +; NOVLDQ-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; NOVLDQ-NEXT: vcvttpd2dq %ymm0, %xmm0 +; NOVLDQ-NEXT: vpslld $31, %xmm0, %xmm0 +; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; NOVLDQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; NOVLDQ-NEXT: retq +; +; VLDQ-LABEL: test_4f64toub: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvttpd2dq %ymm0, %xmm0 +; VLDQ-NEXT: vpslld $31, %xmm0, %xmm0 +; VLDQ-NEXT: vpmovd2m %xmm0, %k1 +; VLDQ-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z} +; VLDQ-NEXT: retq +; +; VLNODQ-LABEL: test_4f64toub: +; VLNODQ: # %bb.0: +; VLNODQ-NEXT: vcvttpd2dq %ymm0, %xmm0 +; VLNODQ-NEXT: vpslld $31, %xmm0, %xmm0 +; VLNODQ-NEXT: vptestmd %xmm0, %xmm0, %k1 +; VLNODQ-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z} +; VLNODQ-NEXT: retq +; +; DQNOVL-LABEL: test_4f64toub: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; DQNOVL-NEXT: vcvttpd2dq %ymm0, %xmm0 +; DQNOVL-NEXT: vpslld $31, %xmm0, %xmm0 +; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 +; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; DQNOVL-NEXT: retq + %mask = fptoui <4 x double> %a to <4 x i1> + %select = select <4 x i1> %mask, <4 x i64> %passthru, <4 x i64> zeroinitializer + ret <4 x i64> %select +} + +define <8 x i64> @test_8f64toub(<8 x double> %a, <8 x i64> %passthru) { +; NOVLDQ-LABEL: test_8f64toub: +; NOVLDQ: # %bb.0: +; NOVLDQ-NEXT: vcvttpd2dq %zmm0, %ymm0 +; NOVLDQ-NEXT: vpslld $31, %ymm0, %ymm0 +; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; NOVLDQ-NEXT: retq +; +; VLDQ-LABEL: test_8f64toub: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvttpd2dq %zmm0, %ymm0 +; VLDQ-NEXT: vpslld $31, %ymm0, %ymm0 +; VLDQ-NEXT: vpmovd2m %ymm0, %k1 +; VLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; VLDQ-NEXT: retq +; +; VLNODQ-LABEL: test_8f64toub: +; VLNODQ: # %bb.0: +; VLNODQ-NEXT: vcvttpd2dq %zmm0, %ymm0 +; VLNODQ-NEXT: vpslld $31, %ymm0, %ymm0 +; VLNODQ-NEXT: vptestmd %ymm0, %ymm0, %k1 +; VLNODQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; VLNODQ-NEXT: retq +; 
+; DQNOVL-LABEL: test_8f64toub: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: vcvttpd2dq %zmm0, %ymm0 +; DQNOVL-NEXT: vpslld $31, %ymm0, %ymm0 +; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 +; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; DQNOVL-NEXT: retq + %mask = fptoui <8 x double> %a to <8 x i1> + %select = select <8 x i1> %mask, <8 x i64> %passthru, <8 x i64> zeroinitializer + ret <8 x i64> %select +} + +define <2 x i64> @test_2f32toub(<2 x float> %a, <2 x i64> %passthru) { +; NOVLDQ-LABEL: test_2f32toub: +; NOVLDQ: # %bb.0: +; NOVLDQ-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; NOVLDQ-NEXT: vcvttps2dq %xmm0, %xmm0 +; NOVLDQ-NEXT: vpslld $31, %xmm0, %xmm0 +; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; NOVLDQ-NEXT: vzeroupper +; NOVLDQ-NEXT: retq +; +; VLDQ-LABEL: test_2f32toub: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvttps2dq %xmm0, %xmm0 +; VLDQ-NEXT: vpslld $31, %xmm0, %xmm0 +; VLDQ-NEXT: vpmovd2m %xmm0, %k1 +; VLDQ-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} +; VLDQ-NEXT: retq +; +; VLNODQ-LABEL: test_2f32toub: +; VLNODQ: # %bb.0: +; VLNODQ-NEXT: vcvttps2dq %xmm0, %xmm0 +; VLNODQ-NEXT: vpslld $31, %xmm0, %xmm0 +; VLNODQ-NEXT: vptestmd %xmm0, %xmm0, %k1 +; VLNODQ-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} +; VLNODQ-NEXT: retq +; +; DQNOVL-LABEL: test_2f32toub: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; DQNOVL-NEXT: vcvttps2dq %xmm0, %xmm0 +; DQNOVL-NEXT: vpslld $31, %xmm0, %xmm0 +; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 +; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; DQNOVL-NEXT: vzeroupper +; DQNOVL-NEXT: retq + %mask = fptoui <2 x float> %a to <2 x i1> + %select = select <2 x i1> %mask, <2 x i64> %passthru, <2 x i64> zeroinitializer + ret <2 x i64> %select +} + +define <4 x i64> @test_4f32toub(<4 x float> %a, <4 x i64> %passthru) { +; NOVLDQ-LABEL: test_4f32toub: +; NOVLDQ: # %bb.0: +; NOVLDQ-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; NOVLDQ-NEXT: vcvttps2dq %xmm0, %xmm0 +; NOVLDQ-NEXT: vpslld $31, %xmm0, %xmm0 +; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; NOVLDQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; NOVLDQ-NEXT: retq +; +; VLDQ-LABEL: test_4f32toub: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvttps2dq %xmm0, %xmm0 +; VLDQ-NEXT: vpslld $31, %xmm0, %xmm0 +; VLDQ-NEXT: vpmovd2m %xmm0, %k1 +; VLDQ-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z} +; VLDQ-NEXT: retq +; +; VLNODQ-LABEL: test_4f32toub: +; VLNODQ: # %bb.0: +; VLNODQ-NEXT: vcvttps2dq %xmm0, %xmm0 +; VLNODQ-NEXT: vpslld $31, %xmm0, %xmm0 +; VLNODQ-NEXT: vptestmd %xmm0, %xmm0, %k1 +; VLNODQ-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z} +; VLNODQ-NEXT: retq +; +; DQNOVL-LABEL: test_4f32toub: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; DQNOVL-NEXT: vcvttps2dq %xmm0, %xmm0 +; DQNOVL-NEXT: vpslld $31, %xmm0, %xmm0 +; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 +; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; DQNOVL-NEXT: retq + %mask = fptoui <4 x float> %a to <4 x i1> + %select = select <4 x i1> %mask, <4 x i64> %passthru, <4 x i64> zeroinitializer + ret <4 x i64> %select +} + +define <8 x i64> @test_8f32toub(<8 x float> %a, <8 x i64> %passthru) { +; NOVLDQ-LABEL: test_8f32toub: +; NOVLDQ: # %bb.0: +; NOVLDQ-NEXT: vcvttps2dq %ymm0, %ymm0 +; NOVLDQ-NEXT: vpslld $31, %ymm0, %ymm0 +; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1 +; 
NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; NOVLDQ-NEXT: retq +; +; VLDQ-LABEL: test_8f32toub: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvttps2dq %ymm0, %ymm0 +; VLDQ-NEXT: vpslld $31, %ymm0, %ymm0 +; VLDQ-NEXT: vpmovd2m %ymm0, %k1 +; VLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; VLDQ-NEXT: retq +; +; VLNODQ-LABEL: test_8f32toub: +; VLNODQ: # %bb.0: +; VLNODQ-NEXT: vcvttps2dq %ymm0, %ymm0 +; VLNODQ-NEXT: vpslld $31, %ymm0, %ymm0 +; VLNODQ-NEXT: vptestmd %ymm0, %ymm0, %k1 +; VLNODQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; VLNODQ-NEXT: retq +; +; DQNOVL-LABEL: test_8f32toub: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: vcvttps2dq %ymm0, %ymm0 +; DQNOVL-NEXT: vpslld $31, %ymm0, %ymm0 +; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 +; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; DQNOVL-NEXT: retq + %mask = fptoui <8 x float> %a to <8 x i1> + %select = select <8 x i1> %mask, <8 x i64> %passthru, <8 x i64> zeroinitializer + ret <8 x i64> %select +} + +define <16 x i32> @test_16f32toub(<16 x float> %a, <16 x i32> %passthru) { +; NODQ-LABEL: test_16f32toub: +; NODQ: # %bb.0: +; NODQ-NEXT: vcvttps2dq %zmm0, %zmm0 +; NODQ-NEXT: vpslld $31, %zmm0, %zmm0 +; NODQ-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NODQ-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z} +; NODQ-NEXT: retq +; +; VLDQ-LABEL: test_16f32toub: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvttps2dq %zmm0, %zmm0 +; VLDQ-NEXT: vpslld $31, %zmm0, %zmm0 +; VLDQ-NEXT: vpmovd2m %zmm0, %k1 +; VLDQ-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z} +; VLDQ-NEXT: retq +; +; DQNOVL-LABEL: test_16f32toub: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: vcvttps2dq %zmm0, %zmm0 +; DQNOVL-NEXT: vpslld $31, %zmm0, %zmm0 +; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 +; DQNOVL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z} +; DQNOVL-NEXT: retq + %mask = fptoui <16 x float> %a to <16 x i1> + %select = select <16 x i1> %mask, <16 x i32> %passthru, <16 x i32> zeroinitializer + ret <16 x i32> %select +} + +define <2 x i64> @test_2f64tosb(<2 x double> %a, <2 x i64> %passthru) { +; NOVLDQ-LABEL: test_2f64tosb: +; NOVLDQ: # %bb.0: +; NOVLDQ-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; NOVLDQ-NEXT: vcvttpd2dq %xmm0, %xmm0 +; NOVLDQ-NEXT: vpslld $31, %xmm0, %xmm0 +; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; NOVLDQ-NEXT: vzeroupper +; NOVLDQ-NEXT: retq +; +; VLDQ-LABEL: test_2f64tosb: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvttpd2dq %xmm0, %xmm0 +; VLDQ-NEXT: vpslld $31, %xmm0, %xmm0 +; VLDQ-NEXT: vpmovd2m %xmm0, %k1 +; VLDQ-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} +; VLDQ-NEXT: retq +; +; VLNODQ-LABEL: test_2f64tosb: +; VLNODQ: # %bb.0: +; VLNODQ-NEXT: vcvttpd2dq %xmm0, %xmm0 +; VLNODQ-NEXT: vpslld $31, %xmm0, %xmm0 +; VLNODQ-NEXT: vptestmd %xmm0, %xmm0, %k1 +; VLNODQ-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} +; VLNODQ-NEXT: retq +; +; DQNOVL-LABEL: test_2f64tosb: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; DQNOVL-NEXT: vcvttpd2dq %xmm0, %xmm0 +; DQNOVL-NEXT: vpslld $31, %xmm0, %xmm0 +; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 +; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; DQNOVL-NEXT: vzeroupper +; DQNOVL-NEXT: retq + %mask = fptosi <2 x double> %a to <2 x i1> + %select = select <2 x i1> %mask, <2 x i64> %passthru, <2 x i64> zeroinitializer + ret <2 x i64> %select +} + +define <4 x i64> @test_4f64tosb(<4 x double> %a, <4 x i64> %passthru) { +; NOVLDQ-LABEL: test_4f64tosb: +; NOVLDQ: # %bb.0: +; NOVLDQ-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; 
NOVLDQ-NEXT: vcvttpd2dq %ymm0, %xmm0 +; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; NOVLDQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; NOVLDQ-NEXT: retq +; +; VLDQ-LABEL: test_4f64tosb: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvttpd2dq %ymm0, %xmm0 +; VLDQ-NEXT: vpmovd2m %xmm0, %k1 +; VLDQ-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z} +; VLDQ-NEXT: retq +; +; VLNODQ-LABEL: test_4f64tosb: +; VLNODQ: # %bb.0: +; VLNODQ-NEXT: vcvttpd2dq %ymm0, %xmm0 +; VLNODQ-NEXT: vptestmd %xmm0, %xmm0, %k1 +; VLNODQ-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z} +; VLNODQ-NEXT: retq +; +; DQNOVL-LABEL: test_4f64tosb: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; DQNOVL-NEXT: vcvttpd2dq %ymm0, %xmm0 +; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 +; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; DQNOVL-NEXT: retq + %mask = fptosi <4 x double> %a to <4 x i1> + %select = select <4 x i1> %mask, <4 x i64> %passthru, <4 x i64> zeroinitializer + ret <4 x i64> %select +} + +define <8 x i64> @test_8f64tosb(<8 x double> %a, <8 x i64> %passthru) { +; NOVLDQ-LABEL: test_8f64tosb: +; NOVLDQ: # %bb.0: +; NOVLDQ-NEXT: vcvttpd2dq %zmm0, %ymm0 +; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; NOVLDQ-NEXT: retq +; +; VLDQ-LABEL: test_8f64tosb: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvttpd2dq %zmm0, %ymm0 +; VLDQ-NEXT: vpmovd2m %ymm0, %k1 +; VLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; VLDQ-NEXT: retq +; +; VLNODQ-LABEL: test_8f64tosb: +; VLNODQ: # %bb.0: +; VLNODQ-NEXT: vcvttpd2dq %zmm0, %ymm0 +; VLNODQ-NEXT: vptestmd %ymm0, %ymm0, %k1 +; VLNODQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; VLNODQ-NEXT: retq +; +; DQNOVL-LABEL: test_8f64tosb: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: vcvttpd2dq %zmm0, %ymm0 +; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 +; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; DQNOVL-NEXT: retq + %mask = fptosi <8 x double> %a to <8 x i1> + %select = select <8 x i1> %mask, <8 x i64> %passthru, <8 x i64> zeroinitializer + ret <8 x i64> %select +} + +define <2 x i64> @test_2f32tosb(<2 x float> %a, <2 x i64> %passthru) { +; NOVLDQ-LABEL: test_2f32tosb: +; NOVLDQ: # %bb.0: +; NOVLDQ-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; NOVLDQ-NEXT: vcvttps2dq %xmm0, %xmm0 +; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; NOVLDQ-NEXT: vzeroupper +; NOVLDQ-NEXT: retq +; +; VLDQ-LABEL: test_2f32tosb: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvttps2dq %xmm0, %xmm0 +; VLDQ-NEXT: vpmovd2m %xmm0, %k1 +; VLDQ-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} +; VLDQ-NEXT: retq +; +; VLNODQ-LABEL: test_2f32tosb: +; VLNODQ: # %bb.0: +; VLNODQ-NEXT: vcvttps2dq %xmm0, %xmm0 +; VLNODQ-NEXT: vptestmd %xmm0, %xmm0, %k1 +; VLNODQ-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} +; VLNODQ-NEXT: retq +; +; DQNOVL-LABEL: test_2f32tosb: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; DQNOVL-NEXT: vcvttps2dq %xmm0, %xmm0 +; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 +; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; DQNOVL-NEXT: vzeroupper +; DQNOVL-NEXT: retq + %mask = fptosi <2 x float> %a to <2 x i1> + %select = select <2 x i1> %mask, <2 x i64> %passthru, <2 x i64> zeroinitializer + ret <2 x i64> %select +} + +define <4 x i64> @test_4f32tosb(<4 x float> %a, <4 x i64> %passthru) { +; NOVLDQ-LABEL: test_4f32tosb: +; NOVLDQ: # %bb.0: 
+; NOVLDQ-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; NOVLDQ-NEXT: vcvttps2dq %xmm0, %xmm0 +; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; NOVLDQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; NOVLDQ-NEXT: retq +; +; VLDQ-LABEL: test_4f32tosb: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvttps2dq %xmm0, %xmm0 +; VLDQ-NEXT: vpmovd2m %xmm0, %k1 +; VLDQ-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z} +; VLDQ-NEXT: retq +; +; VLNODQ-LABEL: test_4f32tosb: +; VLNODQ: # %bb.0: +; VLNODQ-NEXT: vcvttps2dq %xmm0, %xmm0 +; VLNODQ-NEXT: vptestmd %xmm0, %xmm0, %k1 +; VLNODQ-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z} +; VLNODQ-NEXT: retq +; +; DQNOVL-LABEL: test_4f32tosb: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; DQNOVL-NEXT: vcvttps2dq %xmm0, %xmm0 +; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 +; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; DQNOVL-NEXT: retq + %mask = fptosi <4 x float> %a to <4 x i1> + %select = select <4 x i1> %mask, <4 x i64> %passthru, <4 x i64> zeroinitializer + ret <4 x i64> %select +} + +define <8 x i64> @test_8f32tosb(<8 x float> %a, <8 x i64> %passthru) { +; NOVLDQ-LABEL: test_8f32tosb: +; NOVLDQ: # %bb.0: +; NOVLDQ-NEXT: vcvttps2dq %ymm0, %ymm0 +; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; NOVLDQ-NEXT: retq +; +; VLDQ-LABEL: test_8f32tosb: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvttps2dq %ymm0, %ymm0 +; VLDQ-NEXT: vpmovd2m %ymm0, %k1 +; VLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; VLDQ-NEXT: retq +; +; VLNODQ-LABEL: test_8f32tosb: +; VLNODQ: # %bb.0: +; VLNODQ-NEXT: vcvttps2dq %ymm0, %ymm0 +; VLNODQ-NEXT: vptestmd %ymm0, %ymm0, %k1 +; VLNODQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; VLNODQ-NEXT: retq +; +; DQNOVL-LABEL: test_8f32tosb: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: vcvttps2dq %ymm0, %ymm0 +; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 +; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; DQNOVL-NEXT: retq + %mask = fptosi <8 x float> %a to <8 x i1> + %select = select <8 x i1> %mask, <8 x i64> %passthru, <8 x i64> zeroinitializer + ret <8 x i64> %select +} + +define <16 x i32> @test_16f32tosb(<16 x float> %a, <16 x i32> %passthru) { +; NODQ-LABEL: test_16f32tosb: +; NODQ: # %bb.0: +; NODQ-NEXT: vcvttps2dq %zmm0, %zmm0 +; NODQ-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NODQ-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z} +; NODQ-NEXT: retq +; +; VLDQ-LABEL: test_16f32tosb: +; VLDQ: # %bb.0: +; VLDQ-NEXT: vcvttps2dq %zmm0, %zmm0 +; VLDQ-NEXT: vpmovd2m %zmm0, %k1 +; VLDQ-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z} +; VLDQ-NEXT: retq +; +; DQNOVL-LABEL: test_16f32tosb: +; DQNOVL: # %bb.0: +; DQNOVL-NEXT: vcvttps2dq %zmm0, %zmm0 +; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 +; DQNOVL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z} +; DQNOVL-NEXT: retq + %mask = fptosi <16 x float> %a to <16 x i1> + %select = select <16 x i1> %mask, <16 x i32> %passthru, <16 x i32> zeroinitializer + ret <16 x i32> %select +} diff --git a/llvm/test/CodeGen/X86/avx512-cvt.ll b/llvm/test/CodeGen/X86/avx512-cvt.ll index 30e0cf0e74be..1d984f2f8ac1 100644 --- a/llvm/test/CodeGen/X86/avx512-cvt.ll +++ b/llvm/test/CodeGen/X86/avx512-cvt.ll @@ -6,7 +6,6 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=NODQ --check-prefix=NOVLDQ --check-prefix=AVX512BW ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl,avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=VL --check-prefix=VLDQ 
--check-prefix=VLNOBW --check-prefix=AVX512VLDQ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl,avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=NODQ --check-prefix=VL --check-prefix=VLNODQ --check-prefix=VLBW --check-prefix=AVX512VLBW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=KNL_WIDEN define <16 x float> @sitof32(<16 x i32> %a) nounwind { @@ -14,11 +13,6 @@ define <16 x float> @sitof32(<16 x i32> %a) nounwind { ; ALL: # %bb.0: ; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0 ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: sitof32: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vcvtdq2ps %zmm0, %zmm0 -; KNL_WIDEN-NEXT: retq %b = sitofp <16 x i32> %a to <16 x float> ret <16 x float> %b } @@ -63,36 +57,6 @@ define <8 x double> @sltof864(<8 x i64> %a) { ; DQNOVL: # %bb.0: ; DQNOVL-NEXT: vcvtqq2pd %zmm0, %zmm0 ; DQNOVL-NEXT: retq -; -; KNL_WIDEN-LABEL: sltof864: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vextracti32x4 $3, %zmm0, %xmm1 -; KNL_WIDEN-NEXT: vpextrq $1, %xmm1, %rax -; KNL_WIDEN-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 -; KNL_WIDEN-NEXT: vmovq %xmm1, %rax -; KNL_WIDEN-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 -; KNL_WIDEN-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; KNL_WIDEN-NEXT: vextracti32x4 $2, %zmm0, %xmm2 -; KNL_WIDEN-NEXT: vpextrq $1, %xmm2, %rax -; KNL_WIDEN-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm3 -; KNL_WIDEN-NEXT: vmovq %xmm2, %rax -; KNL_WIDEN-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2 -; KNL_WIDEN-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; KNL_WIDEN-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; KNL_WIDEN-NEXT: vextracti128 $1, %ymm0, %xmm2 -; KNL_WIDEN-NEXT: vpextrq $1, %xmm2, %rax -; KNL_WIDEN-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm3 -; KNL_WIDEN-NEXT: vmovq %xmm2, %rax -; KNL_WIDEN-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2 -; KNL_WIDEN-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; KNL_WIDEN-NEXT: vpextrq $1, %xmm0, %rax -; KNL_WIDEN-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm3 -; KNL_WIDEN-NEXT: vmovq %xmm0, %rax -; KNL_WIDEN-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm0 -; KNL_WIDEN-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; KNL_WIDEN-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; KNL_WIDEN-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; KNL_WIDEN-NEXT: retq %b = sitofp <8 x i64> %a to <8 x double> ret <8 x double> %b } @@ -125,22 +89,6 @@ define <4 x double> @slto4f64(<4 x i64> %a) { ; DQNOVL-NEXT: vcvtqq2pd %zmm0, %zmm0 ; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; DQNOVL-NEXT: retq -; -; KNL_WIDEN-LABEL: slto4f64: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vextracti128 $1, %ymm0, %xmm1 -; KNL_WIDEN-NEXT: vpextrq $1, %xmm1, %rax -; KNL_WIDEN-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 -; KNL_WIDEN-NEXT: vmovq %xmm1, %rax -; KNL_WIDEN-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 -; KNL_WIDEN-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; KNL_WIDEN-NEXT: vpextrq $1, %xmm0, %rax -; KNL_WIDEN-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2 -; KNL_WIDEN-NEXT: vmovq %xmm0, %rax -; KNL_WIDEN-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0 -; KNL_WIDEN-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; KNL_WIDEN-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; KNL_WIDEN-NEXT: retq %b = sitofp <4 x i64> %a to <4 x double> ret <4 x double> %b } @@ -167,15 +115,6 @@ define <2 x double> @slto2f64(<2 x i64> %a) { ; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; DQNOVL-NEXT: vzeroupper ; DQNOVL-NEXT: retq -; -; KNL_WIDEN-LABEL: slto2f64: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vpextrq $1, %xmm0, %rax -; KNL_WIDEN-NEXT: vcvtsi2sdq 
%rax, %xmm1, %xmm1 -; KNL_WIDEN-NEXT: vmovq %xmm0, %rax -; KNL_WIDEN-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm0 -; KNL_WIDEN-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; KNL_WIDEN-NEXT: retq %b = sitofp <2 x i64> %a to <2 x double> ret <2 x double> %b } @@ -204,17 +143,6 @@ define <2 x float> @sltof2f32(<2 x i64> %a) { ; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; DQNOVL-NEXT: vzeroupper ; DQNOVL-NEXT: retq -; -; KNL_WIDEN-LABEL: sltof2f32: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vpextrq $1, %xmm0, %rax -; KNL_WIDEN-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 -; KNL_WIDEN-NEXT: vmovq %xmm0, %rax -; KNL_WIDEN-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 -; KNL_WIDEN-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; KNL_WIDEN-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1 -; KNL_WIDEN-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0] -; KNL_WIDEN-NEXT: retq %b = sitofp <2 x i64> %a to <2 x float> ret <2 x float>%b } @@ -249,23 +177,6 @@ define <4 x float> @slto4f32_mem(<4 x i64>* %a) { ; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; DQNOVL-NEXT: vzeroupper ; DQNOVL-NEXT: retq -; -; KNL_WIDEN-LABEL: slto4f32_mem: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vmovdqu (%rdi), %xmm0 -; KNL_WIDEN-NEXT: vmovdqu 16(%rdi), %xmm1 -; KNL_WIDEN-NEXT: vpextrq $1, %xmm0, %rax -; KNL_WIDEN-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 -; KNL_WIDEN-NEXT: vmovq %xmm0, %rax -; KNL_WIDEN-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 -; KNL_WIDEN-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] -; KNL_WIDEN-NEXT: vmovq %xmm1, %rax -; KNL_WIDEN-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 -; KNL_WIDEN-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] -; KNL_WIDEN-NEXT: vpextrq $1, %xmm1, %rax -; KNL_WIDEN-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1 -; KNL_WIDEN-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] -; KNL_WIDEN-NEXT: retq %a1 = load <4 x i64>, <4 x i64>* %a, align 8 %b = sitofp <4 x i64> %a1 to <4 x float> ret <4 x float>%b @@ -301,24 +212,6 @@ define <4 x i64> @f64to4sl(<4 x double> %a) { ; DQNOVL-NEXT: vcvttpd2qq %zmm0, %zmm0 ; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; DQNOVL-NEXT: retq -; -; KNL_WIDEN-LABEL: f64to4sl: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vextractf128 $1, %ymm0, %xmm1 -; KNL_WIDEN-NEXT: vcvttsd2si %xmm1, %rax -; KNL_WIDEN-NEXT: vmovq %rax, %xmm2 -; KNL_WIDEN-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] -; KNL_WIDEN-NEXT: vcvttsd2si %xmm1, %rax -; KNL_WIDEN-NEXT: vmovq %rax, %xmm1 -; KNL_WIDEN-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; KNL_WIDEN-NEXT: vcvttsd2si %xmm0, %rax -; KNL_WIDEN-NEXT: vmovq %rax, %xmm2 -; KNL_WIDEN-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; KNL_WIDEN-NEXT: vcvttsd2si %xmm0, %rax -; KNL_WIDEN-NEXT: vmovq %rax, %xmm0 -; KNL_WIDEN-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; KNL_WIDEN-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; KNL_WIDEN-NEXT: retq %b = fptosi <4 x double> %a to <4 x i64> ret <4 x i64> %b } @@ -353,24 +246,6 @@ define <4 x i64> @f32to4sl(<4 x float> %a) { ; DQNOVL-NEXT: vcvttps2qq %ymm0, %zmm0 ; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; DQNOVL-NEXT: retq -; -; KNL_WIDEN-LABEL: f32to4sl: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; KNL_WIDEN-NEXT: vcvttss2si %xmm1, %rax -; KNL_WIDEN-NEXT: vmovq %rax, %xmm1 -; KNL_WIDEN-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; KNL_WIDEN-NEXT: vcvttss2si %xmm2, %rax -; KNL_WIDEN-NEXT: vmovq %rax, %xmm2 -; KNL_WIDEN-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; KNL_WIDEN-NEXT: vcvttss2si %xmm0, %rax -; KNL_WIDEN-NEXT: 
vmovq %rax, %xmm2 -; KNL_WIDEN-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; KNL_WIDEN-NEXT: vcvttss2si %xmm0, %rax -; KNL_WIDEN-NEXT: vmovq %rax, %xmm0 -; KNL_WIDEN-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; KNL_WIDEN-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; KNL_WIDEN-NEXT: retq %b = fptosi <4 x float> %a to <4 x i64> ret <4 x i64> %b } @@ -406,23 +281,6 @@ define <4 x float> @slto4f32(<4 x i64> %a) { ; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; DQNOVL-NEXT: vzeroupper ; DQNOVL-NEXT: retq -; -; KNL_WIDEN-LABEL: slto4f32: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vpextrq $1, %xmm0, %rax -; KNL_WIDEN-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 -; KNL_WIDEN-NEXT: vmovq %xmm0, %rax -; KNL_WIDEN-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 -; KNL_WIDEN-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; KNL_WIDEN-NEXT: vextracti128 $1, %ymm0, %xmm0 -; KNL_WIDEN-NEXT: vmovq %xmm0, %rax -; KNL_WIDEN-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 -; KNL_WIDEN-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; KNL_WIDEN-NEXT: vpextrq $1, %xmm0, %rax -; KNL_WIDEN-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 -; KNL_WIDEN-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; KNL_WIDEN-NEXT: vzeroupper -; KNL_WIDEN-NEXT: retq %b = sitofp <4 x i64> %a to <4 x float> ret <4 x float> %b } @@ -458,23 +316,6 @@ define <4 x float> @ulto4f32(<4 x i64> %a) { ; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; DQNOVL-NEXT: vzeroupper ; DQNOVL-NEXT: retq -; -; KNL_WIDEN-LABEL: ulto4f32: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vpextrq $1, %xmm0, %rax -; KNL_WIDEN-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1 -; KNL_WIDEN-NEXT: vmovq %xmm0, %rax -; KNL_WIDEN-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2 -; KNL_WIDEN-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; KNL_WIDEN-NEXT: vextracti128 $1, %ymm0, %xmm0 -; KNL_WIDEN-NEXT: vmovq %xmm0, %rax -; KNL_WIDEN-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2 -; KNL_WIDEN-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; KNL_WIDEN-NEXT: vpextrq $1, %xmm0, %rax -; KNL_WIDEN-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0 -; KNL_WIDEN-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; KNL_WIDEN-NEXT: vzeroupper -; KNL_WIDEN-NEXT: retq %b = uitofp <4 x i64> %a to <4 x float> ret <4 x float> %b } @@ -499,16 +340,6 @@ define <8 x double> @ulto8f64(<8 x i64> %a) { ; DQNOVL: # %bb.0: ; DQNOVL-NEXT: vcvtuqq2pd %zmm0, %zmm0 ; DQNOVL-NEXT: retq -; -; KNL_WIDEN-LABEL: ulto8f64: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm1 -; KNL_WIDEN-NEXT: vporq {{.*}}(%rip){1to8}, %zmm1, %zmm1 -; KNL_WIDEN-NEXT: vpsrlq $32, %zmm0, %zmm0 -; KNL_WIDEN-NEXT: vporq {{.*}}(%rip){1to8}, %zmm0, %zmm0 -; KNL_WIDEN-NEXT: vsubpd {{.*}}(%rip){1to8}, %zmm0, %zmm0 -; KNL_WIDEN-NEXT: vaddpd %zmm0, %zmm1, %zmm0 -; KNL_WIDEN-NEXT: retq %b = uitofp <8 x i64> %a to <8 x double> ret <8 x double> %b } @@ -545,26 +376,6 @@ define <16 x double> @ulto16f64(<16 x i64> %a) { ; DQNOVL-NEXT: vcvtuqq2pd %zmm0, %zmm0 ; DQNOVL-NEXT: vcvtuqq2pd %zmm1, %zmm1 ; DQNOVL-NEXT: retq -; -; KNL_WIDEN-LABEL: ulto16f64: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vpbroadcastq {{.*#+}} zmm2 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295] -; KNL_WIDEN-NEXT: vpandq %zmm2, %zmm0, %zmm3 -; KNL_WIDEN-NEXT: vpbroadcastq {{.*#+}} zmm4 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200] -; KNL_WIDEN-NEXT: vporq %zmm4, %zmm3, %zmm3 -; 
KNL_WIDEN-NEXT: vpsrlq $32, %zmm0, %zmm0 -; KNL_WIDEN-NEXT: vpbroadcastq {{.*#+}} zmm5 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072] -; KNL_WIDEN-NEXT: vporq %zmm5, %zmm0, %zmm0 -; KNL_WIDEN-NEXT: vbroadcastsd {{.*#+}} zmm6 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25] -; KNL_WIDEN-NEXT: vsubpd %zmm6, %zmm0, %zmm0 -; KNL_WIDEN-NEXT: vaddpd %zmm0, %zmm3, %zmm0 -; KNL_WIDEN-NEXT: vpandq %zmm2, %zmm1, %zmm2 -; KNL_WIDEN-NEXT: vporq %zmm4, %zmm2, %zmm2 -; KNL_WIDEN-NEXT: vpsrlq $32, %zmm1, %zmm1 -; KNL_WIDEN-NEXT: vporq %zmm5, %zmm1, %zmm1 -; KNL_WIDEN-NEXT: vsubpd %zmm6, %zmm1, %zmm1 -; KNL_WIDEN-NEXT: vaddpd %zmm1, %zmm2, %zmm1 -; KNL_WIDEN-NEXT: retq %b = uitofp <16 x i64> %a to <16 x double> ret <16 x double> %b } @@ -574,11 +385,6 @@ define <16 x i32> @f64to16si(<16 x float> %a) nounwind { ; ALL: # %bb.0: ; ALL-NEXT: vcvttps2dq %zmm0, %zmm0 ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: f64to16si: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vcvttps2dq %zmm0, %zmm0 -; KNL_WIDEN-NEXT: retq %b = fptosi <16 x float> %a to <16 x i32> ret <16 x i32> %b } @@ -590,13 +396,6 @@ define <16 x i8> @f32to16sc(<16 x float> %f) { ; ALL-NEXT: vpmovdb %zmm0, %xmm0 ; ALL-NEXT: vzeroupper ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: f32to16sc: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vcvttps2dq %zmm0, %zmm0 -; KNL_WIDEN-NEXT: vpmovdb %zmm0, %xmm0 -; KNL_WIDEN-NEXT: vzeroupper -; KNL_WIDEN-NEXT: retq %res = fptosi <16 x float> %f to <16 x i8> ret <16 x i8> %res } @@ -607,12 +406,6 @@ define <16 x i16> @f32to16ss(<16 x float> %f) { ; ALL-NEXT: vcvttps2dq %zmm0, %zmm0 ; ALL-NEXT: vpmovdw %zmm0, %ymm0 ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: f32to16ss: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vcvttps2dq %zmm0, %zmm0 -; KNL_WIDEN-NEXT: vpmovdw %zmm0, %ymm0 -; KNL_WIDEN-NEXT: retq %res = fptosi <16 x float> %f to <16 x i16> ret <16 x i16> %res } @@ -622,11 +415,6 @@ define <16 x i32> @f32to16ui(<16 x float> %a) nounwind { ; ALL: # %bb.0: ; ALL-NEXT: vcvttps2udq %zmm0, %zmm0 ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: f32to16ui: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vcvttps2udq %zmm0, %zmm0 -; KNL_WIDEN-NEXT: retq %b = fptoui <16 x float> %a to <16 x i32> ret <16 x i32> %b } @@ -638,13 +426,6 @@ define <16 x i8> @f32to16uc(<16 x float> %f) { ; ALL-NEXT: vpmovdb %zmm0, %xmm0 ; ALL-NEXT: vzeroupper ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: f32to16uc: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vcvttps2dq %zmm0, %zmm0 -; KNL_WIDEN-NEXT: vpmovdb %zmm0, %xmm0 -; KNL_WIDEN-NEXT: vzeroupper -; KNL_WIDEN-NEXT: retq %res = fptoui <16 x float> %f to <16 x i8> ret <16 x i8> %res } @@ -655,12 +436,6 @@ define <16 x i16> @f32to16us(<16 x float> %f) { ; ALL-NEXT: vcvttps2dq %zmm0, %zmm0 ; ALL-NEXT: vpmovdw %zmm0, %ymm0 ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: f32to16us: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vcvttps2dq %zmm0, %zmm0 -; KNL_WIDEN-NEXT: vpmovdw %zmm0, %ymm0 -; KNL_WIDEN-NEXT: retq %res = fptoui <16 x float> %f to <16 x i16> ret <16 x i16> %res } @@ -677,13 +452,6 @@ define <8 x i32> @f32to8ui(<8 x float> %a) nounwind { ; VL: # %bb.0: ; VL-NEXT: vcvttps2udq %ymm0, %ymm0 ; VL-NEXT: retq -; -; KNL_WIDEN-LABEL: f32to8ui: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; KNL_WIDEN-NEXT: vcvttps2udq %zmm0, %zmm0 -; KNL_WIDEN-NEXT: # kill: def 
$ymm0 killed $ymm0 killed $zmm0 -; KNL_WIDEN-NEXT: retq %b = fptoui <8 x float> %a to <8 x i32> ret <8 x i32> %b } @@ -701,14 +469,6 @@ define <4 x i32> @f32to4ui(<4 x float> %a) nounwind { ; VL: # %bb.0: ; VL-NEXT: vcvttps2udq %xmm0, %xmm0 ; VL-NEXT: retq -; -; KNL_WIDEN-LABEL: f32to4ui: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; KNL_WIDEN-NEXT: vcvttps2udq %zmm0, %zmm0 -; KNL_WIDEN-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; KNL_WIDEN-NEXT: vzeroupper -; KNL_WIDEN-NEXT: retq %b = fptoui <4 x float> %a to <4 x i32> ret <4 x i32> %b } @@ -718,11 +478,6 @@ define <8 x i32> @f64to8ui(<8 x double> %a) nounwind { ; ALL: # %bb.0: ; ALL-NEXT: vcvttpd2udq %zmm0, %ymm0 ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: f64to8ui: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vcvttpd2udq %zmm0, %ymm0 -; KNL_WIDEN-NEXT: retq %b = fptoui <8 x double> %a to <8 x i32> ret <8 x i32> %b } @@ -742,14 +497,6 @@ define <8 x i16> @f64to8us(<8 x double> %f) { ; VL-NEXT: vpmovdw %ymm0, %xmm0 ; VL-NEXT: vzeroupper ; VL-NEXT: retq -; -; KNL_WIDEN-LABEL: f64to8us: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vcvttpd2dq %zmm0, %ymm0 -; KNL_WIDEN-NEXT: vpmovdw %zmm0, %ymm0 -; KNL_WIDEN-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; KNL_WIDEN-NEXT: vzeroupper -; KNL_WIDEN-NEXT: retq %res = fptoui <8 x double> %f to <8 x i16> ret <8 x i16> %res } @@ -769,34 +516,6 @@ define <8 x i8> @f64to8uc(<8 x double> %f) { ; VL-NEXT: vpmovdw %ymm0, %xmm0 ; VL-NEXT: vzeroupper ; VL-NEXT: retq -; -; KNL_WIDEN-LABEL: f64to8uc: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; KNL_WIDEN-NEXT: vcvttsd2si %xmm1, %eax -; KNL_WIDEN-NEXT: vcvttsd2si %xmm0, %ecx -; KNL_WIDEN-NEXT: vmovd %ecx, %xmm1 -; KNL_WIDEN-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 -; KNL_WIDEN-NEXT: vextractf128 $1, %ymm0, %xmm2 -; KNL_WIDEN-NEXT: vcvttsd2si %xmm2, %eax -; KNL_WIDEN-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 -; KNL_WIDEN-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] -; KNL_WIDEN-NEXT: vcvttsd2si %xmm2, %eax -; KNL_WIDEN-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 -; KNL_WIDEN-NEXT: vextractf32x4 $2, %zmm0, %xmm2 -; KNL_WIDEN-NEXT: vcvttsd2si %xmm2, %eax -; KNL_WIDEN-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; KNL_WIDEN-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] -; KNL_WIDEN-NEXT: vcvttsd2si %xmm2, %eax -; KNL_WIDEN-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 -; KNL_WIDEN-NEXT: vextractf32x4 $3, %zmm0, %xmm0 -; KNL_WIDEN-NEXT: vcvttsd2si %xmm0, %eax -; KNL_WIDEN-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 -; KNL_WIDEN-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; KNL_WIDEN-NEXT: vcvttsd2si %xmm0, %eax -; KNL_WIDEN-NEXT: vpinsrb $7, %eax, %xmm1, %xmm0 -; KNL_WIDEN-NEXT: vzeroupper -; KNL_WIDEN-NEXT: retq %res = fptoui <8 x double> %f to <8 x i8> ret <8 x i8> %res } @@ -815,14 +534,6 @@ define <4 x i32> @f64to4ui(<4 x double> %a) nounwind { ; VL-NEXT: vcvttpd2udq %ymm0, %xmm0 ; VL-NEXT: vzeroupper ; VL-NEXT: retq -; -; KNL_WIDEN-LABEL: f64to4ui: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; KNL_WIDEN-NEXT: vcvttpd2udq %zmm0, %ymm0 -; KNL_WIDEN-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; KNL_WIDEN-NEXT: vzeroupper -; KNL_WIDEN-NEXT: retq %b = fptoui <4 x double> %a to <4 x i32> ret <4 x i32> %b } @@ -832,11 +543,6 @@ define <8 x double> @sito8f64(<8 x i32> %a) { ; ALL: # %bb.0: ; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0 ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: sito8f64: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vcvtdq2pd %ymm0, %zmm0 -; KNL_WIDEN-NEXT: retq %b = sitofp <8 x i32> %a to <8 
x double> ret <8 x double> %b } @@ -870,12 +576,6 @@ define <8 x double> @i32to8f64_mask(<8 x double> %a, <8 x i32> %b, i8 %c) nounwi ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1} ; AVX512BW-NEXT: retq -; -; KNL_WIDEN-LABEL: i32to8f64_mask: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: kmovw %edi, %k1 -; KNL_WIDEN-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1} -; KNL_WIDEN-NEXT: retq %1 = bitcast i8 %c to <8 x i1> %2 = sitofp <8 x i32> %b to <8 x double> %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> %a @@ -911,12 +611,6 @@ define <8 x double> @sito8f64_maskz(<8 x i32> %a, i8 %b) nounwind { ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: retq -; -; KNL_WIDEN-LABEL: sito8f64_maskz: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: kmovw %edi, %k1 -; KNL_WIDEN-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z} -; KNL_WIDEN-NEXT: retq %1 = bitcast i8 %b to <8 x i1> %2 = sitofp <8 x i32> %a to <8 x double> %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> zeroinitializer @@ -928,11 +622,6 @@ define <8 x i32> @f64to8si(<8 x double> %a) { ; ALL: # %bb.0: ; ALL-NEXT: vcvttpd2dq %zmm0, %ymm0 ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: f64to8si: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vcvttpd2dq %zmm0, %ymm0 -; KNL_WIDEN-NEXT: retq %b = fptosi <8 x double> %a to <8 x i32> ret <8 x i32> %b } @@ -943,12 +632,6 @@ define <4 x i32> @f64to4si(<4 x double> %a) { ; ALL-NEXT: vcvttpd2dq %ymm0, %xmm0 ; ALL-NEXT: vzeroupper ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: f64to4si: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vcvttpd2dq %ymm0, %xmm0 -; KNL_WIDEN-NEXT: vzeroupper -; KNL_WIDEN-NEXT: retq %b = fptosi <4 x double> %a to <4 x i32> ret <4 x i32> %b } @@ -960,13 +643,6 @@ define <16 x float> @f64to16f32(<16 x double> %b) nounwind { ; ALL-NEXT: vcvtpd2ps %zmm1, %ymm1 ; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: f64to16f32: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vcvtpd2ps %zmm0, %ymm0 -; KNL_WIDEN-NEXT: vcvtpd2ps %zmm1, %ymm1 -; KNL_WIDEN-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; KNL_WIDEN-NEXT: retq %a = fptrunc <16 x double> %b to <16 x float> ret <16 x float> %a } @@ -977,12 +653,6 @@ define <4 x float> @f64to4f32(<4 x double> %b) { ; ALL-NEXT: vcvtpd2ps %ymm0, %xmm0 ; ALL-NEXT: vzeroupper ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: f64to4f32: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vcvtpd2ps %ymm0, %xmm0 -; KNL_WIDEN-NEXT: vzeroupper -; KNL_WIDEN-NEXT: retq %a = fptrunc <4 x double> %b to <4 x float> ret <4 x float> %a } @@ -1023,16 +693,6 @@ define <4 x float> @f64to4f32_mask(<4 x double> %b, <4 x i1> %mask) { ; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; DQNOVL-NEXT: vzeroupper ; DQNOVL-NEXT: retq -; -; KNL_WIDEN-LABEL: f64to4f32_mask: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vpslld $31, %xmm1, %xmm1 -; KNL_WIDEN-NEXT: vptestmd %zmm1, %zmm1, %k1 -; KNL_WIDEN-NEXT: vcvtpd2ps %ymm0, %xmm0 -; KNL_WIDEN-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z} -; KNL_WIDEN-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; KNL_WIDEN-NEXT: vzeroupper -; KNL_WIDEN-NEXT: retq %a = fptrunc <4 x double> %b to <4 x float> %c = select <4 x i1>%mask, <4 x float>%a, <4 x float> zeroinitializer ret <4 x float> %c @@ -1043,11 +703,6 @@ define <4 x float> @f64tof32_inreg(<2 x double> %a0, <4 x float> %a1) nounwind { ; ALL: # %bb.0: ; ALL-NEXT: vcvtsd2ss %xmm0, %xmm1, %xmm0 ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: f64tof32_inreg: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vcvtsd2ss %xmm0, %xmm1, %xmm0 -; KNL_WIDEN-NEXT: 
retq %ext = extractelement <2 x double> %a0, i32 0 %cvt = fptrunc double %ext to float %res = insertelement <4 x float> %a1, float %cvt, i32 0 @@ -1059,11 +714,6 @@ define <8 x double> @f32to8f64(<8 x float> %b) nounwind { ; ALL: # %bb.0: ; ALL-NEXT: vcvtps2pd %ymm0, %zmm0 ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: f32to8f64: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vcvtps2pd %ymm0, %zmm0 -; KNL_WIDEN-NEXT: retq %a = fpext <8 x float> %b to <8 x double> ret <8 x double> %a } @@ -1084,16 +734,6 @@ define <4 x double> @f32to4f64_mask(<4 x float> %b, <4 x double> %b1, <4 x doubl ; VL-NEXT: vcmpltpd %ymm2, %ymm1, %k1 ; VL-NEXT: vcvtps2pd %xmm0, %ymm0 {%k1} {z} ; VL-NEXT: retq -; -; KNL_WIDEN-LABEL: f32to4f64_mask: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 -; KNL_WIDEN-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; KNL_WIDEN-NEXT: vcvtps2pd %xmm0, %ymm0 -; KNL_WIDEN-NEXT: vcmpltpd %zmm2, %zmm1, %k1 -; KNL_WIDEN-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z} -; KNL_WIDEN-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; KNL_WIDEN-NEXT: retq %a = fpext <4 x float> %b to <4 x double> %mask = fcmp ogt <4 x double> %a1, %b1 %c = select <4 x i1> %mask, <4 x double> %a, <4 x double> zeroinitializer @@ -1105,11 +745,6 @@ define <2 x double> @f32tof64_inreg(<2 x double> %a0, <4 x float> %a1) nounwind ; ALL: # %bb.0: ; ALL-NEXT: vcvtss2sd %xmm1, %xmm0, %xmm0 ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: f32tof64_inreg: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vcvtss2sd %xmm1, %xmm0, %xmm0 -; KNL_WIDEN-NEXT: retq %ext = extractelement <4 x float> %a1, i32 0 %cvt = fpext float %ext to double %res = insertelement <2 x double> %a0, double %cvt, i32 0 @@ -1121,11 +756,6 @@ define double @sltof64_load(i64* nocapture %e) { ; ALL: # %bb.0: # %entry ; ALL-NEXT: vcvtsi2sdq (%rdi), %xmm0, %xmm0 ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: sltof64_load: -; KNL_WIDEN: # %bb.0: # %entry -; KNL_WIDEN-NEXT: vcvtsi2sdq (%rdi), %xmm0, %xmm0 -; KNL_WIDEN-NEXT: retq entry: %tmp1 = load i64, i64* %e, align 8 %conv = sitofp i64 %tmp1 to double @@ -1137,11 +767,6 @@ define double @sitof64_load(i32* %e) { ; ALL: # %bb.0: # %entry ; ALL-NEXT: vcvtsi2sdl (%rdi), %xmm0, %xmm0 ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: sitof64_load: -; KNL_WIDEN: # %bb.0: # %entry -; KNL_WIDEN-NEXT: vcvtsi2sdl (%rdi), %xmm0, %xmm0 -; KNL_WIDEN-NEXT: retq entry: %tmp1 = load i32, i32* %e, align 4 %conv = sitofp i32 %tmp1 to double @@ -1153,11 +778,6 @@ define float @sitof32_load(i32* %e) { ; ALL: # %bb.0: # %entry ; ALL-NEXT: vcvtsi2ssl (%rdi), %xmm0, %xmm0 ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: sitof32_load: -; KNL_WIDEN: # %bb.0: # %entry -; KNL_WIDEN-NEXT: vcvtsi2ssl (%rdi), %xmm0, %xmm0 -; KNL_WIDEN-NEXT: retq entry: %tmp1 = load i32, i32* %e, align 4 %conv = sitofp i32 %tmp1 to float @@ -1169,11 +789,6 @@ define float @sltof32_load(i64* %e) { ; ALL: # %bb.0: # %entry ; ALL-NEXT: vcvtsi2ssq (%rdi), %xmm0, %xmm0 ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: sltof32_load: -; KNL_WIDEN: # %bb.0: # %entry -; KNL_WIDEN-NEXT: vcvtsi2ssq (%rdi), %xmm0, %xmm0 -; KNL_WIDEN-NEXT: retq entry: %tmp1 = load i64, i64* %e, align 8 %conv = sitofp i64 %tmp1 to float @@ -1187,13 +802,6 @@ define void @f32tof64_loadstore() { ; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 ; ALL-NEXT: vmovsd %xmm0, -{{[0-9]+}}(%rsp) ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: f32tof64_loadstore: -; KNL_WIDEN: # %bb.0: # %entry -; KNL_WIDEN-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; KNL_WIDEN-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; KNL_WIDEN-NEXT: vmovsd %xmm0, 
-{{[0-9]+}}(%rsp) -; KNL_WIDEN-NEXT: retq entry: %f = alloca float, align 4 %d = alloca double, align 8 @@ -1210,13 +818,6 @@ define void @f64tof32_loadstore() nounwind uwtable { ; ALL-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 ; ALL-NEXT: vmovss %xmm0, -{{[0-9]+}}(%rsp) ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: f64tof32_loadstore: -; KNL_WIDEN: # %bb.0: # %entry -; KNL_WIDEN-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; KNL_WIDEN-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 -; KNL_WIDEN-NEXT: vmovss %xmm0, -{{[0-9]+}}(%rsp) -; KNL_WIDEN-NEXT: retq entry: %f = alloca float, align 4 %d = alloca double, align 8 @@ -1231,11 +832,6 @@ define double @long_to_double(i64 %x) { ; ALL: # %bb.0: ; ALL-NEXT: vmovq %rdi, %xmm0 ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: long_to_double: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vmovq %rdi, %xmm0 -; KNL_WIDEN-NEXT: retq %res = bitcast i64 %x to double ret double %res } @@ -1245,11 +841,6 @@ define i64 @double_to_long(double %x) { ; ALL: # %bb.0: ; ALL-NEXT: vmovq %xmm0, %rax ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: double_to_long: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vmovq %xmm0, %rax -; KNL_WIDEN-NEXT: retq %res = bitcast double %x to i64 ret i64 %res } @@ -1259,11 +850,6 @@ define float @int_to_float(i32 %x) { ; ALL: # %bb.0: ; ALL-NEXT: vmovd %edi, %xmm0 ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: int_to_float: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vmovd %edi, %xmm0 -; KNL_WIDEN-NEXT: retq %res = bitcast i32 %x to float ret float %res } @@ -1273,11 +859,6 @@ define i32 @float_to_int(float %x) { ; ALL: # %bb.0: ; ALL-NEXT: vmovd %xmm0, %eax ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: float_to_int: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vmovd %xmm0, %eax -; KNL_WIDEN-NEXT: retq %res = bitcast float %x to i32 ret i32 %res } @@ -1290,14 +871,6 @@ define <16 x double> @uito16f64(<16 x i32> %a) nounwind { ; ALL-NEXT: vcvtudq2pd %ymm0, %zmm1 ; ALL-NEXT: vmovaps %zmm2, %zmm0 ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: uito16f64: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vcvtudq2pd %ymm0, %zmm2 -; KNL_WIDEN-NEXT: vextractf64x4 $1, %zmm0, %ymm0 -; KNL_WIDEN-NEXT: vcvtudq2pd %ymm0, %zmm1 -; KNL_WIDEN-NEXT: vmovaps %zmm2, %zmm0 -; KNL_WIDEN-NEXT: retq %b = uitofp <16 x i32> %a to <16 x double> ret <16 x double> %b } @@ -1342,36 +915,6 @@ define <8 x float> @slto8f32(<8 x i64> %a) { ; DQNOVL: # %bb.0: ; DQNOVL-NEXT: vcvtqq2ps %zmm0, %ymm0 ; DQNOVL-NEXT: retq -; -; KNL_WIDEN-LABEL: slto8f32: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vextracti32x4 $2, %zmm0, %xmm1 -; KNL_WIDEN-NEXT: vpextrq $1, %xmm1, %rax -; KNL_WIDEN-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 -; KNL_WIDEN-NEXT: vmovq %xmm1, %rax -; KNL_WIDEN-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1 -; KNL_WIDEN-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] -; KNL_WIDEN-NEXT: vextracti32x4 $3, %zmm0, %xmm2 -; KNL_WIDEN-NEXT: vmovq %xmm2, %rax -; KNL_WIDEN-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 -; KNL_WIDEN-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] -; KNL_WIDEN-NEXT: vpextrq $1, %xmm2, %rax -; KNL_WIDEN-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2 -; KNL_WIDEN-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] -; KNL_WIDEN-NEXT: vpextrq $1, %xmm0, %rax -; KNL_WIDEN-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2 -; KNL_WIDEN-NEXT: vmovq %xmm0, %rax -; KNL_WIDEN-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3 -; KNL_WIDEN-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] -; KNL_WIDEN-NEXT: vextracti128 $1, %ymm0, %xmm0 -; KNL_WIDEN-NEXT: vmovq %xmm0, %rax -; KNL_WIDEN-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3 -; KNL_WIDEN-NEXT: vinsertps {{.*#+}} xmm2 = 
xmm2[0,1],xmm3[0],xmm2[3] -; KNL_WIDEN-NEXT: vpextrq $1, %xmm0, %rax -; KNL_WIDEN-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm0 -; KNL_WIDEN-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] -; KNL_WIDEN-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; KNL_WIDEN-NEXT: retq %b = sitofp <8 x i64> %a to <8 x float> ret <8 x float> %b } @@ -1447,63 +990,6 @@ define <16 x float> @slto16f32(<16 x i64> %a) { ; DQNOVL-NEXT: vcvtqq2ps %zmm1, %ymm1 ; DQNOVL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 ; DQNOVL-NEXT: retq -; -; KNL_WIDEN-LABEL: slto16f32: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vextracti32x4 $2, %zmm1, %xmm2 -; KNL_WIDEN-NEXT: vpextrq $1, %xmm2, %rax -; KNL_WIDEN-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 -; KNL_WIDEN-NEXT: vmovq %xmm2, %rax -; KNL_WIDEN-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2 -; KNL_WIDEN-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] -; KNL_WIDEN-NEXT: vextracti32x4 $3, %zmm1, %xmm3 -; KNL_WIDEN-NEXT: vmovq %xmm3, %rax -; KNL_WIDEN-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4 -; KNL_WIDEN-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] -; KNL_WIDEN-NEXT: vpextrq $1, %xmm3, %rax -; KNL_WIDEN-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 -; KNL_WIDEN-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] -; KNL_WIDEN-NEXT: vpextrq $1, %xmm1, %rax -; KNL_WIDEN-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 -; KNL_WIDEN-NEXT: vmovq %xmm1, %rax -; KNL_WIDEN-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm4 -; KNL_WIDEN-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] -; KNL_WIDEN-NEXT: vextracti128 $1, %ymm1, %xmm1 -; KNL_WIDEN-NEXT: vmovq %xmm1, %rax -; KNL_WIDEN-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm4 -; KNL_WIDEN-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] -; KNL_WIDEN-NEXT: vpextrq $1, %xmm1, %rax -; KNL_WIDEN-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm1 -; KNL_WIDEN-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0] -; KNL_WIDEN-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; KNL_WIDEN-NEXT: vextracti32x4 $2, %zmm0, %xmm2 -; KNL_WIDEN-NEXT: vpextrq $1, %xmm2, %rax -; KNL_WIDEN-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 -; KNL_WIDEN-NEXT: vmovq %xmm2, %rax -; KNL_WIDEN-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm2 -; KNL_WIDEN-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] -; KNL_WIDEN-NEXT: vextracti32x4 $3, %zmm0, %xmm3 -; KNL_WIDEN-NEXT: vmovq %xmm3, %rax -; KNL_WIDEN-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm4 -; KNL_WIDEN-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] -; KNL_WIDEN-NEXT: vpextrq $1, %xmm3, %rax -; KNL_WIDEN-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 -; KNL_WIDEN-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] -; KNL_WIDEN-NEXT: vpextrq $1, %xmm0, %rax -; KNL_WIDEN-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 -; KNL_WIDEN-NEXT: vmovq %xmm0, %rax -; KNL_WIDEN-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm4 -; KNL_WIDEN-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] -; KNL_WIDEN-NEXT: vextracti128 $1, %ymm0, %xmm0 -; KNL_WIDEN-NEXT: vmovq %xmm0, %rax -; KNL_WIDEN-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm4 -; KNL_WIDEN-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] -; KNL_WIDEN-NEXT: vpextrq $1, %xmm0, %rax -; KNL_WIDEN-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm0 -; KNL_WIDEN-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0] -; KNL_WIDEN-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; KNL_WIDEN-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; KNL_WIDEN-NEXT: retq %b = sitofp <16 x i64> %a to <16 x float> ret <16 x float> %b } @@ -1548,36 +1034,6 @@ define <8 x double> @slto8f64(<8 x i64> %a) { ; DQNOVL: # %bb.0: ; DQNOVL-NEXT: vcvtqq2pd %zmm0, %zmm0 ; DQNOVL-NEXT: retq -; -; KNL_WIDEN-LABEL: slto8f64: -; 
KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vextracti32x4 $3, %zmm0, %xmm1 -; KNL_WIDEN-NEXT: vpextrq $1, %xmm1, %rax -; KNL_WIDEN-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 -; KNL_WIDEN-NEXT: vmovq %xmm1, %rax -; KNL_WIDEN-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 -; KNL_WIDEN-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; KNL_WIDEN-NEXT: vextracti32x4 $2, %zmm0, %xmm2 -; KNL_WIDEN-NEXT: vpextrq $1, %xmm2, %rax -; KNL_WIDEN-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm3 -; KNL_WIDEN-NEXT: vmovq %xmm2, %rax -; KNL_WIDEN-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2 -; KNL_WIDEN-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; KNL_WIDEN-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; KNL_WIDEN-NEXT: vextracti128 $1, %ymm0, %xmm2 -; KNL_WIDEN-NEXT: vpextrq $1, %xmm2, %rax -; KNL_WIDEN-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm3 -; KNL_WIDEN-NEXT: vmovq %xmm2, %rax -; KNL_WIDEN-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2 -; KNL_WIDEN-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; KNL_WIDEN-NEXT: vpextrq $1, %xmm0, %rax -; KNL_WIDEN-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm3 -; KNL_WIDEN-NEXT: vmovq %xmm0, %rax -; KNL_WIDEN-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm0 -; KNL_WIDEN-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; KNL_WIDEN-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; KNL_WIDEN-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; KNL_WIDEN-NEXT: retq %b = sitofp <8 x i64> %a to <8 x double> ret <8 x double> %b } @@ -1650,62 +1106,6 @@ define <16 x double> @slto16f64(<16 x i64> %a) { ; DQNOVL-NEXT: vcvtqq2pd %zmm0, %zmm0 ; DQNOVL-NEXT: vcvtqq2pd %zmm1, %zmm1 ; DQNOVL-NEXT: retq -; -; KNL_WIDEN-LABEL: slto16f64: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vextracti32x4 $3, %zmm0, %xmm2 -; KNL_WIDEN-NEXT: vpextrq $1, %xmm2, %rax -; KNL_WIDEN-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm3 -; KNL_WIDEN-NEXT: vmovq %xmm2, %rax -; KNL_WIDEN-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2 -; KNL_WIDEN-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; KNL_WIDEN-NEXT: vextracti32x4 $2, %zmm0, %xmm3 -; KNL_WIDEN-NEXT: vpextrq $1, %xmm3, %rax -; KNL_WIDEN-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm4 -; KNL_WIDEN-NEXT: vmovq %xmm3, %rax -; KNL_WIDEN-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3 -; KNL_WIDEN-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; KNL_WIDEN-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; KNL_WIDEN-NEXT: vextracti128 $1, %ymm0, %xmm3 -; KNL_WIDEN-NEXT: vpextrq $1, %xmm3, %rax -; KNL_WIDEN-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4 -; KNL_WIDEN-NEXT: vmovq %xmm3, %rax -; KNL_WIDEN-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3 -; KNL_WIDEN-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; KNL_WIDEN-NEXT: vpextrq $1, %xmm0, %rax -; KNL_WIDEN-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4 -; KNL_WIDEN-NEXT: vmovq %xmm0, %rax -; KNL_WIDEN-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm0 -; KNL_WIDEN-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; KNL_WIDEN-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; KNL_WIDEN-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 -; KNL_WIDEN-NEXT: vextracti32x4 $3, %zmm1, %xmm2 -; KNL_WIDEN-NEXT: vpextrq $1, %xmm2, %rax -; KNL_WIDEN-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3 -; KNL_WIDEN-NEXT: vmovq %xmm2, %rax -; KNL_WIDEN-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm2 -; KNL_WIDEN-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; KNL_WIDEN-NEXT: vextracti32x4 $2, %zmm1, %xmm3 -; KNL_WIDEN-NEXT: vpextrq $1, %xmm3, %rax -; KNL_WIDEN-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4 -; KNL_WIDEN-NEXT: vmovq %xmm3, %rax -; KNL_WIDEN-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3 -; KNL_WIDEN-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; KNL_WIDEN-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; KNL_WIDEN-NEXT: vextracti128 $1, %ymm1, %xmm3 -; 
KNL_WIDEN-NEXT: vpextrq $1, %xmm3, %rax -; KNL_WIDEN-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4 -; KNL_WIDEN-NEXT: vmovq %xmm3, %rax -; KNL_WIDEN-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3 -; KNL_WIDEN-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; KNL_WIDEN-NEXT: vpextrq $1, %xmm1, %rax -; KNL_WIDEN-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4 -; KNL_WIDEN-NEXT: vmovq %xmm1, %rax -; KNL_WIDEN-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm1 -; KNL_WIDEN-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] -; KNL_WIDEN-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; KNL_WIDEN-NEXT: vinsertf64x4 $1, %ymm2, %zmm1, %zmm1 -; KNL_WIDEN-NEXT: retq %b = sitofp <16 x i64> %a to <16 x double> ret <16 x double> %b } @@ -1750,36 +1150,6 @@ define <8 x float> @ulto8f32(<8 x i64> %a) { ; DQNOVL: # %bb.0: ; DQNOVL-NEXT: vcvtuqq2ps %zmm0, %ymm0 ; DQNOVL-NEXT: retq -; -; KNL_WIDEN-LABEL: ulto8f32: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vextracti32x4 $2, %zmm0, %xmm1 -; KNL_WIDEN-NEXT: vpextrq $1, %xmm1, %rax -; KNL_WIDEN-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2 -; KNL_WIDEN-NEXT: vmovq %xmm1, %rax -; KNL_WIDEN-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm1 -; KNL_WIDEN-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] -; KNL_WIDEN-NEXT: vextracti32x4 $3, %zmm0, %xmm2 -; KNL_WIDEN-NEXT: vmovq %xmm2, %rax -; KNL_WIDEN-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm3 -; KNL_WIDEN-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] -; KNL_WIDEN-NEXT: vpextrq $1, %xmm2, %rax -; KNL_WIDEN-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm2 -; KNL_WIDEN-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] -; KNL_WIDEN-NEXT: vpextrq $1, %xmm0, %rax -; KNL_WIDEN-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm2 -; KNL_WIDEN-NEXT: vmovq %xmm0, %rax -; KNL_WIDEN-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm3 -; KNL_WIDEN-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] -; KNL_WIDEN-NEXT: vextracti128 $1, %ymm0, %xmm0 -; KNL_WIDEN-NEXT: vmovq %xmm0, %rax -; KNL_WIDEN-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm3 -; KNL_WIDEN-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] -; KNL_WIDEN-NEXT: vpextrq $1, %xmm0, %rax -; KNL_WIDEN-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm0 -; KNL_WIDEN-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] -; KNL_WIDEN-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; KNL_WIDEN-NEXT: retq %b = uitofp <8 x i64> %a to <8 x float> ret <8 x float> %b } @@ -1855,63 +1225,6 @@ define <16 x float> @ulto16f32(<16 x i64> %a) { ; DQNOVL-NEXT: vcvtuqq2ps %zmm1, %ymm1 ; DQNOVL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 ; DQNOVL-NEXT: retq -; -; KNL_WIDEN-LABEL: ulto16f32: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vextracti32x4 $2, %zmm1, %xmm2 -; KNL_WIDEN-NEXT: vpextrq $1, %xmm2, %rax -; KNL_WIDEN-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm3 -; KNL_WIDEN-NEXT: vmovq %xmm2, %rax -; KNL_WIDEN-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm2 -; KNL_WIDEN-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] -; KNL_WIDEN-NEXT: vextracti32x4 $3, %zmm1, %xmm3 -; KNL_WIDEN-NEXT: vmovq %xmm3, %rax -; KNL_WIDEN-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm4 -; KNL_WIDEN-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] -; KNL_WIDEN-NEXT: vpextrq $1, %xmm3, %rax -; KNL_WIDEN-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm3 -; KNL_WIDEN-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] -; KNL_WIDEN-NEXT: vpextrq $1, %xmm1, %rax -; KNL_WIDEN-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm3 -; KNL_WIDEN-NEXT: vmovq %xmm1, %rax -; KNL_WIDEN-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm4 -; KNL_WIDEN-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] -; KNL_WIDEN-NEXT: vextracti128 $1, %ymm1, %xmm1 -; KNL_WIDEN-NEXT: vmovq %xmm1, %rax -; 
KNL_WIDEN-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm4 -; KNL_WIDEN-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] -; KNL_WIDEN-NEXT: vpextrq $1, %xmm1, %rax -; KNL_WIDEN-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm1 -; KNL_WIDEN-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0] -; KNL_WIDEN-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; KNL_WIDEN-NEXT: vextracti32x4 $2, %zmm0, %xmm2 -; KNL_WIDEN-NEXT: vpextrq $1, %xmm2, %rax -; KNL_WIDEN-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm3 -; KNL_WIDEN-NEXT: vmovq %xmm2, %rax -; KNL_WIDEN-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm2 -; KNL_WIDEN-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] -; KNL_WIDEN-NEXT: vextracti32x4 $3, %zmm0, %xmm3 -; KNL_WIDEN-NEXT: vmovq %xmm3, %rax -; KNL_WIDEN-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm4 -; KNL_WIDEN-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] -; KNL_WIDEN-NEXT: vpextrq $1, %xmm3, %rax -; KNL_WIDEN-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm3 -; KNL_WIDEN-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] -; KNL_WIDEN-NEXT: vpextrq $1, %xmm0, %rax -; KNL_WIDEN-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm3 -; KNL_WIDEN-NEXT: vmovq %xmm0, %rax -; KNL_WIDEN-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm4 -; KNL_WIDEN-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] -; KNL_WIDEN-NEXT: vextracti128 $1, %ymm0, %xmm0 -; KNL_WIDEN-NEXT: vmovq %xmm0, %rax -; KNL_WIDEN-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm4 -; KNL_WIDEN-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] -; KNL_WIDEN-NEXT: vpextrq $1, %xmm0, %rax -; KNL_WIDEN-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm0 -; KNL_WIDEN-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0] -; KNL_WIDEN-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; KNL_WIDEN-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; KNL_WIDEN-NEXT: retq %b = uitofp <16 x i64> %a to <16 x float> ret <16 x float> %b } @@ -1946,12 +1259,6 @@ define <8 x double> @uito8f64_mask(<8 x double> %a, <8 x i32> %b, i8 %c) nounwin ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1} ; AVX512BW-NEXT: retq -; -; KNL_WIDEN-LABEL: uito8f64_mask: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: kmovw %edi, %k1 -; KNL_WIDEN-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1} -; KNL_WIDEN-NEXT: retq %1 = bitcast i8 %c to <8 x i1> %2 = uitofp <8 x i32> %b to <8 x double> %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> %a @@ -1987,12 +1294,6 @@ define <8 x double> @uito8f64_maskz(<8 x i32> %a, i8 %b) nounwind { ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: retq -; -; KNL_WIDEN-LABEL: uito8f64_maskz: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: kmovw %edi, %k1 -; KNL_WIDEN-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z} -; KNL_WIDEN-NEXT: retq %1 = bitcast i8 %b to <8 x i1> %2 = uitofp <8 x i32> %a to <8 x double> %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> zeroinitializer @@ -2011,13 +1312,6 @@ define <4 x double> @uito4f64(<4 x i32> %a) nounwind { ; VL: # %bb.0: ; VL-NEXT: vcvtudq2pd %xmm0, %ymm0 ; VL-NEXT: retq -; -; KNL_WIDEN-LABEL: uito4f64: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; KNL_WIDEN-NEXT: vcvtudq2pd %ymm0, %zmm0 -; KNL_WIDEN-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; KNL_WIDEN-NEXT: retq %b = uitofp <4 x i32> %a to <4 x double> ret <4 x double> %b } @@ -2027,11 +1321,6 @@ define <16 x float> @uito16f32(<16 x i32> %a) nounwind { ; ALL: # %bb.0: ; ALL-NEXT: vcvtudq2ps %zmm0, %zmm0 ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: uito16f32: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vcvtudq2ps %zmm0, %zmm0 -; KNL_WIDEN-NEXT: 
retq %b = uitofp <16 x i32> %a to <16 x float> ret <16 x float> %b } @@ -2041,11 +1330,6 @@ define <8 x double> @uito8f64(<8 x i32> %a) { ; ALL: # %bb.0: ; ALL-NEXT: vcvtudq2pd %ymm0, %zmm0 ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: uito8f64: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vcvtudq2pd %ymm0, %zmm0 -; KNL_WIDEN-NEXT: retq %b = uitofp <8 x i32> %a to <8 x double> ret <8 x double> %b } @@ -2062,13 +1346,6 @@ define <8 x float> @uito8f32(<8 x i32> %a) nounwind { ; VL: # %bb.0: ; VL-NEXT: vcvtudq2ps %ymm0, %ymm0 ; VL-NEXT: retq -; -; KNL_WIDEN-LABEL: uito8f32: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; KNL_WIDEN-NEXT: vcvtudq2ps %zmm0, %zmm0 -; KNL_WIDEN-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; KNL_WIDEN-NEXT: retq %b = uitofp <8 x i32> %a to <8 x float> ret <8 x float> %b } @@ -2086,14 +1363,6 @@ define <4 x float> @uito4f32(<4 x i32> %a) nounwind { ; VL: # %bb.0: ; VL-NEXT: vcvtudq2ps %xmm0, %xmm0 ; VL-NEXT: retq -; -; KNL_WIDEN-LABEL: uito4f32: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; KNL_WIDEN-NEXT: vcvtudq2ps %zmm0, %zmm0 -; KNL_WIDEN-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; KNL_WIDEN-NEXT: vzeroupper -; KNL_WIDEN-NEXT: retq %b = uitofp <4 x i32> %a to <4 x float> ret <4 x float> %b } @@ -2103,11 +1372,6 @@ define i32 @fptosi(float %a) nounwind { ; ALL: # %bb.0: ; ALL-NEXT: vcvttss2si %xmm0, %eax ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: fptosi: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vcvttss2si %xmm0, %eax -; KNL_WIDEN-NEXT: retq %b = fptosi float %a to i32 ret i32 %b } @@ -2117,11 +1381,6 @@ define i32 @fptoui(float %a) nounwind { ; ALL: # %bb.0: ; ALL-NEXT: vcvttss2usi %xmm0, %eax ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: fptoui: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vcvttss2usi %xmm0, %eax -; KNL_WIDEN-NEXT: retq %b = fptoui float %a to i32 ret i32 %b } @@ -2131,11 +1390,6 @@ define float @uitof32(i32 %a) nounwind { ; ALL: # %bb.0: ; ALL-NEXT: vcvtusi2ssl %edi, %xmm0, %xmm0 ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: uitof32: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vcvtusi2ssl %edi, %xmm0, %xmm0 -; KNL_WIDEN-NEXT: retq %b = uitofp i32 %a to float ret float %b } @@ -2145,11 +1399,6 @@ define double @uitof64(i32 %a) nounwind { ; ALL: # %bb.0: ; ALL-NEXT: vcvtusi2sdl %edi, %xmm0, %xmm0 ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: uitof64: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vcvtusi2sdl %edi, %xmm0, %xmm0 -; KNL_WIDEN-NEXT: retq %b = uitofp i32 %a to double ret double %b } @@ -2176,14 +1425,6 @@ define <16 x float> @sbto16f32(<16 x i32> %a) { ; DQNOVL-NEXT: vpmovm2d %k0, %zmm0 ; DQNOVL-NEXT: vcvtdq2ps %zmm0, %zmm0 ; DQNOVL-NEXT: retq -; -; KNL_WIDEN-LABEL: sbto16f32: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; KNL_WIDEN-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 -; KNL_WIDEN-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; KNL_WIDEN-NEXT: vcvtdq2ps %zmm0, %zmm0 -; KNL_WIDEN-NEXT: retq %mask = icmp slt <16 x i32> %a, zeroinitializer %1 = sitofp <16 x i1> %mask to <16 x float> ret <16 x float> %1 @@ -2195,12 +1436,6 @@ define <16 x float> @scto16f32(<16 x i8> %a) { ; ALL-NEXT: vpmovsxbd %xmm0, %zmm0 ; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0 ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: scto16f32: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vpmovsxbd %xmm0, %zmm0 -; KNL_WIDEN-NEXT: vcvtdq2ps %zmm0, %zmm0 -; KNL_WIDEN-NEXT: retq %1 = sitofp <16 x i8> %a to <16 x float> ret <16 x float> %1 } @@ -2211,12 +1446,6 @@ define <16 x float> @ssto16f32(<16 x i16> %a) { ; 
ALL-NEXT: vpmovsxwd %ymm0, %zmm0 ; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0 ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: ssto16f32: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vpmovsxwd %ymm0, %zmm0 -; KNL_WIDEN-NEXT: vcvtdq2ps %zmm0, %zmm0 -; KNL_WIDEN-NEXT: retq %1 = sitofp <16 x i16> %a to <16 x float> ret <16 x float> %1 } @@ -2227,12 +1456,6 @@ define <8 x double> @ssto16f64(<8 x i16> %a) { ; ALL-NEXT: vpmovsxwd %xmm0, %ymm0 ; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0 ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: ssto16f64: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vpmovsxwd %xmm0, %ymm0 -; KNL_WIDEN-NEXT: vcvtdq2pd %ymm0, %zmm0 -; KNL_WIDEN-NEXT: retq %1 = sitofp <8 x i16> %a to <8 x double> ret <8 x double> %1 } @@ -2245,12 +1468,6 @@ define <8 x double> @scto8f64(<8 x i8> %a) { ; ALL-NEXT: vpsrad $24, %ymm0, %ymm0 ; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0 ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: scto8f64: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vpmovsxbd %xmm0, %ymm0 -; KNL_WIDEN-NEXT: vcvtdq2pd %ymm0, %zmm0 -; KNL_WIDEN-NEXT: retq %1 = sitofp <8 x i8> %a to <8 x double> ret <8 x double> %1 } @@ -2263,14 +1480,6 @@ define <16 x double> @scto16f64(<16 x i8> %a) { ; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; ALL-NEXT: vcvtdq2pd %ymm1, %zmm1 ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: scto16f64: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vpmovsxbd %xmm0, %zmm1 -; KNL_WIDEN-NEXT: vcvtdq2pd %ymm1, %zmm0 -; KNL_WIDEN-NEXT: vextracti64x4 $1, %zmm1, %ymm1 -; KNL_WIDEN-NEXT: vcvtdq2pd %ymm1, %zmm1 -; KNL_WIDEN-NEXT: retq %b = sitofp <16 x i8> %a to <16 x double> ret <16 x double> %b } @@ -2311,18 +1520,6 @@ define <16 x double> @sbto16f64(<16 x double> %a) { ; DQNOVL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; DQNOVL-NEXT: vcvtdq2pd %ymm1, %zmm1 ; DQNOVL-NEXT: retq -; -; KNL_WIDEN-LABEL: sbto16f64: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vxorpd %xmm2, %xmm2, %xmm2 -; KNL_WIDEN-NEXT: vcmpltpd %zmm0, %zmm2, %k0 -; KNL_WIDEN-NEXT: vcmpltpd %zmm1, %zmm2, %k1 -; KNL_WIDEN-NEXT: kunpckbw %k0, %k1, %k1 -; KNL_WIDEN-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; KNL_WIDEN-NEXT: vcvtdq2pd %ymm1, %zmm0 -; KNL_WIDEN-NEXT: vextracti64x4 $1, %zmm1, %ymm1 -; KNL_WIDEN-NEXT: vcvtdq2pd %ymm1, %zmm1 -; KNL_WIDEN-NEXT: retq %cmpres = fcmp ogt <16 x double> %a, zeroinitializer %1 = sitofp <16 x i1> %cmpres to <16 x double> ret <16 x double> %1 @@ -2361,14 +1558,6 @@ define <8 x double> @sbto8f64(<8 x double> %a) { ; DQNOVL-NEXT: vpmovm2d %k0, %zmm0 ; DQNOVL-NEXT: vcvtdq2pd %ymm0, %zmm0 ; DQNOVL-NEXT: retq -; -; KNL_WIDEN-LABEL: sbto8f64: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; KNL_WIDEN-NEXT: vcmpltpd %zmm0, %zmm1, %k1 -; KNL_WIDEN-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; KNL_WIDEN-NEXT: vcvtdq2pd %ymm0, %zmm0 -; KNL_WIDEN-NEXT: retq %cmpres = fcmp ogt <8 x double> %a, zeroinitializer %1 = sitofp <8 x i1> %cmpres to <8 x double> ret <8 x double> %1 @@ -2381,13 +1570,6 @@ define <8 x float> @sbto8f32(<8 x float> %a) { ; ALL-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; ALL-NEXT: vcvtdq2ps %ymm0, %ymm0 ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: sbto8f32: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; KNL_WIDEN-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 -; KNL_WIDEN-NEXT: vcvtdq2ps %ymm0, %ymm0 -; KNL_WIDEN-NEXT: retq %cmpres = fcmp ogt <8 x float> %a, zeroinitializer %1 = sitofp <8 x i1> %cmpres to <8 x float> ret <8 x float> %1 @@ -2400,13 +1582,6 @@ define <4 x float> @sbto4f32(<4 x float> %a) { ; ALL-NEXT: vcmpltps %xmm0, %xmm1, %xmm0 ; ALL-NEXT: vcvtdq2ps %xmm0, %xmm0 ; ALL-NEXT: retq -; -; 
KNL_WIDEN-LABEL: sbto4f32: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; KNL_WIDEN-NEXT: vcmpltps %xmm0, %xmm1, %xmm0 -; KNL_WIDEN-NEXT: vcvtdq2ps %xmm0, %xmm0 -; KNL_WIDEN-NEXT: retq %cmpres = fcmp ogt <4 x float> %a, zeroinitializer %1 = sitofp <4 x i1> %cmpres to <4 x float> ret <4 x float> %1 @@ -2437,14 +1612,6 @@ define <4 x double> @sbto4f64(<4 x double> %a) { ; VLNODQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; VLNODQ-NEXT: vcvtdq2pd %xmm0, %ymm0 ; VLNODQ-NEXT: retq -; -; KNL_WIDEN-LABEL: sbto4f64: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; KNL_WIDEN-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 -; KNL_WIDEN-NEXT: vpmovqd %zmm0, %ymm0 -; KNL_WIDEN-NEXT: vcvtdq2pd %xmm0, %ymm0 -; KNL_WIDEN-NEXT: retq %cmpres = fcmp ogt <4 x double> %a, zeroinitializer %1 = sitofp <4 x i1> %cmpres to <4 x double> ret <4 x double> %1 @@ -2457,13 +1624,6 @@ define <2 x float> @sbto2f32(<2 x float> %a) { ; ALL-NEXT: vcmpltps %xmm0, %xmm1, %xmm0 ; ALL-NEXT: vcvtdq2ps %xmm0, %xmm0 ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: sbto2f32: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; KNL_WIDEN-NEXT: vcmpltps %xmm0, %xmm1, %xmm0 -; KNL_WIDEN-NEXT: vcvtdq2ps %xmm0, %xmm0 -; KNL_WIDEN-NEXT: retq %cmpres = fcmp ogt <2 x float> %a, zeroinitializer %1 = sitofp <2 x i1> %cmpres to <2 x float> ret <2 x float> %1 @@ -2477,15 +1637,6 @@ define <2 x double> @sbto2f64(<2 x double> %a) { ; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; ALL-NEXT: vcvtdq2pd %xmm0, %xmm0 ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: sbto2f64: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; KNL_WIDEN-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 -; KNL_WIDEN-NEXT: vpmovqd %zmm0, %ymm0 -; KNL_WIDEN-NEXT: vcvtdq2pd %xmm0, %xmm0 -; KNL_WIDEN-NEXT: vzeroupper -; KNL_WIDEN-NEXT: retq %cmpres = fcmp ogt <2 x double> %a, zeroinitializer %1 = sitofp <2 x i1> %cmpres to <2 x double> ret <2 x double> %1 @@ -2497,12 +1648,6 @@ define <16 x float> @ucto16f32(<16 x i8> %a) { ; ALL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0 ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: ucto16f32: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; KNL_WIDEN-NEXT: vcvtdq2ps %zmm0, %zmm0 -; KNL_WIDEN-NEXT: retq %b = uitofp <16 x i8> %a to <16 x float> ret <16 x float>%b } @@ -2514,12 +1659,6 @@ define <8 x double> @ucto8f64(<8 x i8> %a) { ; ALL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0 ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: ucto8f64: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vpmovzxbd {{.*#+}} ymm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; KNL_WIDEN-NEXT: vcvtdq2pd %ymm0, %zmm0 -; KNL_WIDEN-NEXT: retq %b = uitofp <8 x i8> %a to <8 x double> ret <8 x double> %b } @@ -2530,12 +1669,6 @@ define <16 x float> @swto16f32(<16 x i16> %a) { ; ALL-NEXT: vpmovsxwd %ymm0, %zmm0 ; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0 ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: swto16f32: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vpmovsxwd %ymm0, %zmm0 -; KNL_WIDEN-NEXT: vcvtdq2ps %zmm0, %zmm0 -; KNL_WIDEN-NEXT: retq %b = sitofp <16 x i16> %a to <16 x float> ret <16 x float> %b } @@ -2546,12 +1679,6 @@ define <8 x double> @swto8f64(<8 x i16> %a) { ; ALL-NEXT: vpmovsxwd %xmm0, %ymm0 ; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0 ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: swto8f64: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vpmovsxwd %xmm0, %ymm0 -; KNL_WIDEN-NEXT: vcvtdq2pd %ymm0, %zmm0 -; KNL_WIDEN-NEXT: retq %b = sitofp <8 x i16> %a to <8 x double> ret <8 x double> %b } @@ -2564,14 +1691,6 @@ define <16 x double> @swto16f64(<16 x i16> %a) { ; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; ALL-NEXT: vcvtdq2pd %ymm1, %zmm1 ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: swto16f64: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vpmovsxwd %ymm0, %zmm1 -; KNL_WIDEN-NEXT: vcvtdq2pd %ymm1, %zmm0 -; KNL_WIDEN-NEXT: vextracti64x4 $1, %zmm1, %ymm1 -; KNL_WIDEN-NEXT: vcvtdq2pd %ymm1, %zmm1 -; KNL_WIDEN-NEXT: retq %b = sitofp <16 x i16> %a to <16 x double> ret <16 x double> %b } @@ -2584,14 +1703,6 @@ define <16 x double> @ucto16f64(<16 x i8> %a) { ; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; ALL-NEXT: vcvtdq2pd %ymm1, %zmm1 ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: ucto16f64: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; KNL_WIDEN-NEXT: vcvtdq2pd %ymm1, %zmm0 -; KNL_WIDEN-NEXT: vextracti64x4 $1, %zmm1, %ymm1 -; KNL_WIDEN-NEXT: vcvtdq2pd %ymm1, %zmm1 -; KNL_WIDEN-NEXT: retq %b = uitofp <16 x i8> %a to <16 x double> ret <16 x double> %b } @@ -2602,12 +1713,6 @@ define <16 x float> @uwto16f32(<16 x i16> %a) { ; ALL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0 ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: uwto16f32: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; KNL_WIDEN-NEXT: vcvtdq2ps %zmm0, %zmm0 -; KNL_WIDEN-NEXT: retq %b = uitofp <16 x i16> %a to <16 x float> ret <16 x float> %b } @@ -2618,12 +1723,6 @@ define <8 x double> @uwto8f64(<8 x i16> %a) { ; ALL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0 ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: uwto8f64: 
-; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; KNL_WIDEN-NEXT: vcvtdq2pd %ymm0, %zmm0 -; KNL_WIDEN-NEXT: retq %b = uitofp <8 x i16> %a to <8 x double> ret <8 x double> %b } @@ -2636,14 +1735,6 @@ define <16 x double> @uwto16f64(<16 x i16> %a) { ; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; ALL-NEXT: vcvtdq2pd %ymm1, %zmm1 ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: uwto16f64: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; KNL_WIDEN-NEXT: vcvtdq2pd %ymm1, %zmm0 -; KNL_WIDEN-NEXT: vextracti64x4 $1, %zmm1, %ymm1 -; KNL_WIDEN-NEXT: vcvtdq2pd %ymm1, %zmm1 -; KNL_WIDEN-NEXT: retq %b = uitofp <16 x i16> %a to <16 x double> ret <16 x double> %b } @@ -2653,11 +1744,6 @@ define <16 x float> @sito16f32(<16 x i32> %a) { ; ALL: # %bb.0: ; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0 ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: sito16f32: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vcvtdq2ps %zmm0, %zmm0 -; KNL_WIDEN-NEXT: retq %b = sitofp <16 x i32> %a to <16 x float> ret <16 x float> %b } @@ -2670,14 +1756,6 @@ define <16 x double> @sito16f64(<16 x i32> %a) { ; ALL-NEXT: vcvtdq2pd %ymm0, %zmm1 ; ALL-NEXT: vmovaps %zmm2, %zmm0 ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: sito16f64: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vcvtdq2pd %ymm0, %zmm2 -; KNL_WIDEN-NEXT: vextractf64x4 $1, %zmm0, %ymm0 -; KNL_WIDEN-NEXT: vcvtdq2pd %ymm0, %zmm1 -; KNL_WIDEN-NEXT: vmovaps %zmm2, %zmm0 -; KNL_WIDEN-NEXT: retq %b = sitofp <16 x i32> %a to <16 x double> ret <16 x double> %b } @@ -2688,12 +1766,6 @@ define <16 x float> @usto16f32(<16 x i16> %a) { ; ALL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0 ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: usto16f32: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; KNL_WIDEN-NEXT: vcvtdq2ps %zmm0, %zmm0 -; KNL_WIDEN-NEXT: retq %b = uitofp <16 x i16> %a to <16 x float> ret <16 x float> %b } @@ -2723,15 +1795,6 @@ define <16 x float> @ubto16f32(<16 x i32> %a) { ; DQNOVL-NEXT: vpsrld $31, %zmm0, %zmm0 ; DQNOVL-NEXT: vcvtdq2ps %zmm0, %zmm0 ; DQNOVL-NEXT: retq -; -; KNL_WIDEN-LABEL: ubto16f32: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; KNL_WIDEN-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 -; KNL_WIDEN-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; KNL_WIDEN-NEXT: vpsrld $31, %zmm0, %zmm0 -; KNL_WIDEN-NEXT: vcvtdq2ps %zmm0, %zmm0 -; KNL_WIDEN-NEXT: retq %mask = icmp slt <16 x i32> %a, zeroinitializer %1 = uitofp <16 x i1> %mask to <16 x float> ret <16 x float> %1 @@ -2768,17 +1831,6 @@ define <16 x double> @ubto16f64(<16 x i32> %a) { ; DQNOVL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; DQNOVL-NEXT: vcvtdq2pd %ymm1, %zmm1 ; DQNOVL-NEXT: retq -; -; KNL_WIDEN-LABEL: ubto16f64: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; KNL_WIDEN-NEXT: vpcmpgtd %zmm0, 
%zmm1, %k1 -; KNL_WIDEN-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; KNL_WIDEN-NEXT: vpsrld $31, %zmm0, %zmm1 -; KNL_WIDEN-NEXT: vcvtdq2pd %ymm1, %zmm0 -; KNL_WIDEN-NEXT: vextracti64x4 $1, %zmm1, %ymm1 -; KNL_WIDEN-NEXT: vcvtdq2pd %ymm1, %zmm1 -; KNL_WIDEN-NEXT: retq %mask = icmp slt <16 x i32> %a, zeroinitializer %1 = uitofp <16 x i1> %mask to <16 x double> ret <16 x double> %1 @@ -2799,14 +1851,6 @@ define <8 x float> @ubto8f32(<8 x i32> %a) { ; VL-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm0 ; VL-NEXT: vpandd {{.*}}(%rip){1to8}, %ymm0, %ymm0 ; VL-NEXT: retq -; -; KNL_WIDEN-LABEL: ubto8f32: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; KNL_WIDEN-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm0 -; KNL_WIDEN-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1065353216,1065353216,1065353216,1065353216,1065353216,1065353216,1065353216,1065353216] -; KNL_WIDEN-NEXT: vpand %ymm1, %ymm0, %ymm0 -; KNL_WIDEN-NEXT: retq %mask = icmp slt <8 x i32> %a, zeroinitializer %1 = uitofp <8 x i1> %mask to <8 x float> ret <8 x float> %1 @@ -2820,14 +1864,6 @@ define <8 x double> @ubto8f64(<8 x i32> %a) { ; ALL-NEXT: vpsrld $31, %ymm0, %ymm0 ; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0 ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: ubto8f64: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; KNL_WIDEN-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm0 -; KNL_WIDEN-NEXT: vpsrld $31, %ymm0, %ymm0 -; KNL_WIDEN-NEXT: vcvtdq2pd %ymm0, %zmm0 -; KNL_WIDEN-NEXT: retq %mask = icmp slt <8 x i32> %a, zeroinitializer %1 = uitofp <8 x i1> %mask to <8 x double> ret <8 x double> %1 @@ -2848,14 +1884,6 @@ define <4 x float> @ubto4f32(<4 x i32> %a) { ; VL-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; VL-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm0, %xmm0 ; VL-NEXT: retq -; -; KNL_WIDEN-LABEL: ubto4f32: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; KNL_WIDEN-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; KNL_WIDEN-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1065353216,1065353216,1065353216,1065353216] -; KNL_WIDEN-NEXT: vpand %xmm1, %xmm0, %xmm0 -; KNL_WIDEN-NEXT: retq %mask = icmp slt <4 x i32> %a, zeroinitializer %1 = uitofp <4 x i1> %mask to <4 x float> ret <4 x float> %1 @@ -2869,14 +1897,6 @@ define <4 x double> @ubto4f64(<4 x i32> %a) { ; ALL-NEXT: vpsrld $31, %xmm0, %xmm0 ; ALL-NEXT: vcvtdq2pd %xmm0, %ymm0 ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: ubto4f64: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; KNL_WIDEN-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; KNL_WIDEN-NEXT: vpsrld $31, %xmm0, %xmm0 -; KNL_WIDEN-NEXT: vcvtdq2pd %xmm0, %ymm0 -; KNL_WIDEN-NEXT: retq %mask = icmp slt <4 x i32> %a, zeroinitializer %1 = uitofp <4 x i1> %mask to <4 x double> ret <4 x double> %1 @@ -2891,14 +1911,6 @@ define <2 x float> @ubto2f32(<2 x i32> %a) { ; ALL-NEXT: vpandn {{.*}}(%rip), %xmm0, %xmm0 ; ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: ubto2f32: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; KNL_WIDEN-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; KNL_WIDEN-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1065353216,1065353216,1065353216,1065353216] -; KNL_WIDEN-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; KNL_WIDEN-NEXT: retq %mask = icmp ne <2 x i32> %a, zeroinitializer %1 = uitofp <2 x i1> %mask to <2 x float> ret <2 x float> %1 @@ -2914,15 +1926,6 @@ define <2 x double> @ubto2f64(<2 x i32> %a) { ; ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; ALL-NEXT: vcvtdq2pd %xmm0, %xmm0 ; ALL-NEXT: retq -; -; KNL_WIDEN-LABEL: ubto2f64: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vpxor %xmm1, %xmm1, %xmm1 
-; KNL_WIDEN-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; KNL_WIDEN-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] -; KNL_WIDEN-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; KNL_WIDEN-NEXT: vcvtdq2pd %xmm0, %xmm0 -; KNL_WIDEN-NEXT: retq %mask = icmp ne <2 x i32> %a, zeroinitializer %1 = uitofp <2 x i1> %mask to <2 x double> ret <2 x double> %1 @@ -2968,18 +1971,6 @@ define <2 x i64> @test_2f64toub(<2 x double> %a, <2 x i64> %passthru) { ; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; DQNOVL-NEXT: vzeroupper ; DQNOVL-NEXT: retq -; -; KNL_WIDEN-LABEL: test_2f64toub: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; KNL_WIDEN-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; KNL_WIDEN-NEXT: vcvttpd2udq %zmm0, %ymm0 -; KNL_WIDEN-NEXT: vpslld $31, %ymm0, %ymm0 -; KNL_WIDEN-NEXT: vptestmd %zmm0, %zmm0, %k1 -; KNL_WIDEN-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} -; KNL_WIDEN-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; KNL_WIDEN-NEXT: vzeroupper -; KNL_WIDEN-NEXT: retq %mask = fptoui <2 x double> %a to <2 x i1> %select = select <2 x i1> %mask, <2 x i64> %passthru, <2 x i64> zeroinitializer ret <2 x i64> %select @@ -3021,16 +2012,6 @@ define <4 x i64> @test_4f64toub(<4 x double> %a, <4 x i64> %passthru) { ; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} ; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; DQNOVL-NEXT: retq -; -; KNL_WIDEN-LABEL: test_4f64toub: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; KNL_WIDEN-NEXT: vcvttpd2dq %ymm0, %xmm0 -; KNL_WIDEN-NEXT: vpslld $31, %xmm0, %xmm0 -; KNL_WIDEN-NEXT: vptestmd %zmm0, %zmm0, %k1 -; KNL_WIDEN-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} -; KNL_WIDEN-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; KNL_WIDEN-NEXT: retq %mask = fptoui <4 x double> %a to <4 x i1> %select = select <4 x i1> %mask, <4 x i64> %passthru, <4 x i64> zeroinitializer ret <4 x i64> %select @@ -3068,14 +2049,6 @@ define <8 x i64> @test_8f64toub(<8 x double> %a, <8 x i64> %passthru) { ; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 ; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} ; DQNOVL-NEXT: retq -; -; KNL_WIDEN-LABEL: test_8f64toub: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vcvttpd2dq %zmm0, %ymm0 -; KNL_WIDEN-NEXT: vpslld $31, %ymm0, %ymm0 -; KNL_WIDEN-NEXT: vptestmd %zmm0, %zmm0, %k1 -; KNL_WIDEN-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} -; KNL_WIDEN-NEXT: retq %mask = fptoui <8 x double> %a to <8 x i1> %select = select <8 x i1> %mask, <8 x i64> %passthru, <8 x i64> zeroinitializer ret <8 x i64> %select @@ -3119,17 +2092,6 @@ define <2 x i64> @test_2f32toub(<2 x float> %a, <2 x i64> %passthru) { ; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; DQNOVL-NEXT: vzeroupper ; DQNOVL-NEXT: retq -; -; KNL_WIDEN-LABEL: test_2f32toub: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; KNL_WIDEN-NEXT: vcvttps2dq %xmm0, %xmm0 -; KNL_WIDEN-NEXT: vpslld $31, %xmm0, %xmm0 -; KNL_WIDEN-NEXT: vptestmd %zmm0, %zmm0, %k1 -; KNL_WIDEN-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} -; KNL_WIDEN-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; KNL_WIDEN-NEXT: vzeroupper -; KNL_WIDEN-NEXT: retq %mask = fptoui <2 x float> %a to <2 x i1> %select = select <2 x i1> %mask, <2 x i64> %passthru, <2 x i64> zeroinitializer ret <2 x i64> %select @@ -3171,16 +2133,6 @@ define <4 x i64> @test_4f32toub(<4 x float> %a, <4 x i64> %passthru) { ; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} ; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; DQNOVL-NEXT: retq -; -; KNL_WIDEN-LABEL: test_4f32toub: 
-; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; KNL_WIDEN-NEXT: vcvttps2dq %xmm0, %xmm0 -; KNL_WIDEN-NEXT: vpslld $31, %xmm0, %xmm0 -; KNL_WIDEN-NEXT: vptestmd %zmm0, %zmm0, %k1 -; KNL_WIDEN-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} -; KNL_WIDEN-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; KNL_WIDEN-NEXT: retq %mask = fptoui <4 x float> %a to <4 x i1> %select = select <4 x i1> %mask, <4 x i64> %passthru, <4 x i64> zeroinitializer ret <4 x i64> %select @@ -3218,14 +2170,6 @@ define <8 x i64> @test_8f32toub(<8 x float> %a, <8 x i64> %passthru) { ; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 ; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} ; DQNOVL-NEXT: retq -; -; KNL_WIDEN-LABEL: test_8f32toub: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vcvttps2dq %ymm0, %ymm0 -; KNL_WIDEN-NEXT: vpslld $31, %ymm0, %ymm0 -; KNL_WIDEN-NEXT: vptestmd %zmm0, %zmm0, %k1 -; KNL_WIDEN-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} -; KNL_WIDEN-NEXT: retq %mask = fptoui <8 x float> %a to <8 x i1> %select = select <8 x i1> %mask, <8 x i64> %passthru, <8 x i64> zeroinitializer ret <8 x i64> %select @@ -3255,14 +2199,6 @@ define <16 x i32> @test_16f32toub(<16 x float> %a, <16 x i32> %passthru) { ; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 ; DQNOVL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z} ; DQNOVL-NEXT: retq -; -; KNL_WIDEN-LABEL: test_16f32toub: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vcvttps2dq %zmm0, %zmm0 -; KNL_WIDEN-NEXT: vpslld $31, %zmm0, %zmm0 -; KNL_WIDEN-NEXT: vptestmd %zmm0, %zmm0, %k1 -; KNL_WIDEN-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z} -; KNL_WIDEN-NEXT: retq %mask = fptoui <16 x float> %a to <16 x i1> %select = select <16 x i1> %mask, <16 x i32> %passthru, <16 x i32> zeroinitializer ret <16 x i32> %select @@ -3306,17 +2242,6 @@ define <2 x i64> @test_2f64tosb(<2 x double> %a, <2 x i64> %passthru) { ; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; DQNOVL-NEXT: vzeroupper ; DQNOVL-NEXT: retq -; -; KNL_WIDEN-LABEL: test_2f64tosb: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; KNL_WIDEN-NEXT: vcvttpd2dq %xmm0, %xmm0 -; KNL_WIDEN-NEXT: vpslld $31, %xmm0, %xmm0 -; KNL_WIDEN-NEXT: vptestmd %zmm0, %zmm0, %k1 -; KNL_WIDEN-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} -; KNL_WIDEN-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; KNL_WIDEN-NEXT: vzeroupper -; KNL_WIDEN-NEXT: retq %mask = fptosi <2 x double> %a to <2 x i1> %select = select <2 x i1> %mask, <2 x i64> %passthru, <2 x i64> zeroinitializer ret <2 x i64> %select @@ -3354,15 +2279,6 @@ define <4 x i64> @test_4f64tosb(<4 x double> %a, <4 x i64> %passthru) { ; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} ; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; DQNOVL-NEXT: retq -; -; KNL_WIDEN-LABEL: test_4f64tosb: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; KNL_WIDEN-NEXT: vcvttpd2dq %ymm0, %xmm0 -; KNL_WIDEN-NEXT: vptestmd %zmm0, %zmm0, %k1 -; KNL_WIDEN-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} -; KNL_WIDEN-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; KNL_WIDEN-NEXT: retq %mask = fptosi <4 x double> %a to <4 x i1> %select = select <4 x i1> %mask, <4 x i64> %passthru, <4 x i64> zeroinitializer ret <4 x i64> %select @@ -3396,13 +2312,6 @@ define <8 x i64> @test_8f64tosb(<8 x double> %a, <8 x i64> %passthru) { ; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 ; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} ; DQNOVL-NEXT: retq -; -; KNL_WIDEN-LABEL: test_8f64tosb: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vcvttpd2dq %zmm0, %ymm0 -; KNL_WIDEN-NEXT: vptestmd %zmm0, 
%zmm0, %k1 -; KNL_WIDEN-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} -; KNL_WIDEN-NEXT: retq %mask = fptosi <8 x double> %a to <8 x i1> %select = select <8 x i1> %mask, <8 x i64> %passthru, <8 x i64> zeroinitializer ret <8 x i64> %select @@ -3442,16 +2351,6 @@ define <2 x i64> @test_2f32tosb(<2 x float> %a, <2 x i64> %passthru) { ; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; DQNOVL-NEXT: vzeroupper ; DQNOVL-NEXT: retq -; -; KNL_WIDEN-LABEL: test_2f32tosb: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; KNL_WIDEN-NEXT: vcvttps2dq %xmm0, %xmm0 -; KNL_WIDEN-NEXT: vptestmd %zmm0, %zmm0, %k1 -; KNL_WIDEN-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} -; KNL_WIDEN-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; KNL_WIDEN-NEXT: vzeroupper -; KNL_WIDEN-NEXT: retq %mask = fptosi <2 x float> %a to <2 x i1> %select = select <2 x i1> %mask, <2 x i64> %passthru, <2 x i64> zeroinitializer ret <2 x i64> %select @@ -3489,15 +2388,6 @@ define <4 x i64> @test_4f32tosb(<4 x float> %a, <4 x i64> %passthru) { ; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} ; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; DQNOVL-NEXT: retq -; -; KNL_WIDEN-LABEL: test_4f32tosb: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; KNL_WIDEN-NEXT: vcvttps2dq %xmm0, %xmm0 -; KNL_WIDEN-NEXT: vptestmd %zmm0, %zmm0, %k1 -; KNL_WIDEN-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} -; KNL_WIDEN-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; KNL_WIDEN-NEXT: retq %mask = fptosi <4 x float> %a to <4 x i1> %select = select <4 x i1> %mask, <4 x i64> %passthru, <4 x i64> zeroinitializer ret <4 x i64> %select @@ -3531,13 +2421,6 @@ define <8 x i64> @test_8f32tosb(<8 x float> %a, <8 x i64> %passthru) { ; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 ; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} ; DQNOVL-NEXT: retq -; -; KNL_WIDEN-LABEL: test_8f32tosb: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vcvttps2dq %ymm0, %ymm0 -; KNL_WIDEN-NEXT: vptestmd %zmm0, %zmm0, %k1 -; KNL_WIDEN-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} -; KNL_WIDEN-NEXT: retq %mask = fptosi <8 x float> %a to <8 x i1> %select = select <8 x i1> %mask, <8 x i64> %passthru, <8 x i64> zeroinitializer ret <8 x i64> %select @@ -3564,13 +2447,6 @@ define <16 x i32> @test_16f32tosb(<16 x float> %a, <16 x i32> %passthru) { ; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 ; DQNOVL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z} ; DQNOVL-NEXT: retq -; -; KNL_WIDEN-LABEL: test_16f32tosb: -; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vcvttps2dq %zmm0, %zmm0 -; KNL_WIDEN-NEXT: vptestmd %zmm0, %zmm0, %k1 -; KNL_WIDEN-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z} -; KNL_WIDEN-NEXT: retq %mask = fptosi <16 x float> %a to <16 x i1> %select = select <16 x i1> %mask, <16 x i32> %passthru, <16 x i32> zeroinitializer ret <16 x i32> %select diff --git a/llvm/test/CodeGen/X86/shrink_vmul-widen.ll b/llvm/test/CodeGen/X86/shrink_vmul-widen.ll new file mode 100644 index 000000000000..a93de33dea77 --- /dev/null +++ b/llvm/test/CodeGen/X86/shrink_vmul-widen.ll @@ -0,0 +1,2730 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX1 +; RUN: llc < %s -x86-experimental-vector-widening-legalization 
-mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX2 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2 + +@c = external global i32*, align 8 + +; %val1 = load <2 x i8> +; %op1 = zext<2 x i32> %val1 +; %val2 = load <2 x i8> +; %op2 = zext<2 x i32> %val2 +; %rst = mul <2 x i32> %op1, %op2 +; +define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind { +; X86-SSE-LABEL: mul_2xi8: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movzwl (%edx,%ecx), %edx +; X86-SSE-NEXT: movd %edx, %xmm0 +; X86-SSE-NEXT: movzwl (%eax,%ecx), %eax +; X86-SSE-NEXT: movd %eax, %xmm1 +; X86-SSE-NEXT: pxor %xmm2, %xmm2 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X86-SSE-NEXT: pmullw %xmm0, %xmm1 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X86-SSE-NEXT: movd %xmm1, (%esi,%ecx,4) +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE-NEXT: movd %xmm0, 4(%esi,%ecx,4) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: mul_2xi8: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %edi +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX-NEXT: movl c, %esi +; X86-AVX-NEXT: movzbl 1(%edx,%ecx), %edi +; X86-AVX-NEXT: movzbl (%edx,%ecx), %edx +; X86-AVX-NEXT: vmovd %edx, %xmm0 +; X86-AVX-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 +; X86-AVX-NEXT: movzbl 1(%eax,%ecx), %edx +; X86-AVX-NEXT: movzbl (%eax,%ecx), %eax +; X86-AVX-NEXT: vmovd %eax, %xmm1 +; X86-AVX-NEXT: vpinsrd $1, %edx, %xmm1, %xmm1 +; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X86-AVX-NEXT: vpextrd $1, %xmm0, 4(%esi,%ecx,4) +; X86-AVX-NEXT: vmovd %xmm0, (%esi,%ecx,4) +; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: popl %edi +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi8: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movzwl (%rdi,%rdx), %ecx +; X64-SSE-NEXT: movd %ecx, %xmm0 +; X64-SSE-NEXT: movzwl (%rsi,%rdx), %ecx +; X64-SSE-NEXT: movd %ecx, %xmm1 +; X64-SSE-NEXT: pxor %xmm2, %xmm2 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X64-SSE-NEXT: 
pmullw %xmm0, %xmm1 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi8: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: movzbl 1(%rdi,%rdx), %ecx +; X64-AVX-NEXT: movzbl (%rdi,%rdx), %edi +; X64-AVX-NEXT: vmovd %edi, %xmm0 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; X64-AVX-NEXT: movzbl 1(%rsi,%rdx), %ecx +; X64-AVX-NEXT: movzbl (%rsi,%rdx), %esi +; X64-AVX-NEXT: vmovd %esi, %xmm1 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1 +; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) +; X64-AVX-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <2 x i8>* + %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1 + %tmp8 = zext <2 x i8> %wide.load to <2 x i32> + %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index + %tmp11 = bitcast i8* %tmp10 to <2 x i8>* + %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1 + %tmp12 = zext <2 x i8> %wide.load17 to <2 x i32> + %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8 + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <2 x i32>* + store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 + ret void +} + +; %val1 = load <4 x i8> +; %op1 = zext<4 x i32> %val1 +; %val2 = load <4 x i8> +; %op2 = zext<4 x i32> %val2 +; %rst = mul <4 x i32> %op1, %op2 +; +define void @mul_4xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind { +; X86-SSE-LABEL: mul_4xi8: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: pxor %xmm1, %xmm1 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-SSE-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; X86-SSE-NEXT: pmaddwd %xmm0, %xmm2 +; X86-SSE-NEXT: movdqu %xmm2, (%esi,%ecx,4) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: mul_4xi8: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX-NEXT: movl c, %esi +; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0 +; X86-AVX-NEXT: vmovdqu %xmm0, (%esi,%ecx,4) +; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_4xi8: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movd {{.*#+}} xmm0 = 
mem[0],zero,zero,zero +; X64-SSE-NEXT: pxor %xmm1, %xmm1 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-SSE-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; X64-SSE-NEXT: pmaddwd %xmm0, %xmm2 +; X64-SSE-NEXT: movdqu %xmm2, (%rax,%rdx,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_4xi8: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0 +; X64-AVX-NEXT: vmovdqu %xmm0, (%rax,%rdx,4) +; X64-AVX-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <4 x i8>* + %wide.load = load <4 x i8>, <4 x i8>* %tmp7, align 1 + %tmp8 = zext <4 x i8> %wide.load to <4 x i32> + %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index + %tmp11 = bitcast i8* %tmp10 to <4 x i8>* + %wide.load17 = load <4 x i8>, <4 x i8>* %tmp11, align 1 + %tmp12 = zext <4 x i8> %wide.load17 to <4 x i32> + %tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8 + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <4 x i32>* + store <4 x i32> %tmp13, <4 x i32>* %tmp15, align 4 + ret void +} + +; %val1 = load <8 x i8> +; %op1 = zext<8 x i32> %val1 +; %val2 = load <8 x i8> +; %op2 = zext<8 x i32> %val2 +; %rst = mul <8 x i32> %op1, %op2 +; +define void @mul_8xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind { +; X86-SSE-LABEL: mul_8xi8: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X86-SSE-NEXT: pxor %xmm2, %xmm2 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X86-SSE-NEXT: pmullw %xmm0, %xmm1 +; X86-SSE-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X86-SSE-NEXT: movdqu %xmm1, (%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm0, 16(%esi,%ecx,4) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl +; +; X86-AVX1-LABEL: mul_8xi8: +; X86-AVX1: # %bb.0: # %entry +; X86-AVX1-NEXT: pushl %esi +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), 
%edx +; X86-AVX1-NEXT: movl c, %esi +; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX1-NEXT: vpmaddwd %xmm0, %xmm2, %xmm0 +; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX1-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X86-AVX1-NEXT: vmovups %ymm0, (%esi,%ecx,4) +; X86-AVX1-NEXT: popl %esi +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: mul_8xi8: +; X86-AVX2: # %bb.0: # %entry +; X86-AVX2-NEXT: pushl %esi +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX2-NEXT: movl c, %esi +; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; X86-AVX2-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vmovdqu %ymm0, (%esi,%ecx,4) +; X86-AVX2-NEXT: popl %esi +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE-LABEL: mul_8xi8: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X64-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X64-SSE-NEXT: pxor %xmm2, %xmm2 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X64-SSE-NEXT: pmullw %xmm0, %xmm1 +; X64-SSE-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X64-SSE-NEXT: movdqu %xmm1, 16(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4) +; X64-SSE-NEXT: retq +; +; X64-AVX1-LABEL: mul_8xi8: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX1-NEXT: vpmaddwd %xmm0, %xmm2, %xmm0 +; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX1-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X64-AVX1-NEXT: vmovups %ymm0, (%rax,%rdx,4) +; X64-AVX1-NEXT: vzeroupper +; 
X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: mul_8xi8: +; X64-AVX2: # %bb.0: # %entry +; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; X64-AVX2-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vmovdqu %ymm0, (%rax,%rdx,4) +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <8 x i8>* + %wide.load = load <8 x i8>, <8 x i8>* %tmp7, align 1 + %tmp8 = zext <8 x i8> %wide.load to <8 x i32> + %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index + %tmp11 = bitcast i8* %tmp10 to <8 x i8>* + %wide.load17 = load <8 x i8>, <8 x i8>* %tmp11, align 1 + %tmp12 = zext <8 x i8> %wide.load17 to <8 x i32> + %tmp13 = mul nuw nsw <8 x i32> %tmp12, %tmp8 + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <8 x i32>* + store <8 x i32> %tmp13, <8 x i32>* %tmp15, align 4 + ret void +} + +; %val1 = load <16 x i8> +; %op1 = zext<16 x i32> %val1 +; %val2 = load <16 x i8> +; %op2 = zext<16 x i32> %val2 +; %rst = mul <16 x i32> %op1, %op2 +; +define void @mul_16xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind { +; X86-SSE-LABEL: mul_16xi8: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0 +; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm1 +; X86-SSE-NEXT: pxor %xmm2, %xmm2 +; X86-SSE-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; X86-SSE-NEXT: movdqa %xmm1, %xmm4 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; X86-SSE-NEXT: pmullw %xmm3, %xmm4 +; X86-SSE-NEXT: movdqa %xmm4, %xmm3 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; X86-SSE-NEXT: pmullw %xmm0, %xmm1 +; X86-SSE-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X86-SSE-NEXT: movdqu %xmm1, 48(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm0, 32(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm4, 16(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm3, (%esi,%ecx,4) +; 
X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl +; +; X86-AVX1-LABEL: mul_16xi8: +; X86-AVX1: # %bb.0: # %entry +; X86-AVX1-NEXT: pushl %esi +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX1-NEXT: movl c, %esi +; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX1-NEXT: vpmaddwd %xmm0, %xmm4, %xmm0 +; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX1-NEXT: vpmaddwd %xmm1, %xmm4, %xmm1 +; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2 +; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX1-NEXT: vpmaddwd %xmm3, %xmm4, %xmm3 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X86-AVX1-NEXT: vmovups %ymm0, 32(%esi,%ecx,4) +; X86-AVX1-NEXT: vmovups %ymm2, (%esi,%ecx,4) +; X86-AVX1-NEXT: popl %esi +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: mul_16xi8: +; X86-AVX2: # %bb.0: # %entry +; X86-AVX2-NEXT: pushl %esi +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX2-NEXT: movl c, %esi +; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; X86-AVX2-NEXT: vpmaddwd %ymm0, %ymm2, %ymm0 +; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; X86-AVX2-NEXT: vpmaddwd %ymm1, %ymm2, %ymm1 +; X86-AVX2-NEXT: vmovdqu %ymm0, 32(%esi,%ecx,4) +; X86-AVX2-NEXT: vmovdqu %ymm1, (%esi,%ecx,4) +; X86-AVX2-NEXT: popl %esi +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE-LABEL: mul_16xi8: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0 +; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm1 +; X64-SSE-NEXT: pxor %xmm2, %xmm2 +; X64-SSE-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm3 = 
xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; X64-SSE-NEXT: movdqa %xmm1, %xmm4 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; X64-SSE-NEXT: pmullw %xmm3, %xmm4 +; X64-SSE-NEXT: movdqa %xmm4, %xmm3 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; X64-SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; X64-SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; X64-SSE-NEXT: pmullw %xmm0, %xmm1 +; X64-SSE-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X64-SSE-NEXT: movdqu %xmm1, 48(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm0, 32(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm4, 16(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm3, (%rax,%rdx,4) +; X64-SSE-NEXT: retq +; +; X64-AVX1-LABEL: mul_16xi8: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX1-NEXT: vpmaddwd %xmm0, %xmm4, %xmm0 +; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX1-NEXT: vpmaddwd %xmm1, %xmm4, %xmm1 +; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2 +; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX1-NEXT: vpmaddwd %xmm3, %xmm4, %xmm3 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X64-AVX1-NEXT: vmovups %ymm0, 32(%rax,%rdx,4) +; X64-AVX1-NEXT: vmovups %ymm2, (%rax,%rdx,4) +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: mul_16xi8: +; X64-AVX2: # %bb.0: # %entry +; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; X64-AVX2-NEXT: vpmaddwd %ymm0, %ymm2, %ymm0 +; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; X64-AVX2-NEXT: vpmaddwd %ymm1, %ymm2, %ymm1 +; X64-AVX2-NEXT: vmovdqu %ymm0, 32(%rax,%rdx,4) +; X64-AVX2-NEXT: vmovdqu %ymm1, (%rax,%rdx,4) +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <16 x i8>* + %wide.load = load <16 x i8>, <16 x i8>* %tmp7, align 1 + %tmp8 = zext <16 x i8> %wide.load to <16 x i32> + %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index + %tmp11 = bitcast i8* %tmp10 to <16 x i8>* + %wide.load17 = load <16 x i8>, <16 x i8>* %tmp11, align 1 + %tmp12 = zext <16 x i8> %wide.load17 to <16 x i32> + %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8 + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <16 x i32>* + store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4 + ret void +} + +; %val1 = load <2 x i16> +; %op1 = zext<2 x i32> %val1 +; %val2 = load <2 x i16> +; %op2 = zext<2 x i32> %val2 +; %rst = mul <2 x i32> %op1, %op2 +; +define void @mul_2xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind { +; X86-SSE-LABEL: mul_2xi16: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE-NEXT: pmulhuw %xmm0, %xmm2 +; X86-SSE-NEXT: pmullw %xmm0, %xmm1 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X86-SSE-NEXT: movd %xmm1, (%esi,%ecx,4) +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE-NEXT: movd %xmm0, 4(%esi,%ecx,4) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: mul_2xi16: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX-NEXT: movl c, %esi +; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X86-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X86-AVX-NEXT: vpextrd $1, %xmm0, 4(%esi,%ecx,4) +; X86-AVX-NEXT: vmovd %xmm0, (%esi,%ecx,4) +; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi16: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; 
X64-SSE-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE-NEXT: pmulhuw %xmm0, %xmm2 +; X64-SSE-NEXT: pmullw %xmm0, %xmm1 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi16: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X64-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) +; X64-AVX-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <2 x i16>* + %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1 + %tmp8 = zext <2 x i16> %wide.load to <2 x i32> + %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index + %tmp11 = bitcast i8* %tmp10 to <2 x i16>* + %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1 + %tmp12 = zext <2 x i16> %wide.load17 to <2 x i32> + %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8 + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <2 x i32>* + store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 + ret void +} + +; %val1 = load <4 x i16> +; %op1 = zext<4 x i32> %val1 +; %val2 = load <4 x i16> +; %op2 = zext<4 x i32> %val2 +; %rst = mul <4 x i32> %op1, %op2 +; +define void @mul_4xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind { +; X86-SSE-LABEL: mul_4xi16: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X86-SSE-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE-NEXT: pmulhuw %xmm0, %xmm2 +; X86-SSE-NEXT: pmullw %xmm0, %xmm1 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X86-SSE-NEXT: movdqu %xmm1, (%esi,%ecx,4) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: mul_4xi16: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX-NEXT: movl c, %esi +; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X86-AVX-NEXT: vmovdqu %xmm0, (%esi,%ecx,4) +; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_4xi16: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X64-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X64-SSE-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE-NEXT: pmulhuw %xmm0, %xmm2 +; X64-SSE-NEXT: pmullw %xmm0, %xmm1 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X64-SSE-NEXT: movdqu %xmm1, (%rax,%rdx,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_4xi16: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq 
{{.*}}(%rip), %rax +; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X64-AVX-NEXT: vmovdqu %xmm0, (%rax,%rdx,4) +; X64-AVX-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <4 x i16>* + %wide.load = load <4 x i16>, <4 x i16>* %tmp7, align 1 + %tmp8 = zext <4 x i16> %wide.load to <4 x i32> + %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index + %tmp11 = bitcast i8* %tmp10 to <4 x i16>* + %wide.load17 = load <4 x i16>, <4 x i16>* %tmp11, align 1 + %tmp12 = zext <4 x i16> %wide.load17 to <4 x i32> + %tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8 + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <4 x i32>* + store <4 x i32> %tmp13, <4 x i32>* %tmp15, align 4 + ret void +} + +; %val1 = load <8 x i16> +; %op1 = zext<8 x i32> %val1 +; %val2 = load <8 x i16> +; %op2 = zext<8 x i32> %val2 +; %rst = mul <8 x i32> %op1, %op2 +; +define void @mul_8xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind { +; X86-SSE-LABEL: mul_8xi16: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0 +; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm1 +; X86-SSE-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE-NEXT: pmulhuw %xmm0, %xmm2 +; X86-SSE-NEXT: pmullw %xmm0, %xmm1 +; X86-SSE-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X86-SSE-NEXT: movdqu %xmm1, 16(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl +; +; X86-AVX1-LABEL: mul_8xi16: +; X86-AVX1: # %bb.0: # %entry +; X86-AVX1-NEXT: pushl %esi +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX1-NEXT: movl c, %esi +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vpmulld %xmm0, %xmm2, %xmm0 +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X86-AVX1-NEXT: vmovups %ymm0, (%esi,%ecx,4) +; X86-AVX1-NEXT: popl %esi +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: mul_8xi16: +; X86-AVX2: # %bb.0: # %entry +; X86-AVX2-NEXT: pushl %esi +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX2-NEXT: movl c, %esi +; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X86-AVX2-NEXT: vpmulld 
%ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vmovdqu %ymm0, (%esi,%ecx,4) +; X86-AVX2-NEXT: popl %esi +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE-LABEL: mul_8xi16: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0 +; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm1 +; X64-SSE-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE-NEXT: pmulhuw %xmm0, %xmm2 +; X64-SSE-NEXT: pmullw %xmm0, %xmm1 +; X64-SSE-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X64-SSE-NEXT: movdqu %xmm1, 16(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4) +; X64-SSE-NEXT: retq +; +; X64-AVX1-LABEL: mul_8xi16: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vpmulld %xmm0, %xmm2, %xmm0 +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X64-AVX1-NEXT: vmovups %ymm0, (%rax,%rdx,4) +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: mul_8xi16: +; X64-AVX2: # %bb.0: # %entry +; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X64-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vmovdqu %ymm0, (%rax,%rdx,4) +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <8 x i16>* + %wide.load = load <8 x i16>, <8 x i16>* %tmp7, align 1 + %tmp8 = zext <8 x i16> %wide.load to <8 x i32> + %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index + %tmp11 = bitcast i8* %tmp10 to <8 x i16>* + %wide.load17 = load <8 x i16>, <8 x i16>* %tmp11, align 1 + %tmp12 = zext <8 x i16> %wide.load17 to <8 x i32> + %tmp13 = mul nuw nsw <8 x i32> %tmp12, %tmp8 + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <8 x i32>* + store <8 x i32> %tmp13, <8 x i32>* %tmp15, align 4 + ret void +} + +; %val1 = load <16 x i16> +; %op1 = zext<16 x i32> %val1 +; %val2 = load <16 x i16> +; %op2 = zext<16 x i32> %val2 +; %rst = mul <16 x i32> %op1, %op2 +; +define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind { +; X86-SSE-LABEL: mul_16xi16: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0 +; X86-SSE-NEXT: movdqu 16(%edx,%ecx), %xmm1 +; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm2 +; X86-SSE-NEXT: movdqu 16(%eax,%ecx), %xmm3 +; X86-SSE-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE-NEXT: pmulhuw %xmm0, %xmm4 +; X86-SSE-NEXT: pmullw %xmm0, %xmm2 +; X86-SSE-NEXT: 
movdqa %xmm2, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; X86-SSE-NEXT: movdqa %xmm3, %xmm4 +; X86-SSE-NEXT: pmulhuw %xmm1, %xmm4 +; X86-SSE-NEXT: pmullw %xmm1, %xmm3 +; X86-SSE-NEXT: movdqa %xmm3, %xmm1 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; X86-SSE-NEXT: movdqu %xmm3, 48(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm1, 32(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm2, 16(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl +; +; X86-AVX1-LABEL: mul_16xi16: +; X86-AVX1: # %bb.0: # %entry +; X86-AVX1-NEXT: pushl %esi +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX1-NEXT: movl c, %esi +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0 +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1 +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X86-AVX1-NEXT: vmovups %ymm0, 32(%esi,%ecx,4) +; X86-AVX1-NEXT: vmovups %ymm2, (%esi,%ecx,4) +; X86-AVX1-NEXT: popl %esi +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: mul_16xi16: +; X86-AVX2: # %bb.0: # %entry +; X86-AVX2-NEXT: pushl %esi +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX2-NEXT: movl c, %esi +; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X86-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0 +; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X86-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1 +; X86-AVX2-NEXT: vmovdqu %ymm0, 32(%esi,%ecx,4) +; X86-AVX2-NEXT: vmovdqu %ymm1, (%esi,%ecx,4) +; X86-AVX2-NEXT: popl %esi +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE-LABEL: mul_16xi16: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0 +; X64-SSE-NEXT: 
movdqu 16(%rdi,%rdx), %xmm1 +; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm2 +; X64-SSE-NEXT: movdqu 16(%rsi,%rdx), %xmm3 +; X64-SSE-NEXT: movdqa %xmm2, %xmm4 +; X64-SSE-NEXT: pmulhuw %xmm0, %xmm4 +; X64-SSE-NEXT: pmullw %xmm0, %xmm2 +; X64-SSE-NEXT: movdqa %xmm2, %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; X64-SSE-NEXT: movdqa %xmm3, %xmm4 +; X64-SSE-NEXT: pmulhuw %xmm1, %xmm4 +; X64-SSE-NEXT: pmullw %xmm1, %xmm3 +; X64-SSE-NEXT: movdqa %xmm3, %xmm1 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; X64-SSE-NEXT: movdqu %xmm3, 48(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm1, 32(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm2, 16(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4) +; X64-SSE-NEXT: retq +; +; X64-AVX1-LABEL: mul_16xi16: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0 +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1 +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X64-AVX1-NEXT: vmovups %ymm0, 32(%rax,%rdx,4) +; X64-AVX1-NEXT: vmovups %ymm2, (%rax,%rdx,4) +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: mul_16xi16: +; X64-AVX2: # %bb.0: # %entry +; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X64-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0 +; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X64-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1 +; X64-AVX2-NEXT: vmovdqu %ymm0, 32(%rax,%rdx,4) +; X64-AVX2-NEXT: vmovdqu %ymm1, (%rax,%rdx,4) +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <16 x i16>* + %wide.load = load <16 x i16>, <16 x i16>* %tmp7, align 1 + %tmp8 = zext <16 x i16> %wide.load to <16 x i32> + %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index + %tmp11 = 
bitcast i8* %tmp10 to <16 x i16>* + %wide.load17 = load <16 x i16>, <16 x i16>* %tmp11, align 1 + %tmp12 = zext <16 x i16> %wide.load17 to <16 x i32> + %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8 + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <16 x i32>* + store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4 + ret void +} + +; %val1 = load <2 x i8> +; %op1 = sext<2 x i32> %val1 +; %val2 = load <2 x i8> +; %op2 = sext<2 x i32> %val2 +; %rst = mul <2 x i32> %op1, %op2 +; +define void @mul_2xi8_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind { +; X86-SSE-LABEL: mul_2xi8_sext: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movzwl (%edx,%ecx), %edx +; X86-SSE-NEXT: movd %edx, %xmm0 +; X86-SSE-NEXT: movzwl (%eax,%ecx), %eax +; X86-SSE-NEXT: movd %eax, %xmm1 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X86-SSE-NEXT: psraw $8, %xmm0 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X86-SSE-NEXT: psraw $8, %xmm1 +; X86-SSE-NEXT: pmullw %xmm0, %xmm1 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-SSE-NEXT: psrad $16, %xmm0 +; X86-SSE-NEXT: movd %xmm0, (%esi,%ecx,4) +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X86-SSE-NEXT: movd %xmm0, 4(%esi,%ecx,4) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: mul_2xi8_sext: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %edi +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX-NEXT: movl c, %esi +; X86-AVX-NEXT: movsbl 1(%edx,%ecx), %edi +; X86-AVX-NEXT: movsbl (%edx,%ecx), %edx +; X86-AVX-NEXT: vmovd %edx, %xmm0 +; X86-AVX-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 +; X86-AVX-NEXT: movsbl 1(%eax,%ecx), %edx +; X86-AVX-NEXT: movsbl (%eax,%ecx), %eax +; X86-AVX-NEXT: vmovd %eax, %xmm1 +; X86-AVX-NEXT: vpinsrd $1, %edx, %xmm1, %xmm1 +; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X86-AVX-NEXT: vpextrd $1, %xmm0, 4(%esi,%ecx,4) +; X86-AVX-NEXT: vmovd %xmm0, (%esi,%ecx,4) +; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: popl %edi +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi8_sext: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movzwl (%rdi,%rdx), %ecx +; X64-SSE-NEXT: movd %ecx, %xmm0 +; X64-SSE-NEXT: movzwl (%rsi,%rdx), %ecx +; X64-SSE-NEXT: movd %ecx, %xmm1 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-SSE-NEXT: psraw $8, %xmm0 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-SSE-NEXT: psraw $8, %xmm1 +; X64-SSE-NEXT: pmullw %xmm0, %xmm1 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-SSE-NEXT: psrad $16, %xmm0 +; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi8_sext: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: movsbl 1(%rdi,%rdx), %ecx +; X64-AVX-NEXT: movsbl (%rdi,%rdx), %edi +; X64-AVX-NEXT: vmovd %edi, %xmm0 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; X64-AVX-NEXT: movsbl 1(%rsi,%rdx), %ecx +; X64-AVX-NEXT: movsbl 
(%rsi,%rdx), %esi +; X64-AVX-NEXT: vmovd %esi, %xmm1 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1 +; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) +; X64-AVX-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <2 x i8>* + %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1 + %tmp8 = sext <2 x i8> %wide.load to <2 x i32> + %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index + %tmp11 = bitcast i8* %tmp10 to <2 x i8>* + %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1 + %tmp12 = sext <2 x i8> %wide.load17 to <2 x i32> + %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8 + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <2 x i32>* + store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 + ret void +} + +; %val1 = load <2 x i8> +; %op1 = sext<2 x i32> %val1 +; %val2 = load <2 x i8> +; %op2 = zext<2 x i32> %val2 +; %rst = mul <2 x i32> %op1, %op2 +; +define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind { +; X86-SSE-LABEL: mul_2xi8_sext_zext: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movzwl (%edx,%ecx), %edx +; X86-SSE-NEXT: movd %edx, %xmm0 +; X86-SSE-NEXT: movzwl (%eax,%ecx), %eax +; X86-SSE-NEXT: movd %eax, %xmm1 +; X86-SSE-NEXT: pxor %xmm2, %xmm2 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X86-SSE-NEXT: psraw $8, %xmm0 +; X86-SSE-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE-NEXT: pmulhw %xmm0, %xmm2 +; X86-SSE-NEXT: pmullw %xmm1, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE-NEXT: movd %xmm0, (%esi,%ecx,4) +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X86-SSE-NEXT: movd %xmm0, 4(%esi,%ecx,4) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: mul_2xi8_sext_zext: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %edi +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX-NEXT: movl c, %esi +; X86-AVX-NEXT: movsbl 1(%edx,%ecx), %edi +; X86-AVX-NEXT: movsbl (%edx,%ecx), %edx +; X86-AVX-NEXT: vmovd %edx, %xmm0 +; X86-AVX-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 +; X86-AVX-NEXT: movzbl 1(%eax,%ecx), %edx +; X86-AVX-NEXT: movzbl (%eax,%ecx), %eax +; X86-AVX-NEXT: vmovd %eax, %xmm1 +; X86-AVX-NEXT: vpinsrd $1, %edx, %xmm1, %xmm1 +; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X86-AVX-NEXT: vpextrd $1, %xmm0, 4(%esi,%ecx,4) +; X86-AVX-NEXT: vmovd %xmm0, (%esi,%ecx,4) +; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: popl %edi +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi8_sext_zext: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movzwl (%rdi,%rdx), %ecx +; X64-SSE-NEXT: movd %ecx, %xmm0 +; X64-SSE-NEXT: movzwl (%rsi,%rdx), %ecx +; X64-SSE-NEXT: movd %ecx, %xmm1 +; X64-SSE-NEXT: pxor %xmm2, %xmm2 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-SSE-NEXT: psraw $8, %xmm0 +; X64-SSE-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE-NEXT: pmulhw %xmm0, %xmm2 +; X64-SSE-NEXT: pmullw %xmm1, %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi8_sext_zext: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: movsbl 1(%rdi,%rdx), %ecx +; X64-AVX-NEXT: movsbl (%rdi,%rdx), %edi +; X64-AVX-NEXT: vmovd %edi, %xmm0 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; X64-AVX-NEXT: movzbl 1(%rsi,%rdx), %ecx +; X64-AVX-NEXT: movzbl (%rsi,%rdx), %esi +; X64-AVX-NEXT: vmovd %esi, %xmm1 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1 +; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) +; X64-AVX-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <2 x i8>* + %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1 + %tmp8 = sext <2 x i8> %wide.load to <2 x i32> + %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index + %tmp11 = bitcast i8* %tmp10 to <2 x i8>* + %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1 + %tmp12 = zext <2 x i8> %wide.load17 to <2 x i32> + %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8 + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <2 x i32>* + store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 + ret void +} + +; %val1 = load <2 x i16> +; %op1 = sext<2 x i32> %val1 +; %val2 = load <2 x i16> +; %op2 = sext<2 x i32> %val2 +; %rst = mul <2 x i32> %op1, %op2 +; +define void @mul_2xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind { +; X86-SSE-LABEL: mul_2xi16_sext: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE-NEXT: pmulhw %xmm0, %xmm2 +; X86-SSE-NEXT: pmullw %xmm0, %xmm1 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X86-SSE-NEXT: movd %xmm1, (%esi,%ecx,4) +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE-NEXT: movd %xmm0, 4(%esi,%ecx,4) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: mul_2xi16_sext: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %edi +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX-NEXT: movl c, %esi +; X86-AVX-NEXT: movswl 2(%edx,%ecx), %edi +; X86-AVX-NEXT: movswl (%edx,%ecx), %edx +; X86-AVX-NEXT: vmovd %edx, %xmm0 +; X86-AVX-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 +; X86-AVX-NEXT: movswl 2(%eax,%ecx), %edx +; X86-AVX-NEXT: movswl (%eax,%ecx), %eax +; X86-AVX-NEXT: vmovd %eax, %xmm1 +; X86-AVX-NEXT: vpinsrd $1, %edx, %xmm1, %xmm1 +; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X86-AVX-NEXT: vpextrd $1, %xmm0, 4(%esi,%ecx,4) +; X86-AVX-NEXT: vmovd %xmm0, 
(%esi,%ecx,4) +; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: popl %edi +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi16_sext: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-SSE-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE-NEXT: pmulhw %xmm0, %xmm2 +; X64-SSE-NEXT: pmullw %xmm0, %xmm1 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi16_sext: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: movswl 2(%rdi,%rdx), %ecx +; X64-AVX-NEXT: movswl (%rdi,%rdx), %edi +; X64-AVX-NEXT: vmovd %edi, %xmm0 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; X64-AVX-NEXT: movswl 2(%rsi,%rdx), %ecx +; X64-AVX-NEXT: movswl (%rsi,%rdx), %esi +; X64-AVX-NEXT: vmovd %esi, %xmm1 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1 +; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) +; X64-AVX-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <2 x i16>* + %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1 + %tmp8 = sext <2 x i16> %wide.load to <2 x i32> + %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index + %tmp11 = bitcast i8* %tmp10 to <2 x i16>* + %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1 + %tmp12 = sext <2 x i16> %wide.load17 to <2 x i32> + %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8 + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <2 x i32>* + store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 + ret void +} + +; %val1 = load <2 x i16> +; %op1 = sext<2 x i32> %val1 +; %val2 = load <2 x i16> +; %op2 = zext<2 x i32> %val2 +; %rst = mul <2 x i32> %op1, %op2 +; +define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind { +; X86-SSE-LABEL: mul_2xi16_sext_zext: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] +; X86-SSE-NEXT: psrad $16, %xmm0 +; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-SSE-NEXT: pxor %xmm2, %xmm2 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; X86-SSE-NEXT: pmuludq %xmm2, %xmm3 +; X86-SSE-NEXT: pmuludq %xmm0, %xmm1 +; X86-SSE-NEXT: movd %xmm1, (%esi,%ecx,4) +; X86-SSE-NEXT: movd %xmm3, 4(%esi,%ecx,4) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: mul_2xi16_sext_zext: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %edi +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX-NEXT: movl c, %esi +; X86-AVX-NEXT: movswl 2(%edx,%ecx), %edi +; X86-AVX-NEXT: movswl (%edx,%ecx), %edx +; X86-AVX-NEXT: vmovd %edx, %xmm0 +; X86-AVX-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 +; X86-AVX-NEXT: vmovd {{.*#+}} xmm1 = 
mem[0],zero,zero,zero +; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X86-AVX-NEXT: vpextrd $1, %xmm0, 4(%esi,%ecx,4) +; X86-AVX-NEXT: vmovd %xmm0, (%esi,%ecx,4) +; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: popl %edi +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi16_sext_zext: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] +; X64-SSE-NEXT: psrad $16, %xmm0 +; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-SSE-NEXT: pxor %xmm2, %xmm2 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; X64-SSE-NEXT: pmuludq %xmm0, %xmm1 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X64-SSE-NEXT: pmuludq %xmm2, %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi16_sext_zext: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: movswl 2(%rdi,%rdx), %ecx +; X64-AVX-NEXT: movswl (%rdi,%rdx), %edi +; X64-AVX-NEXT: vmovd %edi, %xmm0 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; X64-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) +; X64-AVX-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <2 x i16>* + %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1 + %tmp8 = sext <2 x i16> %wide.load to <2 x i32> + %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index + %tmp11 = bitcast i8* %tmp10 to <2 x i16>* + %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1 + %tmp12 = zext <2 x i16> %wide.load17 to <2 x i32> + %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8 + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <2 x i32>* + store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 + ret void +} + +; %val1 = load <16 x i16> +; %op1 = sext<16 x i32> %val1 +; %val2 = load <16 x i16> +; %op2 = sext<16 x i32> %val2 +; %rst = mul <16 x i32> %op1, %op2 +; +define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind { +; X86-SSE-LABEL: mul_16xi16_sext: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0 +; X86-SSE-NEXT: movdqu 16(%edx,%ecx), %xmm1 +; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm2 +; X86-SSE-NEXT: movdqu 16(%eax,%ecx), %xmm3 +; X86-SSE-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE-NEXT: pmulhw %xmm0, %xmm4 +; X86-SSE-NEXT: pmullw %xmm0, %xmm2 +; X86-SSE-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; X86-SSE-NEXT: movdqa %xmm3, %xmm4 +; X86-SSE-NEXT: pmulhw %xmm1, %xmm4 +; 
X86-SSE-NEXT: pmullw %xmm1, %xmm3 +; X86-SSE-NEXT: movdqa %xmm3, %xmm1 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; X86-SSE-NEXT: movdqu %xmm3, 48(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm1, 32(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm2, 16(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl +; +; X86-AVX1-LABEL: mul_16xi16_sext: +; X86-AVX1: # %bb.0: # %entry +; X86-AVX1-NEXT: pushl %esi +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX1-NEXT: movl c, %esi +; X86-AVX1-NEXT: vpmovsxwd 16(%edx,%ecx), %xmm0 +; X86-AVX1-NEXT: vpmovsxwd 24(%edx,%ecx), %xmm1 +; X86-AVX1-NEXT: vpmovsxwd (%edx,%ecx), %xmm2 +; X86-AVX1-NEXT: vpmovsxwd 8(%edx,%ecx), %xmm3 +; X86-AVX1-NEXT: vpmovsxwd 16(%eax,%ecx), %xmm4 +; X86-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0 +; X86-AVX1-NEXT: vpmovsxwd 24(%eax,%ecx), %xmm4 +; X86-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1 +; X86-AVX1-NEXT: vpmovsxwd (%eax,%ecx), %xmm4 +; X86-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 +; X86-AVX1-NEXT: vpmovsxwd 8(%eax,%ecx), %xmm4 +; X86-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X86-AVX1-NEXT: vmovups %ymm0, 32(%esi,%ecx,4) +; X86-AVX1-NEXT: vmovups %ymm2, (%esi,%ecx,4) +; X86-AVX1-NEXT: popl %esi +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: mul_16xi16_sext: +; X86-AVX2: # %bb.0: # %entry +; X86-AVX2-NEXT: pushl %esi +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX2-NEXT: movl c, %esi +; X86-AVX2-NEXT: vpmovsxwd 16(%edx,%ecx), %ymm0 +; X86-AVX2-NEXT: vpmovsxwd (%edx,%ecx), %ymm1 +; X86-AVX2-NEXT: vpmovsxwd 16(%eax,%ecx), %ymm2 +; X86-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0 +; X86-AVX2-NEXT: vpmovsxwd (%eax,%ecx), %ymm2 +; X86-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1 +; X86-AVX2-NEXT: vmovdqu %ymm0, 32(%esi,%ecx,4) +; X86-AVX2-NEXT: vmovdqu %ymm1, (%esi,%ecx,4) +; X86-AVX2-NEXT: popl %esi +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE-LABEL: mul_16xi16_sext: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0 +; X64-SSE-NEXT: movdqu 16(%rdi,%rdx), %xmm1 +; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm2 +; X64-SSE-NEXT: movdqu 16(%rsi,%rdx), %xmm3 +; X64-SSE-NEXT: movdqa %xmm2, %xmm4 +; X64-SSE-NEXT: pmulhw %xmm0, %xmm4 +; X64-SSE-NEXT: pmullw %xmm0, %xmm2 +; X64-SSE-NEXT: movdqa %xmm2, %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; X64-SSE-NEXT: movdqa %xmm3, %xmm4 +; X64-SSE-NEXT: pmulhw %xmm1, %xmm4 +; X64-SSE-NEXT: pmullw %xmm1, %xmm3 +; X64-SSE-NEXT: movdqa %xmm3, %xmm1 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; X64-SSE-NEXT: movdqu %xmm3, 48(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm1, 32(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm2, 16(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm0, 
(%rax,%rdx,4) +; X64-SSE-NEXT: retq +; +; X64-AVX1-LABEL: mul_16xi16_sext: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX1-NEXT: vpmovsxwd 16(%rdi,%rdx), %xmm0 +; X64-AVX1-NEXT: vpmovsxwd 24(%rdi,%rdx), %xmm1 +; X64-AVX1-NEXT: vpmovsxwd (%rdi,%rdx), %xmm2 +; X64-AVX1-NEXT: vpmovsxwd 8(%rdi,%rdx), %xmm3 +; X64-AVX1-NEXT: vpmovsxwd 16(%rsi,%rdx), %xmm4 +; X64-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0 +; X64-AVX1-NEXT: vpmovsxwd 24(%rsi,%rdx), %xmm4 +; X64-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1 +; X64-AVX1-NEXT: vpmovsxwd (%rsi,%rdx), %xmm4 +; X64-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 +; X64-AVX1-NEXT: vpmovsxwd 8(%rsi,%rdx), %xmm4 +; X64-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X64-AVX1-NEXT: vmovups %ymm0, 32(%rax,%rdx,4) +; X64-AVX1-NEXT: vmovups %ymm2, (%rax,%rdx,4) +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: mul_16xi16_sext: +; X64-AVX2: # %bb.0: # %entry +; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX2-NEXT: vpmovsxwd 16(%rdi,%rdx), %ymm0 +; X64-AVX2-NEXT: vpmovsxwd (%rdi,%rdx), %ymm1 +; X64-AVX2-NEXT: vpmovsxwd 16(%rsi,%rdx), %ymm2 +; X64-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0 +; X64-AVX2-NEXT: vpmovsxwd (%rsi,%rdx), %ymm2 +; X64-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1 +; X64-AVX2-NEXT: vmovdqu %ymm0, 32(%rax,%rdx,4) +; X64-AVX2-NEXT: vmovdqu %ymm1, (%rax,%rdx,4) +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <16 x i16>* + %wide.load = load <16 x i16>, <16 x i16>* %tmp7, align 1 + %tmp8 = sext <16 x i16> %wide.load to <16 x i32> + %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index + %tmp11 = bitcast i8* %tmp10 to <16 x i16>* + %wide.load17 = load <16 x i16>, <16 x i16>* %tmp11, align 1 + %tmp12 = sext <16 x i16> %wide.load17 to <16 x i32> + %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8 + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <16 x i32>* + store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4 + ret void +} + +; %val = load <2 x i8> +; %op1 = zext<2 x i32> %val +; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 255) +; %rst = mul <2 x i32> %op1, %op2 +; +define void @mul_2xi8_varconst1(i8* nocapture readonly %a, i64 %index) { +; X86-SSE-LABEL: mul_2xi8_varconst1: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl c, %edx +; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx +; X86-SSE-NEXT: movd %ecx, %xmm0 +; X86-SSE-NEXT: pxor %xmm1, %xmm1 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X86-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-SSE-NEXT: movd %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X86-SSE-NEXT: movd %xmm0, 4(%edx,%eax,4) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: mul_2xi8_varconst1: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: .cfi_offset %esi, -8 +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl c, %edx +; X86-AVX-NEXT: movzbl 
1(%ecx,%eax), %esi +; X86-AVX-NEXT: movzbl (%ecx,%eax), %ecx +; X86-AVX-NEXT: vmovd %ecx, %xmm0 +; X86-AVX-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vpextrd $1, %xmm0, 4(%edx,%eax,4) +; X86-AVX-NEXT: vmovd %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 4 +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi8_varconst1: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx +; X64-SSE-NEXT: movd %ecx, %xmm0 +; X64-SSE-NEXT: pxor %xmm1, %xmm1 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X64-SSE-NEXT: pmullw {{.*}}(%rip), %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi8_varconst1: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: movzbl 1(%rdi,%rsi), %ecx +; X64-AVX-NEXT: movzbl (%rdi,%rsi), %edx +; X64-AVX-NEXT: vmovd %edx, %xmm0 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) +; X64-AVX-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <2 x i8>* + %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1 + %tmp8 = zext <2 x i8> %wide.load to <2 x i32> + %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 255> + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <2 x i32>* + store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 + ret void +} + +; %val = load <2 x i8> +; %op1 = sext<2 x i32> %val +; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-128 ~ 127) +; %rst = mul <2 x i32> %op1, %op2 +; +define void @mul_2xi8_varconst2(i8* nocapture readonly %a, i64 %index) { +; X86-SSE-LABEL: mul_2xi8_varconst2: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl c, %edx +; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx +; X86-SSE-NEXT: movd %ecx, %xmm0 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X86-SSE-NEXT: psraw $8, %xmm0 +; X86-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X86-SSE-NEXT: psrad $16, %xmm0 +; X86-SSE-NEXT: movd %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X86-SSE-NEXT: movd %xmm0, 4(%edx,%eax,4) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: mul_2xi8_varconst2: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: .cfi_offset %esi, -8 +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl c, %edx +; X86-AVX-NEXT: movsbl 1(%ecx,%eax), %esi +; X86-AVX-NEXT: movsbl (%ecx,%eax), %ecx +; X86-AVX-NEXT: vmovd %ecx, %xmm0 +; X86-AVX-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vpextrd $1, %xmm0, 4(%edx,%eax,4) +; X86-AVX-NEXT: vmovd %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 4 +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi8_varconst2: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq 
{{.*}}(%rip), %rax +; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx +; X64-SSE-NEXT: movd %ecx, %xmm0 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-SSE-NEXT: psraw $8, %xmm0 +; X64-SSE-NEXT: pmullw {{.*}}(%rip), %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X64-SSE-NEXT: psrad $16, %xmm0 +; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi8_varconst2: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: movsbl 1(%rdi,%rsi), %ecx +; X64-AVX-NEXT: movsbl (%rdi,%rsi), %edx +; X64-AVX-NEXT: vmovd %edx, %xmm0 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) +; X64-AVX-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <2 x i8>* + %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1 + %tmp8 = sext <2 x i8> %wide.load to <2 x i32> + %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -128, i32 127> + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <2 x i32>* + store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 + ret void +} + +; %val = load <2 x i8> +; %op1 = zext<2 x i32> %val +; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 256) +; %rst = mul <2 x i32> %op1, %op2 +; +define void @mul_2xi8_varconst3(i8* nocapture readonly %a, i64 %index) { +; X86-SSE-LABEL: mul_2xi8_varconst3: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl c, %edx +; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx +; X86-SSE-NEXT: movd %ecx, %xmm0 +; X86-SSE-NEXT: pxor %xmm1, %xmm1 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,256,u,u,u,u,u,u> +; X86-SSE-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE-NEXT: pmulhw %xmm1, %xmm2 +; X86-SSE-NEXT: pmullw %xmm1, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE-NEXT: movd %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X86-SSE-NEXT: movd %xmm0, 4(%edx,%eax,4) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: mul_2xi8_varconst3: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: .cfi_offset %esi, -8 +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl c, %edx +; X86-AVX-NEXT: movzbl 1(%ecx,%eax), %esi +; X86-AVX-NEXT: movzbl (%ecx,%eax), %ecx +; X86-AVX-NEXT: vmovd %ecx, %xmm0 +; X86-AVX-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vpextrd $1, %xmm0, 4(%edx,%eax,4) +; X86-AVX-NEXT: vmovd %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 4 +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi8_varconst3: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx +; X64-SSE-NEXT: movd %ecx, %xmm0 +; X64-SSE-NEXT: pxor %xmm1, %xmm1 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = 
<0,256,u,u,u,u,u,u> +; X64-SSE-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE-NEXT: pmulhw %xmm1, %xmm2 +; X64-SSE-NEXT: pmullw %xmm1, %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi8_varconst3: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: movzbl 1(%rdi,%rsi), %ecx +; X64-AVX-NEXT: movzbl (%rdi,%rsi), %edx +; X64-AVX-NEXT: vmovd %edx, %xmm0 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) +; X64-AVX-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <2 x i8>* + %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1 + %tmp8 = zext <2 x i8> %wide.load to <2 x i32> + %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 256> + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <2 x i32>* + store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 + ret void +} + +; %val = load <2 x i8> +; %op1 = zext<2 x i32> %val +; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-1 ~ 255) +; %rst = mul <2 x i32> %op1, %op2 +; +define void @mul_2xi8_varconst4(i8* nocapture readonly %a, i64 %index) { +; X86-SSE-LABEL: mul_2xi8_varconst4: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl c, %edx +; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx +; X86-SSE-NEXT: movd %ecx, %xmm0 +; X86-SSE-NEXT: pxor %xmm1, %xmm1 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u> +; X86-SSE-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE-NEXT: pmulhw %xmm1, %xmm2 +; X86-SSE-NEXT: pmullw %xmm1, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE-NEXT: movd %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X86-SSE-NEXT: movd %xmm0, 4(%edx,%eax,4) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: mul_2xi8_varconst4: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: .cfi_offset %esi, -8 +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl c, %edx +; X86-AVX-NEXT: movzbl 1(%ecx,%eax), %esi +; X86-AVX-NEXT: movzbl (%ecx,%eax), %ecx +; X86-AVX-NEXT: vmovd %ecx, %xmm0 +; X86-AVX-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vpextrd $1, %xmm0, 4(%edx,%eax,4) +; X86-AVX-NEXT: vmovd %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 4 +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi8_varconst4: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx +; X64-SSE-NEXT: movd %ecx, %xmm0 +; X64-SSE-NEXT: pxor %xmm1, %xmm1 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u> +; X64-SSE-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE-NEXT: pmulhw %xmm1, %xmm2 +; X64-SSE-NEXT: 
pmullw %xmm1, %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi8_varconst4: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: movzbl 1(%rdi,%rsi), %ecx +; X64-AVX-NEXT: movzbl (%rdi,%rsi), %edx +; X64-AVX-NEXT: vmovd %edx, %xmm0 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) +; X64-AVX-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <2 x i8>* + %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1 + %tmp8 = zext <2 x i8> %wide.load to <2 x i32> + %tmp13 = mul nuw nsw <2 x i32> %tmp8, + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <2 x i32>* + store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 + ret void +} + +; %val = load <2 x i8> +; %op1 = sext<2 x i32> %val +; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-129 ~ 127) +; %rst = mul <2 x i32> %op1, %op2 +; +define void @mul_2xi8_varconst5(i8* nocapture readonly %a, i64 %index) { +; X86-SSE-LABEL: mul_2xi8_varconst5: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl c, %edx +; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx +; X86-SSE-NEXT: movd %ecx, %xmm0 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X86-SSE-NEXT: psraw $8, %xmm0 +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u> +; X86-SSE-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE-NEXT: pmulhw %xmm1, %xmm2 +; X86-SSE-NEXT: pmullw %xmm1, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE-NEXT: movd %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X86-SSE-NEXT: movd %xmm0, 4(%edx,%eax,4) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: mul_2xi8_varconst5: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: .cfi_offset %esi, -8 +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl c, %edx +; X86-AVX-NEXT: movsbl 1(%ecx,%eax), %esi +; X86-AVX-NEXT: movsbl (%ecx,%eax), %ecx +; X86-AVX-NEXT: vmovd %ecx, %xmm0 +; X86-AVX-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vpextrd $1, %xmm0, 4(%edx,%eax,4) +; X86-AVX-NEXT: vmovd %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 4 +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi8_varconst5: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx +; X64-SSE-NEXT: movd %ecx, %xmm0 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-SSE-NEXT: psraw $8, %xmm0 +; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u> +; X64-SSE-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE-NEXT: pmulhw %xmm1, %xmm2 +; X64-SSE-NEXT: pmullw %xmm1, %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi8_varconst5: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq 
{{.*}}(%rip), %rax +; X64-AVX-NEXT: movsbl 1(%rdi,%rsi), %ecx +; X64-AVX-NEXT: movsbl (%rdi,%rsi), %edx +; X64-AVX-NEXT: vmovd %edx, %xmm0 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) +; X64-AVX-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <2 x i8>* + %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1 + %tmp8 = sext <2 x i8> %wide.load to <2 x i32> + %tmp13 = mul nuw nsw <2 x i32> %tmp8, + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <2 x i32>* + store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 + ret void +} + +; %val = load <2 x i8> +; %op1 = sext<2 x i32> %val +; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-128 ~ 128) +; %rst = mul <2 x i32> %op1, %op2 +; +define void @mul_2xi8_varconst6(i8* nocapture readonly %a, i64 %index) { +; X86-SSE-LABEL: mul_2xi8_varconst6: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl c, %edx +; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx +; X86-SSE-NEXT: movd %ecx, %xmm0 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X86-SSE-NEXT: psraw $8, %xmm0 +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u> +; X86-SSE-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE-NEXT: pmulhw %xmm1, %xmm2 +; X86-SSE-NEXT: pmullw %xmm1, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE-NEXT: movd %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X86-SSE-NEXT: movd %xmm0, 4(%edx,%eax,4) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: mul_2xi8_varconst6: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: .cfi_offset %esi, -8 +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl c, %edx +; X86-AVX-NEXT: movsbl 1(%ecx,%eax), %esi +; X86-AVX-NEXT: movsbl (%ecx,%eax), %ecx +; X86-AVX-NEXT: vmovd %ecx, %xmm0 +; X86-AVX-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vpextrd $1, %xmm0, 4(%edx,%eax,4) +; X86-AVX-NEXT: vmovd %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 4 +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi8_varconst6: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx +; X64-SSE-NEXT: movd %ecx, %xmm0 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-SSE-NEXT: psraw $8, %xmm0 +; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u> +; X64-SSE-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE-NEXT: pmulhw %xmm1, %xmm2 +; X64-SSE-NEXT: pmullw %xmm1, %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi8_varconst6: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: movsbl 1(%rdi,%rsi), %ecx +; X64-AVX-NEXT: movsbl (%rdi,%rsi), %edx +; X64-AVX-NEXT: vmovd %edx, %xmm0 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) +; 
X64-AVX-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <2 x i8>* + %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1 + %tmp8 = sext <2 x i8> %wide.load to <2 x i32> + %tmp13 = mul nuw nsw <2 x i32> %tmp8, + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <2 x i32>* + store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 + ret void +} + +; %val = load <2 x i16> +; %op1 = zext<2 x i32> %val +; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 65535) +; %rst = mul <2 x i32> %op1, %op2 +; +define void @mul_2xi16_varconst1(i8* nocapture readonly %a, i64 %index) { +; X86-SSE-LABEL: mul_2xi16_varconst1: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl c, %edx +; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u> +; X86-SSE-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE-NEXT: pmulhuw %xmm1, %xmm2 +; X86-SSE-NEXT: pmullw %xmm1, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE-NEXT: movd %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X86-SSE-NEXT: movd %xmm0, 4(%edx,%eax,4) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: mul_2xi16_varconst1: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl c, %edx +; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vpextrd $1, %xmm0, 4(%edx,%eax,4) +; X86-AVX-NEXT: vmovd %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi16_varconst1: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u> +; X64-SSE-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE-NEXT: pmulhuw %xmm1, %xmm2 +; X64-SSE-NEXT: pmullw %xmm1, %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi16_varconst1: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) +; X64-AVX-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <2 x i16>* + %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1 + %tmp8 = zext <2 x i16> %wide.load to <2 x i32> + %tmp13 = mul nuw nsw <2 x i32> %tmp8, + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <2 x i32>* + store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 + ret void +} + +; %val = load <2 x i16> +; %op1 = sext<2 x i32> %val +; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-32768 ~ 32767) +; %rst = mul <2 x i32> %op1, %op2 +; +define void @mul_2xi16_varconst2(i8* nocapture readonly %a, i64 %index) { +; X86-SSE-LABEL: 
mul_2xi16_varconst2: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl c, %edx +; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u> +; X86-SSE-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE-NEXT: pmulhw %xmm1, %xmm2 +; X86-SSE-NEXT: pmullw %xmm1, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE-NEXT: movd %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X86-SSE-NEXT: movd %xmm0, 4(%edx,%eax,4) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: mul_2xi16_varconst2: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: .cfi_offset %esi, -8 +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl c, %edx +; X86-AVX-NEXT: movswl 2(%ecx,%eax), %esi +; X86-AVX-NEXT: movswl (%ecx,%eax), %ecx +; X86-AVX-NEXT: vmovd %ecx, %xmm0 +; X86-AVX-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vpextrd $1, %xmm0, 4(%edx,%eax,4) +; X86-AVX-NEXT: vmovd %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 4 +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi16_varconst2: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u> +; X64-SSE-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE-NEXT: pmulhw %xmm1, %xmm2 +; X64-SSE-NEXT: pmullw %xmm1, %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi16_varconst2: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: movswl 2(%rdi,%rsi), %ecx +; X64-AVX-NEXT: movswl (%rdi,%rsi), %edx +; X64-AVX-NEXT: vmovd %edx, %xmm0 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) +; X64-AVX-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <2 x i16>* + %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1 + %tmp8 = sext <2 x i16> %wide.load to <2 x i32> + %tmp13 = mul nuw nsw <2 x i32> %tmp8, + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <2 x i32>* + store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 + ret void +} + +; %val = load <2 x i16> +; %op1 = zext<2 x i32> %val +; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 65536) +; %rst = mul <2 x i32> %op1, %op2 +; +define void @mul_2xi16_varconst3(i8* nocapture readonly %a, i64 %index) { +; X86-SSE-LABEL: mul_2xi16_varconst3: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl c, %edx +; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: pxor %xmm1, %xmm1 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = <0,65536,u,u> +; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; 
X86-SSE-NEXT: pmuludq %xmm1, %xmm3 +; X86-SSE-NEXT: pmuludq %xmm2, %xmm0 +; X86-SSE-NEXT: movd %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: movd %xmm3, 4(%edx,%eax,4) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: mul_2xi16_varconst3: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl c, %edx +; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vpextrd $1, %xmm0, 4(%edx,%eax,4) +; X86-AVX-NEXT: vmovd %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi16_varconst3: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-SSE-NEXT: pxor %xmm1, %xmm1 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,65536,u,u> +; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; X64-SSE-NEXT: pmuludq %xmm1, %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X64-SSE-NEXT: pmuludq %xmm2, %xmm1 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi16_varconst3: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) +; X64-AVX-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <2 x i16>* + %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1 + %tmp8 = zext <2 x i16> %wide.load to <2 x i32> + %tmp13 = mul nuw nsw <2 x i32> %tmp8, + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <2 x i32>* + store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 + ret void +} + +; %val = load <2 x i16> +; %op1 = sext<2 x i32> %val +; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 32768) +; %rst = mul <2 x i32> %op1, %op2 +; +define void @mul_2xi16_varconst4(i8* nocapture readonly %a, i64 %index) { +; X86-SSE-LABEL: mul_2xi16_varconst4: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl c, %edx +; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] +; X86-SSE-NEXT: psrad $16, %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = <0,32768,u,u> +; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; X86-SSE-NEXT: pmuludq %xmm1, %xmm3 +; X86-SSE-NEXT: pmuludq %xmm2, %xmm0 +; X86-SSE-NEXT: movd %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: movd %xmm3, 4(%edx,%eax,4) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: mul_2xi16_varconst4: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: .cfi_offset %esi, -8 +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl c, %edx +; X86-AVX-NEXT: movswl 2(%ecx,%eax), 
%esi +; X86-AVX-NEXT: movswl (%ecx,%eax), %ecx +; X86-AVX-NEXT: vmovd %ecx, %xmm0 +; X86-AVX-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vpextrd $1, %xmm0, 4(%edx,%eax,4) +; X86-AVX-NEXT: vmovd %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 4 +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi16_varconst4: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] +; X64-SSE-NEXT: psrad $16, %xmm0 +; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,32768,u,u> +; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; X64-SSE-NEXT: pmuludq %xmm1, %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X64-SSE-NEXT: pmuludq %xmm2, %xmm1 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi16_varconst4: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: movswl 2(%rdi,%rsi), %ecx +; X64-AVX-NEXT: movswl (%rdi,%rsi), %edx +; X64-AVX-NEXT: vmovd %edx, %xmm0 +; X64-AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) +; X64-AVX-NEXT: retq +entry: + %pre = load i32*, i32** @c + %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index + %tmp7 = bitcast i8* %tmp6 to <2 x i16>* + %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1 + %tmp8 = sext <2 x i16> %wide.load to <2 x i32> + %tmp13 = mul nuw nsw <2 x i32> %tmp8, + %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index + %tmp15 = bitcast i32* %tmp14 to <2 x i32>* + store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 + ret void +} + +; +; Illegal Types +; + +define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind { +; X86-SSE-LABEL: PR34947: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movdqa (%eax), %xmm5 +; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movdqa (%ecx), %xmm3 +; X86-SSE-NEXT: movdqa 16(%ecx), %xmm6 +; X86-SSE-NEXT: pxor %xmm0, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X86-SSE-NEXT: movdqa %xmm5, %xmm4 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,1,2,3] +; X86-SSE-NEXT: movd %xmm0, %eax +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,1,2,3] +; X86-SSE-NEXT: movd %xmm0, %esi +; X86-SSE-NEXT: xorl %edx, %edx +; X86-SSE-NEXT: divl %esi +; X86-SSE-NEXT: movd %edx, %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,0,1] +; X86-SSE-NEXT: movd %xmm2, %eax +; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[2,3,0,1] +; X86-SSE-NEXT: movd %xmm2, %esi +; X86-SSE-NEXT: xorl %edx, %edx +; X86-SSE-NEXT: divl %esi +; X86-SSE-NEXT: movd %edx, %xmm7 +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; X86-SSE-NEXT: movd %xmm5, %eax +; X86-SSE-NEXT: movd %xmm6, %esi +; X86-SSE-NEXT: xorl %edx, %edx +; X86-SSE-NEXT: divl %esi +; X86-SSE-NEXT: movd %edx, %xmm2 +; X86-SSE-NEXT: pshufd 
{{.*#+}} xmm5 = xmm5[1,1,2,3] +; X86-SSE-NEXT: movd %xmm5, %eax +; X86-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,2,3] +; X86-SSE-NEXT: movd %xmm5, %esi +; X86-SSE-NEXT: xorl %edx, %edx +; X86-SSE-NEXT: divl %esi +; X86-SSE-NEXT: movd %edx, %xmm5 +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm7[0] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[3,1,2,3] +; X86-SSE-NEXT: movd %xmm6, %eax +; X86-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[3,1,2,3] +; X86-SSE-NEXT: movd %xmm6, %esi +; X86-SSE-NEXT: xorl %edx, %edx +; X86-SSE-NEXT: divl %esi +; X86-SSE-NEXT: movd %edx, %xmm6 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,3,0,1] +; X86-SSE-NEXT: movd %xmm7, %eax +; X86-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,3,0,1] +; X86-SSE-NEXT: movd %xmm7, %esi +; X86-SSE-NEXT: xorl %edx, %edx +; X86-SSE-NEXT: divl %esi +; X86-SSE-NEXT: movd %edx, %xmm7 +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; X86-SSE-NEXT: movd %xmm4, %eax +; X86-SSE-NEXT: movd %xmm3, %esi +; X86-SSE-NEXT: xorl %edx, %edx +; X86-SSE-NEXT: divl %esi +; X86-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,2,3] +; X86-SSE-NEXT: movd %xmm4, %eax +; X86-SSE-NEXT: movd %edx, %xmm4 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] +; X86-SSE-NEXT: movd %xmm3, %esi +; X86-SSE-NEXT: xorl %edx, %edx +; X86-SSE-NEXT: divl %esi +; X86-SSE-NEXT: movd %edx, %xmm3 +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm7[0] +; X86-SSE-NEXT: movd %xmm1, %eax +; X86-SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm6[0,0] +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [8199,8199,8199,8199] +; X86-SSE-NEXT: pmuludq %xmm1, %xmm4 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; X86-SSE-NEXT: pmuludq %xmm1, %xmm3 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; X86-SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,0],xmm0[0,0] +; X86-SSE-NEXT: xorl %edx, %edx +; X86-SSE-NEXT: divl 32(%ecx) +; X86-SSE-NEXT: pmuludq %xmm1, %xmm2 +; X86-SSE-NEXT: pmuludq %xmm1, %xmm5 +; X86-SSE-NEXT: movd %edx, %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,2,2,3] +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X86-SSE-NEXT: movl $8199, %eax # imm = 0x2007 +; X86-SSE-NEXT: movd %eax, %xmm2 +; X86-SSE-NEXT: pmuludq %xmm0, %xmm2 +; X86-SSE-NEXT: movd %xmm2, (%eax) +; X86-SSE-NEXT: movdqa %xmm1, (%eax) +; X86-SSE-NEXT: movdqa %xmm4, (%eax) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl +; +; X86-AVX1-LABEL: PR34947: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: pushl %ebp +; X86-AVX1-NEXT: pushl %ebx +; X86-AVX1-NEXT: pushl %edi +; X86-AVX1-NEXT: pushl %esi +; X86-AVX1-NEXT: subl $16, %esp +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vmovd %xmm1, %eax +; X86-AVX1-NEXT: xorl %edx, %edx +; X86-AVX1-NEXT: divl 32(%ecx) +; X86-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-AVX1-NEXT: vpextrd $3, %xmm2, %eax +; X86-AVX1-NEXT: vmovdqa (%ecx), %xmm3 +; X86-AVX1-NEXT: vmovdqa 16(%ecx), %xmm1 +; 
X86-AVX1-NEXT: vpextrd $3, %xmm3, %ecx +; X86-AVX1-NEXT: xorl %edx, %edx +; X86-AVX1-NEXT: divl %ecx +; X86-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-AVX1-NEXT: vpextrd $2, %xmm2, %eax +; X86-AVX1-NEXT: vpextrd $2, %xmm3, %ecx +; X86-AVX1-NEXT: xorl %edx, %edx +; X86-AVX1-NEXT: divl %ecx +; X86-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-AVX1-NEXT: vpextrd $1, %xmm2, %eax +; X86-AVX1-NEXT: vpextrd $1, %xmm3, %ecx +; X86-AVX1-NEXT: xorl %edx, %edx +; X86-AVX1-NEXT: divl %ecx +; X86-AVX1-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-AVX1-NEXT: vmovd %xmm2, %eax +; X86-AVX1-NEXT: vmovd %xmm3, %ecx +; X86-AVX1-NEXT: xorl %edx, %edx +; X86-AVX1-NEXT: divl %ecx +; X86-AVX1-NEXT: movl %edx, %ebp +; X86-AVX1-NEXT: vpextrd $3, %xmm0, %eax +; X86-AVX1-NEXT: xorl %edx, %edx +; X86-AVX1-NEXT: vpextrd $3, %xmm1, %ecx +; X86-AVX1-NEXT: divl %ecx +; X86-AVX1-NEXT: movl %edx, %ebx +; X86-AVX1-NEXT: xorl %edx, %edx +; X86-AVX1-NEXT: vpextrd $2, %xmm0, %eax +; X86-AVX1-NEXT: vpextrd $2, %xmm1, %esi +; X86-AVX1-NEXT: divl %esi +; X86-AVX1-NEXT: movl %edx, %esi +; X86-AVX1-NEXT: xorl %edx, %edx +; X86-AVX1-NEXT: vpextrd $1, %xmm0, %eax +; X86-AVX1-NEXT: vpextrd $1, %xmm1, %edi +; X86-AVX1-NEXT: divl %edi +; X86-AVX1-NEXT: movl %edx, %edi +; X86-AVX1-NEXT: xorl %edx, %edx +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: vmovd %xmm1, %ecx +; X86-AVX1-NEXT: divl %ecx +; X86-AVX1-NEXT: vmovd %edx, %xmm0 +; X86-AVX1-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpinsrd $2, %esi, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpinsrd $3, %ebx, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %ebp, %xmm1 +; X86-AVX1-NEXT: vpinsrd $1, (%esp), %xmm1, %xmm1 # 4-byte Folded Reload +; X86-AVX1-NEXT: vpinsrd $2, {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; X86-AVX1-NEXT: vpinsrd $3, {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; X86-AVX1-NEXT: vmovd {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 4-byte Folded Reload +; X86-AVX1-NEXT: # xmm2 = mem[0],zero,zero,zero +; X86-AVX1-NEXT: movl $8199, %eax # imm = 0x2007 +; X86-AVX1-NEXT: vmovd %eax, %xmm3 +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [8199,8199,8199,8199] +; X86-AVX1-NEXT: vpmulld %xmm4, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpmulld %xmm4, %xmm1, %xmm1 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X86-AVX1-NEXT: vpmulld %xmm3, %xmm2, %xmm1 +; X86-AVX1-NEXT: vmovd %xmm1, (%eax) +; X86-AVX1-NEXT: vmovaps %ymm0, (%eax) +; X86-AVX1-NEXT: addl $16, %esp +; X86-AVX1-NEXT: popl %esi +; X86-AVX1-NEXT: popl %edi +; X86-AVX1-NEXT: popl %ebx +; X86-AVX1-NEXT: popl %ebp +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: PR34947: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: pushl %edi +; X86-AVX2-NEXT: pushl %esi +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X86-AVX2-NEXT: vmovdqa (%esi), %xmm2 +; X86-AVX2-NEXT: vmovdqa 16(%esi), %xmm3 +; X86-AVX2-NEXT: vpextrd $1, %xmm3, %ecx +; X86-AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 +; X86-AVX2-NEXT: vpextrd $1, %xmm4, %eax +; X86-AVX2-NEXT: xorl %edx, %edx +; X86-AVX2-NEXT: divl %ecx +; X86-AVX2-NEXT: movl %edx, %ecx +; X86-AVX2-NEXT: vmovd %xmm3, %edi +; X86-AVX2-NEXT: vmovd %xmm4, %eax +; X86-AVX2-NEXT: xorl %edx, %edx +; 
X86-AVX2-NEXT: divl %edi +; X86-AVX2-NEXT: vmovd %edx, %xmm5 +; X86-AVX2-NEXT: vpinsrd $1, %ecx, %xmm5, %xmm5 +; X86-AVX2-NEXT: vpextrd $2, %xmm3, %ecx +; X86-AVX2-NEXT: vpextrd $2, %xmm4, %eax +; X86-AVX2-NEXT: xorl %edx, %edx +; X86-AVX2-NEXT: divl %ecx +; X86-AVX2-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5 +; X86-AVX2-NEXT: vpextrd $3, %xmm3, %ecx +; X86-AVX2-NEXT: vpextrd $3, %xmm4, %eax +; X86-AVX2-NEXT: xorl %edx, %edx +; X86-AVX2-NEXT: divl %ecx +; X86-AVX2-NEXT: vpinsrd $3, %edx, %xmm5, %xmm3 +; X86-AVX2-NEXT: vpextrd $1, %xmm2, %ecx +; X86-AVX2-NEXT: vpextrd $1, %xmm1, %eax +; X86-AVX2-NEXT: xorl %edx, %edx +; X86-AVX2-NEXT: divl %ecx +; X86-AVX2-NEXT: movl %edx, %ecx +; X86-AVX2-NEXT: vmovd %xmm2, %edi +; X86-AVX2-NEXT: vmovd %xmm1, %eax +; X86-AVX2-NEXT: xorl %edx, %edx +; X86-AVX2-NEXT: divl %edi +; X86-AVX2-NEXT: vmovd %edx, %xmm4 +; X86-AVX2-NEXT: vpinsrd $1, %ecx, %xmm4, %xmm4 +; X86-AVX2-NEXT: vpextrd $2, %xmm2, %ecx +; X86-AVX2-NEXT: vpextrd $2, %xmm1, %eax +; X86-AVX2-NEXT: xorl %edx, %edx +; X86-AVX2-NEXT: divl %ecx +; X86-AVX2-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4 +; X86-AVX2-NEXT: vpextrd $3, %xmm2, %ecx +; X86-AVX2-NEXT: vpextrd $3, %xmm1, %eax +; X86-AVX2-NEXT: xorl %edx, %edx +; X86-AVX2-NEXT: divl %ecx +; X86-AVX2-NEXT: vpinsrd $3, %edx, %xmm4, %xmm1 +; X86-AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: xorl %edx, %edx +; X86-AVX2-NEXT: divl 32(%esi) +; X86-AVX2-NEXT: vmovd %edx, %xmm0 +; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [8199,8199,8199,8199,8199,8199,8199,8199] +; X86-AVX2-NEXT: vpmulld %ymm2, %ymm1, %ymm1 +; X86-AVX2-NEXT: movl $8199, %eax # imm = 0x2007 +; X86-AVX2-NEXT: vmovd %eax, %xmm2 +; X86-AVX2-NEXT: vpmulld %xmm2, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, (%eax) +; X86-AVX2-NEXT: vmovdqa %ymm1, (%eax) +; X86-AVX2-NEXT: popl %esi +; X86-AVX2-NEXT: popl %edi +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE-LABEL: PR34947: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movdqa (%rdi), %xmm5 +; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-SSE-NEXT: movdqa (%rsi), %xmm2 +; X64-SSE-NEXT: movdqa 16(%rsi), %xmm6 +; X64-SSE-NEXT: pxor %xmm0, %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X64-SSE-NEXT: movdqa %xmm5, %xmm4 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,1,2,3] +; X64-SSE-NEXT: movd %xmm0, %eax +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,1,2,3] +; X64-SSE-NEXT: movd %xmm0, %ecx +; X64-SSE-NEXT: xorl %edx, %edx +; X64-SSE-NEXT: divl %ecx +; X64-SSE-NEXT: movd %edx, %xmm8 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,0,1] +; X64-SSE-NEXT: movd %xmm3, %eax +; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[2,3,0,1] +; X64-SSE-NEXT: movd %xmm3, %ecx +; X64-SSE-NEXT: xorl %edx, %edx +; X64-SSE-NEXT: divl %ecx +; X64-SSE-NEXT: movd %edx, %xmm7 +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; X64-SSE-NEXT: movd %xmm5, %eax +; X64-SSE-NEXT: movd %xmm6, %ecx +; X64-SSE-NEXT: xorl %edx, %edx +; X64-SSE-NEXT: divl %ecx +; X64-SSE-NEXT: movd %edx, %xmm3 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,2,3] +; X64-SSE-NEXT: movd %xmm5, %eax +; X64-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,2,3] +; X64-SSE-NEXT: movd %xmm5, %ecx +; X64-SSE-NEXT: xorl %edx, %edx +; 
X64-SSE-NEXT: divl %ecx +; X64-SSE-NEXT: movd %edx, %xmm5 +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0] +; X64-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[3,1,2,3] +; X64-SSE-NEXT: movd %xmm6, %eax +; X64-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[3,1,2,3] +; X64-SSE-NEXT: movd %xmm6, %ecx +; X64-SSE-NEXT: xorl %edx, %edx +; X64-SSE-NEXT: divl %ecx +; X64-SSE-NEXT: movd %edx, %xmm6 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,3,0,1] +; X64-SSE-NEXT: movd %xmm7, %eax +; X64-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,3,0,1] +; X64-SSE-NEXT: movd %xmm7, %ecx +; X64-SSE-NEXT: xorl %edx, %edx +; X64-SSE-NEXT: divl %ecx +; X64-SSE-NEXT: movd %edx, %xmm7 +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; X64-SSE-NEXT: movd %xmm4, %eax +; X64-SSE-NEXT: movd %xmm2, %ecx +; X64-SSE-NEXT: xorl %edx, %edx +; X64-SSE-NEXT: divl %ecx +; X64-SSE-NEXT: movd %edx, %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,2,3] +; X64-SSE-NEXT: movd %xmm4, %eax +; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] +; X64-SSE-NEXT: movd %xmm2, %ecx +; X64-SSE-NEXT: xorl %edx, %edx +; X64-SSE-NEXT: divl %ecx +; X64-SSE-NEXT: movd %edx, %xmm2 +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm7[0] +; X64-SSE-NEXT: movd %xmm1, %eax +; X64-SSE-NEXT: xorl %edx, %edx +; X64-SSE-NEXT: divl 32(%rsi) +; X64-SSE-NEXT: movd %edx, %xmm1 +; X64-SSE-NEXT: movdqa {{.*#+}} xmm4 = [8199,8199,8199,8199] +; X64-SSE-NEXT: pmuludq %xmm4, %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm6[0,0] +; X64-SSE-NEXT: pmuludq %xmm4, %xmm2 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X64-SSE-NEXT: pmuludq %xmm4, %xmm3 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; X64-SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,0],xmm8[0,0] +; X64-SSE-NEXT: pmuludq %xmm4, %xmm5 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3] +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X64-SSE-NEXT: movl $8199, %eax # imm = 0x2007 +; X64-SSE-NEXT: movd %eax, %xmm3 +; X64-SSE-NEXT: pmuludq %xmm1, %xmm3 +; X64-SSE-NEXT: movd %xmm3, (%rax) +; X64-SSE-NEXT: movdqa %xmm2, (%rax) +; X64-SSE-NEXT: movdqa %xmm0, (%rax) +; X64-SSE-NEXT: retq +; +; X64-AVX1-LABEL: PR34947: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: pushq %rbp +; X64-AVX1-NEXT: pushq %rbx +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vmovd %xmm1, %eax +; X64-AVX1-NEXT: xorl %edx, %edx +; X64-AVX1-NEXT: divl 32(%rsi) +; X64-AVX1-NEXT: movl %edx, %r8d +; X64-AVX1-NEXT: vpextrd $3, %xmm2, %eax +; X64-AVX1-NEXT: vmovdqa (%rsi), %xmm3 +; X64-AVX1-NEXT: vmovdqa 16(%rsi), %xmm1 +; X64-AVX1-NEXT: vpextrd $3, %xmm3, %ecx +; X64-AVX1-NEXT: xorl %edx, %edx +; X64-AVX1-NEXT: divl %ecx +; X64-AVX1-NEXT: movl %edx, %r9d +; X64-AVX1-NEXT: vpextrd $2, %xmm2, %eax +; X64-AVX1-NEXT: vpextrd $2, %xmm3, %ecx +; X64-AVX1-NEXT: xorl %edx, %edx +; X64-AVX1-NEXT: divl %ecx +; X64-AVX1-NEXT: movl %edx, %r10d +; X64-AVX1-NEXT: vpextrd $1, %xmm2, %eax +; X64-AVX1-NEXT: vpextrd $1, %xmm3, %ecx +; X64-AVX1-NEXT: xorl %edx, %edx +; 
X64-AVX1-NEXT: divl %ecx +; X64-AVX1-NEXT: movl %edx, %r11d +; X64-AVX1-NEXT: vmovd %xmm2, %eax +; X64-AVX1-NEXT: vmovd %xmm3, %ecx +; X64-AVX1-NEXT: xorl %edx, %edx +; X64-AVX1-NEXT: divl %ecx +; X64-AVX1-NEXT: movl %edx, %esi +; X64-AVX1-NEXT: vpextrd $3, %xmm0, %eax +; X64-AVX1-NEXT: vpextrd $3, %xmm1, %ecx +; X64-AVX1-NEXT: xorl %edx, %edx +; X64-AVX1-NEXT: divl %ecx +; X64-AVX1-NEXT: movl %edx, %edi +; X64-AVX1-NEXT: vpextrd $2, %xmm0, %eax +; X64-AVX1-NEXT: vpextrd $2, %xmm1, %ecx +; X64-AVX1-NEXT: xorl %edx, %edx +; X64-AVX1-NEXT: divl %ecx +; X64-AVX1-NEXT: movl %edx, %ecx +; X64-AVX1-NEXT: vpextrd $1, %xmm0, %eax +; X64-AVX1-NEXT: vpextrd $1, %xmm1, %ebx +; X64-AVX1-NEXT: xorl %edx, %edx +; X64-AVX1-NEXT: divl %ebx +; X64-AVX1-NEXT: movl %edx, %ebx +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: vmovd %xmm1, %ebp +; X64-AVX1-NEXT: xorl %edx, %edx +; X64-AVX1-NEXT: divl %ebp +; X64-AVX1-NEXT: vmovd %edx, %xmm0 +; X64-AVX1-NEXT: vpinsrd $1, %ebx, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8199,8199,8199,8199] +; X64-AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %esi, %xmm2 +; X64-AVX1-NEXT: vpinsrd $1, %r11d, %xmm2, %xmm2 +; X64-AVX1-NEXT: vpinsrd $2, %r10d, %xmm2, %xmm2 +; X64-AVX1-NEXT: vpinsrd $3, %r9d, %xmm2, %xmm2 +; X64-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X64-AVX1-NEXT: vmovd %r8d, %xmm1 +; X64-AVX1-NEXT: movl $8199, %eax # imm = 0x2007 +; X64-AVX1-NEXT: vmovd %eax, %xmm2 +; X64-AVX1-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; X64-AVX1-NEXT: vmovd %xmm1, (%rax) +; X64-AVX1-NEXT: vmovaps %ymm0, (%rax) +; X64-AVX1-NEXT: popq %rbx +; X64-AVX1-NEXT: popq %rbp +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: PR34947: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X64-AVX2-NEXT: vmovdqa (%rsi), %xmm2 +; X64-AVX2-NEXT: vmovdqa 16(%rsi), %xmm3 +; X64-AVX2-NEXT: vpextrd $1, %xmm3, %ecx +; X64-AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 +; X64-AVX2-NEXT: vpextrd $1, %xmm4, %eax +; X64-AVX2-NEXT: xorl %edx, %edx +; X64-AVX2-NEXT: divl %ecx +; X64-AVX2-NEXT: movl %edx, %ecx +; X64-AVX2-NEXT: vmovd %xmm3, %edi +; X64-AVX2-NEXT: vmovd %xmm4, %eax +; X64-AVX2-NEXT: xorl %edx, %edx +; X64-AVX2-NEXT: divl %edi +; X64-AVX2-NEXT: vmovd %edx, %xmm5 +; X64-AVX2-NEXT: vpinsrd $1, %ecx, %xmm5, %xmm5 +; X64-AVX2-NEXT: vpextrd $2, %xmm3, %ecx +; X64-AVX2-NEXT: vpextrd $2, %xmm4, %eax +; X64-AVX2-NEXT: xorl %edx, %edx +; X64-AVX2-NEXT: divl %ecx +; X64-AVX2-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5 +; X64-AVX2-NEXT: vpextrd $3, %xmm3, %ecx +; X64-AVX2-NEXT: vpextrd $3, %xmm4, %eax +; X64-AVX2-NEXT: xorl %edx, %edx +; X64-AVX2-NEXT: divl %ecx +; X64-AVX2-NEXT: vpinsrd $3, %edx, %xmm5, %xmm3 +; X64-AVX2-NEXT: vpextrd $1, %xmm2, %ecx +; X64-AVX2-NEXT: vpextrd $1, %xmm1, %eax +; X64-AVX2-NEXT: xorl %edx, %edx +; X64-AVX2-NEXT: divl %ecx +; X64-AVX2-NEXT: movl %edx, %ecx +; X64-AVX2-NEXT: vmovd %xmm2, %edi +; X64-AVX2-NEXT: vmovd %xmm1, %eax +; X64-AVX2-NEXT: xorl %edx, %edx +; X64-AVX2-NEXT: divl %edi +; X64-AVX2-NEXT: vmovd %edx, %xmm4 +; X64-AVX2-NEXT: vpinsrd $1, %ecx, %xmm4, %xmm4 +; X64-AVX2-NEXT: vpextrd $2, %xmm2, %ecx +; X64-AVX2-NEXT: vpextrd 
$2, %xmm1, %eax +; X64-AVX2-NEXT: xorl %edx, %edx +; X64-AVX2-NEXT: divl %ecx +; X64-AVX2-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4 +; X64-AVX2-NEXT: vpextrd $3, %xmm2, %ecx +; X64-AVX2-NEXT: vpextrd $3, %xmm1, %eax +; X64-AVX2-NEXT: xorl %edx, %edx +; X64-AVX2-NEXT: divl %ecx +; X64-AVX2-NEXT: vpinsrd $3, %edx, %xmm4, %xmm1 +; X64-AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: xorl %edx, %edx +; X64-AVX2-NEXT: divl 32(%rsi) +; X64-AVX2-NEXT: vmovd %edx, %xmm0 +; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [8199,8199,8199,8199,8199,8199,8199,8199] +; X64-AVX2-NEXT: vpmulld %ymm2, %ymm1, %ymm1 +; X64-AVX2-NEXT: movl $8199, %eax # imm = 0x2007 +; X64-AVX2-NEXT: vmovd %eax, %xmm2 +; X64-AVX2-NEXT: vpmulld %xmm2, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, (%rax) +; X64-AVX2-NEXT: vmovdqa %ymm1, (%rax) +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq + %a0 = load <9 x i16>, <9 x i16>* %p0, align 64 + %a1 = load <9 x i32>, <9 x i32>* %p1, align 64 + %ext0 = zext <9 x i16> %a0 to <9 x i32> + %rem = urem <9 x i32> %ext0, %a1 + %mul = mul <9 x i32> , %rem + store <9 x i32> %mul, <9 x i32>* undef, align 64 + ret void +} diff --git a/llvm/test/CodeGen/X86/vec_fp_to_int-widen.ll b/llvm/test/CodeGen/X86/vec_fp_to_int-widen.ll new file mode 100644 index 000000000000..f4e0147e3f1a --- /dev/null +++ b/llvm/test/CodeGen/X86/vec_fp_to_int-widen.ll @@ -0,0 +1,2662 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=VEX --check-prefix=AVX1 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=VEX --check-prefix=AVX2 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512VL +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512DQ +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512VLDQ +; +; 32-bit tests to make sure we're not doing anything stupid. 
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse2 + +; +; Double to Signed Integer +; + +define <2 x i64> @fptosi_2f64_to_2i64(<2 x double> %a) { +; SSE-LABEL: fptosi_2f64_to_2i64: +; SSE: # %bb.0: +; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: movq %rax, %xmm1 +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: movq %rax, %xmm0 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; VEX-LABEL: fptosi_2f64_to_2i64: +; VEX: # %bb.0: +; VEX-NEXT: vcvttsd2si %xmm0, %rax +; VEX-NEXT: vmovq %rax, %xmm1 +; VEX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; VEX-NEXT: vcvttsd2si %xmm0, %rax +; VEX-NEXT: vmovq %rax, %xmm0 +; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; VEX-NEXT: retq +; +; AVX512F-LABEL: fptosi_2f64_to_2i64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vcvttsd2si %xmm0, %rax +; AVX512F-NEXT: vmovq %rax, %xmm1 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512F-NEXT: vcvttsd2si %xmm0, %rax +; AVX512F-NEXT: vmovq %rax, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: fptosi_2f64_to_2i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvttsd2si %xmm0, %rax +; AVX512VL-NEXT: vmovq %rax, %xmm1 +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512VL-NEXT: vcvttsd2si %xmm0, %rax +; AVX512VL-NEXT: vmovq %rax, %xmm0 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: fptosi_2f64_to_2i64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512DQ-NEXT: vcvttpd2qq %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: fptosi_2f64_to_2i64: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvttpd2qq %xmm0, %xmm0 +; AVX512VLDQ-NEXT: retq + %cvt = fptosi <2 x double> %a to <2 x i64> + ret <2 x i64> %cvt +} + +define <4 x i32> @fptosi_2f64_to_4i32(<2 x double> %a) { +; SSE-LABEL: fptosi_2f64_to_4i32: +; SSE: # %bb.0: +; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: fptosi_2f64_to_4i32: +; AVX: # %bb.0: +; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX-NEXT: retq + %cvt = fptosi <2 x double> %a to <2 x i32> + %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> + ret <4 x i32> %ext +} + +define <2 x i32> @fptosi_2f64_to_2i32(<2 x double> %a) { +; SSE-LABEL: fptosi_2f64_to_2i32: +; SSE: # %bb.0: +; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: fptosi_2f64_to_2i32: +; AVX: # %bb.0: +; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX-NEXT: retq + %cvt = fptosi <2 x double> %a to <2 x i32> + ret <2 x i32> %cvt +} + +define <4 x i32> @fptosi_4f64_to_2i32(<2 x double> %a) { +; SSE-LABEL: fptosi_4f64_to_2i32: +; SSE: # %bb.0: +; SSE-NEXT: cvttpd2dq %xmm0, %xmm1 +; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: retq +; +; AVX-LABEL: fptosi_4f64_to_2i32: +; AVX: # %bb.0: +; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX-NEXT: vcvttpd2dq %ymm0, %xmm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq + %ext = shufflevector <2 x double> %a, <2 x double> undef, <4 x i32> + %cvt = fptosi <4 x double> 
%ext to <4 x i32> + ret <4 x i32> %cvt +} + +define <4 x i64> @fptosi_4f64_to_4i64(<4 x double> %a) { +; SSE-LABEL: fptosi_4f64_to_4i64: +; SSE: # %bb.0: +; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: movq %rax, %xmm2 +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: movq %rax, %xmm0 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: cvttsd2si %xmm1, %rax +; SSE-NEXT: movq %rax, %xmm3 +; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; SSE-NEXT: cvttsd2si %xmm1, %rax +; SSE-NEXT: movq %rax, %xmm0 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: retq +; +; AVX1-LABEL: fptosi_4f64_to_4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vcvttsd2si %xmm1, %rax +; AVX1-NEXT: vmovq %rax, %xmm2 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX1-NEXT: vcvttsd2si %xmm1, %rax +; AVX1-NEXT: vmovq %rax, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vcvttsd2si %xmm0, %rax +; AVX1-NEXT: vmovq %rax, %xmm2 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX1-NEXT: vcvttsd2si %xmm0, %rax +; AVX1-NEXT: vmovq %rax, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptosi_4f64_to_4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vcvttsd2si %xmm1, %rax +; AVX2-NEXT: vmovq %rax, %xmm2 +; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX2-NEXT: vcvttsd2si %xmm1, %rax +; AVX2-NEXT: vmovq %rax, %xmm1 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-NEXT: vcvttsd2si %xmm0, %rax +; AVX2-NEXT: vmovq %rax, %xmm2 +; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX2-NEXT: vcvttsd2si %xmm0, %rax +; AVX2-NEXT: vmovq %rax, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: fptosi_4f64_to_4i64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vcvttsd2si %xmm1, %rax +; AVX512F-NEXT: vmovq %rax, %xmm2 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512F-NEXT: vcvttsd2si %xmm1, %rax +; AVX512F-NEXT: vmovq %rax, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512F-NEXT: vcvttsd2si %xmm0, %rax +; AVX512F-NEXT: vmovq %rax, %xmm2 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512F-NEXT: vcvttsd2si %xmm0, %rax +; AVX512F-NEXT: vmovq %rax, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: fptosi_4f64_to_4i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vcvttsd2si %xmm1, %rax +; AVX512VL-NEXT: vmovq %rax, %xmm2 +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512VL-NEXT: vcvttsd2si %xmm1, %rax +; AVX512VL-NEXT: vmovq %rax, %xmm1 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512VL-NEXT: vcvttsd2si %xmm0, %rax +; AVX512VL-NEXT: vmovq %rax, %xmm2 +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512VL-NEXT: vcvttsd2si %xmm0, %rax +; AVX512VL-NEXT: vmovq %rax, %xmm0 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: fptosi_4f64_to_4i64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: 
# kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512DQ-NEXT: vcvttpd2qq %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: fptosi_4f64_to_4i64: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvttpd2qq %ymm0, %ymm0 +; AVX512VLDQ-NEXT: retq + %cvt = fptosi <4 x double> %a to <4 x i64> + ret <4 x i64> %cvt +} + +define <4 x i32> @fptosi_4f64_to_4i32(<4 x double> %a) { +; SSE-LABEL: fptosi_4f64_to_4i32: +; SSE: # %bb.0: +; SSE-NEXT: cvttpd2dq %xmm1, %xmm1 +; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: retq +; +; AVX-LABEL: fptosi_4f64_to_4i32: +; AVX: # %bb.0: +; AVX-NEXT: vcvttpd2dq %ymm0, %xmm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq + %cvt = fptosi <4 x double> %a to <4 x i32> + ret <4 x i32> %cvt +} + +; +; Double to Unsigned Integer +; + +define <2 x i64> @fptoui_2f64_to_2i64(<2 x double> %a) { +; SSE-LABEL: fptoui_2f64_to_2i64: +; SSE: # %bb.0: +; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; SSE-NEXT: movapd %xmm0, %xmm1 +; SSE-NEXT: subsd %xmm2, %xmm1 +; SSE-NEXT: cvttsd2si %xmm1, %rax +; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; SSE-NEXT: xorq %rcx, %rax +; SSE-NEXT: cvttsd2si %xmm0, %rdx +; SSE-NEXT: ucomisd %xmm2, %xmm0 +; SSE-NEXT: cmovaeq %rax, %rdx +; SSE-NEXT: movq %rdx, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: movapd %xmm0, %xmm3 +; SSE-NEXT: subsd %xmm2, %xmm3 +; SSE-NEXT: cvttsd2si %xmm3, %rax +; SSE-NEXT: xorq %rcx, %rax +; SSE-NEXT: cvttsd2si %xmm0, %rcx +; SSE-NEXT: ucomisd %xmm2, %xmm0 +; SSE-NEXT: cmovaeq %rax, %rcx +; SSE-NEXT: movq %rcx, %xmm0 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; VEX-LABEL: fptoui_2f64_to_2i64: +; VEX: # %bb.0: +; VEX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; VEX-NEXT: vsubsd %xmm1, %xmm0, %xmm2 +; VEX-NEXT: vcvttsd2si %xmm2, %rax +; VEX-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; VEX-NEXT: xorq %rcx, %rax +; VEX-NEXT: vcvttsd2si %xmm0, %rdx +; VEX-NEXT: vucomisd %xmm1, %xmm0 +; VEX-NEXT: cmovaeq %rax, %rdx +; VEX-NEXT: vmovq %rdx, %xmm2 +; VEX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; VEX-NEXT: vsubsd %xmm1, %xmm0, %xmm3 +; VEX-NEXT: vcvttsd2si %xmm3, %rax +; VEX-NEXT: xorq %rcx, %rax +; VEX-NEXT: vcvttsd2si %xmm0, %rcx +; VEX-NEXT: vucomisd %xmm1, %xmm0 +; VEX-NEXT: cmovaeq %rax, %rcx +; VEX-NEXT: vmovq %rcx, %xmm0 +; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; VEX-NEXT: retq +; +; AVX512F-LABEL: fptoui_2f64_to_2i64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vcvttsd2usi %xmm0, %rax +; AVX512F-NEXT: vmovq %rax, %xmm1 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512F-NEXT: vcvttsd2usi %xmm0, %rax +; AVX512F-NEXT: vmovq %rax, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: fptoui_2f64_to_2i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvttsd2usi %xmm0, %rax +; AVX512VL-NEXT: vmovq %rax, %xmm1 +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512VL-NEXT: vcvttsd2usi %xmm0, %rax +; AVX512VL-NEXT: vmovq %rax, %xmm0 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: fptoui_2f64_to_2i64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512DQ-NEXT: vcvttpd2uqq %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq 
+; +; AVX512VLDQ-LABEL: fptoui_2f64_to_2i64: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvttpd2uqq %xmm0, %xmm0 +; AVX512VLDQ-NEXT: retq + %cvt = fptoui <2 x double> %a to <2 x i64> + ret <2 x i64> %cvt +} + +define <4 x i32> @fptoui_2f64_to_4i32(<2 x double> %a) { +; SSE-LABEL: fptoui_2f64_to_4i32: +; SSE: # %bb.0: +; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: cvttsd2si %xmm0, %rcx +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: movd %ecx, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero +; SSE-NEXT: retq +; +; AVX1-LABEL: fptoui_2f64_to_4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] +; AVX1-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm3 +; AVX1-NEXT: vsubpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm0 +; AVX1-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vblendvps %xmm2, %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptoui_2f64_to_4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] +; AVX2-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vsubpd %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vcvttpd2dq %ymm1, %xmm1 +; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] +; AVX2-NEXT: vxorpd %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vcvttpd2dq %ymm0, %xmm0 +; AVX2-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: fptoui_2f64_to_4i32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vcvttpd2udq %zmm0, %ymm0 +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: fptoui_2f64_to_4i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvttpd2udq %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: fptoui_2f64_to_4i32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512DQ-NEXT: vcvttpd2udq %zmm0, %ymm0 +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: fptoui_2f64_to_4i32: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvttpd2udq %xmm0, %xmm0 +; AVX512VLDQ-NEXT: retq + %cvt = fptoui <2 x double> %a to <2 x i32> + %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> + ret <4 x i32> %ext +} + +define <4 x i32> @fptoui_2f64_to_2i32(<2 x double> %a) { +; SSE-LABEL: fptoui_2f64_to_2i32: +; SSE: # %bb.0: +; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: movd %eax, %xmm1 +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: fptoui_2f64_to_2i32: +; AVX1: # %bb.0: +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = 
[2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] +; AVX1-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm3 +; AVX1-NEXT: vsubpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm0 +; AVX1-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vblendvps %xmm2, %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptoui_2f64_to_2i32: +; AVX2: # %bb.0: +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] +; AVX2-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vsubpd %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vcvttpd2dq %ymm1, %xmm1 +; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] +; AVX2-NEXT: vxorpd %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vcvttpd2dq %ymm0, %xmm0 +; AVX2-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: fptoui_2f64_to_2i32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vcvttpd2udq %zmm0, %ymm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: fptoui_2f64_to_2i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvttpd2udq %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: fptoui_2f64_to_2i32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512DQ-NEXT: vcvttpd2udq %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: fptoui_2f64_to_2i32: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvttpd2udq %xmm0, %xmm0 +; AVX512VLDQ-NEXT: retq + %cvt = fptoui <2 x double> %a to <2 x i32> + %ext = shufflevector <2 x i32> %cvt, <2 x i32> undef, <4 x i32> + ret <4 x i32> %ext +} + +define <4 x i32> @fptoui_4f64_to_2i32(<2 x double> %a) { +; SSE-LABEL: fptoui_4f64_to_2i32: +; SSE: # %bb.0: +; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: movd %eax, %xmm1 +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero +; SSE-NEXT: retq +; +; AVX1-LABEL: fptoui_4f64_to_2i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovapd %xmm0, %xmm0 +; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] +; AVX1-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vsubpd %ymm1, %ymm0, %ymm1 +; AVX1-NEXT: vcvttpd2dq %ymm1, %xmm1 +; AVX1-NEXT: vxorpd {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm0 +; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptoui_4f64_to_2i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovapd %xmm0, %xmm0 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] +; AVX2-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vsubpd %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vcvttpd2dq %ymm1, %xmm1 +; AVX2-NEXT: vbroadcastss {{.*#+}} 
xmm3 = [2147483648,2147483648,2147483648,2147483648] +; AVX2-NEXT: vxorpd %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vcvttpd2dq %ymm0, %xmm0 +; AVX2-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: fptoui_4f64_to_2i32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps %xmm0, %xmm0 +; AVX512F-NEXT: vcvttpd2udq %zmm0, %ymm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: fptoui_4f64_to_2i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovaps %xmm0, %xmm0 +; AVX512VL-NEXT: vcvttpd2udq %ymm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: fptoui_4f64_to_2i32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vmovaps %xmm0, %xmm0 +; AVX512DQ-NEXT: vcvttpd2udq %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: fptoui_4f64_to_2i32: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vmovaps %xmm0, %xmm0 +; AVX512VLDQ-NEXT: vcvttpd2udq %ymm0, %xmm0 +; AVX512VLDQ-NEXT: vzeroupper +; AVX512VLDQ-NEXT: retq + %ext = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <4 x i32> + %cvt = fptoui <4 x double> %ext to <4 x i32> + ret <4 x i32> %cvt +} + +define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) { +; SSE-LABEL: fptoui_4f64_to_4i64: +; SSE: # %bb.0: +; SSE-NEXT: movapd %xmm0, %xmm2 +; SSE-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero +; SSE-NEXT: subsd %xmm3, %xmm0 +; SSE-NEXT: cvttsd2si %xmm0, %rcx +; SSE-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000 +; SSE-NEXT: xorq %rax, %rcx +; SSE-NEXT: cvttsd2si %xmm2, %rdx +; SSE-NEXT: ucomisd %xmm3, %xmm2 +; SSE-NEXT: cmovaeq %rcx, %rdx +; SSE-NEXT: movq %rdx, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] +; SSE-NEXT: movapd %xmm2, %xmm4 +; SSE-NEXT: subsd %xmm3, %xmm4 +; SSE-NEXT: cvttsd2si %xmm4, %rcx +; SSE-NEXT: xorq %rax, %rcx +; SSE-NEXT: cvttsd2si %xmm2, %rdx +; SSE-NEXT: ucomisd %xmm3, %xmm2 +; SSE-NEXT: cmovaeq %rcx, %rdx +; SSE-NEXT: movq %rdx, %xmm2 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: movapd %xmm1, %xmm2 +; SSE-NEXT: subsd %xmm3, %xmm2 +; SSE-NEXT: cvttsd2si %xmm2, %rcx +; SSE-NEXT: xorq %rax, %rcx +; SSE-NEXT: cvttsd2si %xmm1, %rdx +; SSE-NEXT: ucomisd %xmm3, %xmm1 +; SSE-NEXT: cmovaeq %rcx, %rdx +; SSE-NEXT: movq %rdx, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] +; SSE-NEXT: movapd %xmm1, %xmm4 +; SSE-NEXT: subsd %xmm3, %xmm4 +; SSE-NEXT: cvttsd2si %xmm4, %rcx +; SSE-NEXT: xorq %rax, %rcx +; SSE-NEXT: cvttsd2si %xmm1, %rax +; SSE-NEXT: ucomisd %xmm3, %xmm1 +; SSE-NEXT: cmovaeq %rcx, %rax +; SSE-NEXT: movq %rax, %xmm1 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX1-LABEL: fptoui_4f64_to_4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vsubsd %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vcvttsd2si %xmm3, %rax +; AVX1-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; AVX1-NEXT: xorq %rcx, %rax +; AVX1-NEXT: vcvttsd2si %xmm2, %rdx +; AVX1-NEXT: vucomisd %xmm1, %xmm2 +; AVX1-NEXT: cmovaeq %rax, %rdx +; AVX1-NEXT: vmovq %rdx, %xmm3 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX1-NEXT: vsubsd %xmm1, %xmm2, %xmm4 +; AVX1-NEXT: vcvttsd2si %xmm4, %rax +; AVX1-NEXT: xorq %rcx, %rax +; AVX1-NEXT: vcvttsd2si %xmm2, %rdx +; AVX1-NEXT: vucomisd %xmm1, %xmm2 +; AVX1-NEXT: 
cmovaeq %rax, %rdx +; AVX1-NEXT: vmovq %rdx, %xmm2 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX1-NEXT: vsubsd %xmm1, %xmm0, %xmm3 +; AVX1-NEXT: vcvttsd2si %xmm3, %rax +; AVX1-NEXT: xorq %rcx, %rax +; AVX1-NEXT: vcvttsd2si %xmm0, %rdx +; AVX1-NEXT: vucomisd %xmm1, %xmm0 +; AVX1-NEXT: cmovaeq %rax, %rdx +; AVX1-NEXT: vmovq %rdx, %xmm3 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX1-NEXT: vsubsd %xmm1, %xmm0, %xmm4 +; AVX1-NEXT: vcvttsd2si %xmm4, %rax +; AVX1-NEXT: xorq %rcx, %rax +; AVX1-NEXT: vcvttsd2si %xmm0, %rcx +; AVX1-NEXT: vucomisd %xmm1, %xmm0 +; AVX1-NEXT: cmovaeq %rax, %rcx +; AVX1-NEXT: vmovq %rcx, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptoui_4f64_to_4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vsubsd %xmm1, %xmm2, %xmm3 +; AVX2-NEXT: vcvttsd2si %xmm3, %rax +; AVX2-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; AVX2-NEXT: xorq %rcx, %rax +; AVX2-NEXT: vcvttsd2si %xmm2, %rdx +; AVX2-NEXT: vucomisd %xmm1, %xmm2 +; AVX2-NEXT: cmovaeq %rax, %rdx +; AVX2-NEXT: vmovq %rdx, %xmm3 +; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX2-NEXT: vsubsd %xmm1, %xmm2, %xmm4 +; AVX2-NEXT: vcvttsd2si %xmm4, %rax +; AVX2-NEXT: xorq %rcx, %rax +; AVX2-NEXT: vcvttsd2si %xmm2, %rdx +; AVX2-NEXT: vucomisd %xmm1, %xmm2 +; AVX2-NEXT: cmovaeq %rax, %rdx +; AVX2-NEXT: vmovq %rdx, %xmm2 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX2-NEXT: vsubsd %xmm1, %xmm0, %xmm3 +; AVX2-NEXT: vcvttsd2si %xmm3, %rax +; AVX2-NEXT: xorq %rcx, %rax +; AVX2-NEXT: vcvttsd2si %xmm0, %rdx +; AVX2-NEXT: vucomisd %xmm1, %xmm0 +; AVX2-NEXT: cmovaeq %rax, %rdx +; AVX2-NEXT: vmovq %rdx, %xmm3 +; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX2-NEXT: vsubsd %xmm1, %xmm0, %xmm4 +; AVX2-NEXT: vcvttsd2si %xmm4, %rax +; AVX2-NEXT: xorq %rcx, %rax +; AVX2-NEXT: vcvttsd2si %xmm0, %rcx +; AVX2-NEXT: vucomisd %xmm1, %xmm0 +; AVX2-NEXT: cmovaeq %rax, %rcx +; AVX2-NEXT: vmovq %rcx, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: fptoui_4f64_to_4i64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vcvttsd2usi %xmm1, %rax +; AVX512F-NEXT: vmovq %rax, %xmm2 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512F-NEXT: vcvttsd2usi %xmm1, %rax +; AVX512F-NEXT: vmovq %rax, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512F-NEXT: vcvttsd2usi %xmm0, %rax +; AVX512F-NEXT: vmovq %rax, %xmm2 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512F-NEXT: vcvttsd2usi %xmm0, %rax +; AVX512F-NEXT: vmovq %rax, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: fptoui_4f64_to_4i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vcvttsd2usi %xmm1, %rax +; AVX512VL-NEXT: vmovq %rax, %xmm2 +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512VL-NEXT: vcvttsd2usi %xmm1, %rax +; AVX512VL-NEXT: vmovq %rax, %xmm1 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512VL-NEXT: vcvttsd2usi %xmm0, %rax +; AVX512VL-NEXT: vmovq %rax, %xmm2 +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512VL-NEXT: vcvttsd2usi %xmm0, %rax +; 
AVX512VL-NEXT: vmovq %rax, %xmm0 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: fptoui_4f64_to_4i64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512DQ-NEXT: vcvttpd2uqq %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: fptoui_4f64_to_4i64: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvttpd2uqq %ymm0, %ymm0 +; AVX512VLDQ-NEXT: retq + %cvt = fptoui <4 x double> %a to <4 x i64> + ret <4 x i64> %cvt +} + +define <4 x i32> @fptoui_4f64_to_4i32(<4 x double> %a) { +; SSE-LABEL: fptoui_4f64_to_4i32: +; SSE: # %bb.0: +; SSE-NEXT: cvttsd2si %xmm1, %rax +; SSE-NEXT: movd %eax, %xmm2 +; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; SSE-NEXT: cvttsd2si %xmm1, %rax +; SSE-NEXT: movd %eax, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: movd %eax, %xmm1 +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: fptoui_4f64_to_4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] +; AVX1-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vsubpd %ymm1, %ymm0, %ymm1 +; AVX1-NEXT: vcvttpd2dq %ymm1, %xmm1 +; AVX1-NEXT: vxorpd {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm0 +; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptoui_4f64_to_4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] +; AVX2-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vsubpd %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vcvttpd2dq %ymm1, %xmm1 +; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] +; AVX2-NEXT: vxorpd %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vcvttpd2dq %ymm0, %xmm0 +; AVX2-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: fptoui_4f64_to_4i32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512F-NEXT: vcvttpd2udq %zmm0, %ymm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: fptoui_4f64_to_4i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvttpd2udq %ymm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: fptoui_4f64_to_4i32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512DQ-NEXT: vcvttpd2udq %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: fptoui_4f64_to_4i32: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvttpd2udq %ymm0, %xmm0 +; AVX512VLDQ-NEXT: vzeroupper +; AVX512VLDQ-NEXT: retq + %cvt = fptoui <4 x double> %a to <4 x i32> + ret <4 x i32> %cvt +} + +; +; Float to Signed Integer +; + +define <2 x i32> 
@fptosi_2f32_to_2i32(<2 x float> %a) { +; SSE-LABEL: fptosi_2f32_to_2i32: +; SSE: # %bb.0: +; SSE-NEXT: cvttps2dq %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: fptosi_2f32_to_2i32: +; AVX: # %bb.0: +; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX-NEXT: retq + %cvt = fptosi <2 x float> %a to <2 x i32> + ret <2 x i32> %cvt +} + +define <4 x i32> @fptosi_4f32_to_4i32(<4 x float> %a) { +; SSE-LABEL: fptosi_4f32_to_4i32: +; SSE: # %bb.0: +; SSE-NEXT: cvttps2dq %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: fptosi_4f32_to_4i32: +; AVX: # %bb.0: +; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX-NEXT: retq + %cvt = fptosi <4 x float> %a to <4 x i32> + ret <4 x i32> %cvt +} + +define <2 x i64> @fptosi_2f32_to_2i64(<4 x float> %a) { +; SSE-LABEL: fptosi_2f32_to_2i64: +; SSE: # %bb.0: +; SSE-NEXT: cvttss2si %xmm0, %rax +; SSE-NEXT: movq %rax, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE-NEXT: cvttss2si %xmm0, %rax +; SSE-NEXT: movq %rax, %xmm0 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; VEX-LABEL: fptosi_2f32_to_2i64: +; VEX: # %bb.0: +; VEX-NEXT: vcvttss2si %xmm0, %rax +; VEX-NEXT: vmovq %rax, %xmm1 +; VEX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; VEX-NEXT: vcvttss2si %xmm0, %rax +; VEX-NEXT: vmovq %rax, %xmm0 +; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; VEX-NEXT: retq +; +; AVX512F-LABEL: fptosi_2f32_to_2i64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vcvttss2si %xmm0, %rax +; AVX512F-NEXT: vmovq %rax, %xmm1 +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX512F-NEXT: vcvttss2si %xmm0, %rax +; AVX512F-NEXT: vmovq %rax, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: fptosi_2f32_to_2i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvttss2si %xmm0, %rax +; AVX512VL-NEXT: vmovq %rax, %xmm1 +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX512VL-NEXT: vcvttss2si %xmm0, %rax +; AVX512VL-NEXT: vmovq %rax, %xmm0 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: fptosi_2f32_to_2i64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: fptosi_2f32_to_2i64: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvttps2qq %xmm0, %xmm0 +; AVX512VLDQ-NEXT: retq + %shuf = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> + %cvt = fptosi <2 x float> %shuf to <2 x i64> + ret <2 x i64> %cvt +} + +define <2 x i64> @fptosi_4f32_to_2i64(<4 x float> %a) { +; SSE-LABEL: fptosi_4f32_to_2i64: +; SSE: # %bb.0: +; SSE-NEXT: cvttss2si %xmm0, %rax +; SSE-NEXT: movq %rax, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE-NEXT: cvttss2si %xmm0, %rax +; SSE-NEXT: movq %rax, %xmm0 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; VEX-LABEL: fptosi_4f32_to_2i64: +; VEX: # %bb.0: +; VEX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; VEX-NEXT: vcvttss2si %xmm1, %rax +; VEX-NEXT: vcvttss2si %xmm0, %rcx +; VEX-NEXT: vmovq %rcx, %xmm0 +; VEX-NEXT: vmovq %rax, %xmm1 +; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; VEX-NEXT: retq +; +; AVX512F-LABEL: fptosi_4f32_to_2i64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512F-NEXT: vcvttss2si %xmm1, %rax +; AVX512F-NEXT: 
vcvttss2si %xmm0, %rcx +; AVX512F-NEXT: vmovq %rcx, %xmm0 +; AVX512F-NEXT: vmovq %rax, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: fptosi_4f32_to_2i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512VL-NEXT: vcvttss2si %xmm1, %rax +; AVX512VL-NEXT: vcvttss2si %xmm0, %rcx +; AVX512VL-NEXT: vmovq %rcx, %xmm0 +; AVX512VL-NEXT: vmovq %rax, %xmm1 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: fptosi_4f32_to_2i64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: fptosi_4f32_to_2i64: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvttps2qq %xmm0, %ymm0 +; AVX512VLDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512VLDQ-NEXT: vzeroupper +; AVX512VLDQ-NEXT: retq + %cvt = fptosi <4 x float> %a to <4 x i64> + %shuf = shufflevector <4 x i64> %cvt, <4 x i64> undef, <2 x i32> + ret <2 x i64> %shuf +} + +define <8 x i32> @fptosi_8f32_to_8i32(<8 x float> %a) { +; SSE-LABEL: fptosi_8f32_to_8i32: +; SSE: # %bb.0: +; SSE-NEXT: cvttps2dq %xmm0, %xmm0 +; SSE-NEXT: cvttps2dq %xmm1, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: fptosi_8f32_to_8i32: +; AVX: # %bb.0: +; AVX-NEXT: vcvttps2dq %ymm0, %ymm0 +; AVX-NEXT: retq + %cvt = fptosi <8 x float> %a to <8 x i32> + ret <8 x i32> %cvt +} + +define <4 x i64> @fptosi_4f32_to_4i64(<8 x float> %a) { +; SSE-LABEL: fptosi_4f32_to_4i64: +; SSE: # %bb.0: +; SSE-NEXT: cvttss2si %xmm0, %rax +; SSE-NEXT: movq %rax, %xmm2 +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3] +; SSE-NEXT: cvttss2si %xmm1, %rax +; SSE-NEXT: movq %rax, %xmm1 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] +; SSE-NEXT: cvttss2si %xmm1, %rax +; SSE-NEXT: movq %rax, %xmm3 +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: cvttss2si %xmm0, %rax +; SSE-NEXT: movq %rax, %xmm1 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: fptosi_4f32_to_4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] +; AVX1-NEXT: vcvttss2si %xmm1, %rax +; AVX1-NEXT: vmovq %rax, %xmm1 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX1-NEXT: vcvttss2si %xmm2, %rax +; AVX1-NEXT: vmovq %rax, %xmm2 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vcvttss2si %xmm0, %rax +; AVX1-NEXT: vmovq %rax, %xmm2 +; AVX1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX1-NEXT: vcvttss2si %xmm0, %rax +; AVX1-NEXT: vmovq %rax, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptosi_4f32_to_4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] +; AVX2-NEXT: vcvttss2si %xmm1, %rax +; AVX2-NEXT: vmovq %rax, %xmm1 +; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX2-NEXT: vcvttss2si %xmm2, %rax +; AVX2-NEXT: vmovq %rax, %xmm2 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-NEXT: vcvttss2si %xmm0, %rax +; AVX2-NEXT: vmovq %rax, %xmm2 +; AVX2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX2-NEXT: vcvttss2si %xmm0, %rax +; AVX2-NEXT: vmovq %rax, %xmm0 +; 
AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: fptosi_4f32_to_4i64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] +; AVX512F-NEXT: vcvttss2si %xmm1, %rax +; AVX512F-NEXT: vmovq %rax, %xmm1 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512F-NEXT: vcvttss2si %xmm2, %rax +; AVX512F-NEXT: vmovq %rax, %xmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512F-NEXT: vcvttss2si %xmm0, %rax +; AVX512F-NEXT: vmovq %rax, %xmm2 +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX512F-NEXT: vcvttss2si %xmm0, %rax +; AVX512F-NEXT: vmovq %rax, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: fptosi_4f32_to_4i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] +; AVX512VL-NEXT: vcvttss2si %xmm1, %rax +; AVX512VL-NEXT: vmovq %rax, %xmm1 +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512VL-NEXT: vcvttss2si %xmm2, %rax +; AVX512VL-NEXT: vmovq %rax, %xmm2 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512VL-NEXT: vcvttss2si %xmm0, %rax +; AVX512VL-NEXT: vmovq %rax, %xmm2 +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX512VL-NEXT: vcvttss2si %xmm0, %rax +; AVX512VL-NEXT: vmovq %rax, %xmm0 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: fptosi_4f32_to_4i64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: fptosi_4f32_to_4i64: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvttps2qq %xmm0, %ymm0 +; AVX512VLDQ-NEXT: retq + %shuf = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> + %cvt = fptosi <4 x float> %shuf to <4 x i64> + ret <4 x i64> %cvt +} + +define <4 x i64> @fptosi_8f32_to_4i64(<8 x float> %a) { +; SSE-LABEL: fptosi_8f32_to_4i64: +; SSE: # %bb.0: +; SSE-NEXT: cvttss2si %xmm0, %rax +; SSE-NEXT: movq %rax, %xmm2 +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3] +; SSE-NEXT: cvttss2si %xmm1, %rax +; SSE-NEXT: movq %rax, %xmm1 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] +; SSE-NEXT: cvttss2si %xmm1, %rax +; SSE-NEXT: movq %rax, %xmm3 +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: cvttss2si %xmm0, %rax +; SSE-NEXT: movq %rax, %xmm1 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: fptosi_8f32_to_4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] +; AVX1-NEXT: vcvttss2si %xmm1, %rax +; AVX1-NEXT: vmovq %rax, %xmm1 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX1-NEXT: vcvttss2si %xmm2, %rax +; AVX1-NEXT: vmovq %rax, %xmm2 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vcvttss2si %xmm0, %rax +; AVX1-NEXT: vmovq %rax, %xmm2 +; AVX1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX1-NEXT: vcvttss2si %xmm0, %rax +; AVX1-NEXT: vmovq %rax, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptosi_8f32_to_4i64: +; AVX2: # 
%bb.0: +; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] +; AVX2-NEXT: vcvttss2si %xmm1, %rax +; AVX2-NEXT: vmovq %rax, %xmm1 +; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX2-NEXT: vcvttss2si %xmm2, %rax +; AVX2-NEXT: vmovq %rax, %xmm2 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-NEXT: vcvttss2si %xmm0, %rax +; AVX2-NEXT: vmovq %rax, %xmm2 +; AVX2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX2-NEXT: vcvttss2si %xmm0, %rax +; AVX2-NEXT: vmovq %rax, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: fptosi_8f32_to_4i64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512F-NEXT: vcvttss2si %xmm1, %rax +; AVX512F-NEXT: vcvttss2si %xmm0, %rcx +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512F-NEXT: vcvttss2si %xmm1, %rdx +; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX512F-NEXT: vcvttss2si %xmm0, %rsi +; AVX512F-NEXT: vmovq %rsi, %xmm0 +; AVX512F-NEXT: vmovq %rdx, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512F-NEXT: vmovq %rcx, %xmm1 +; AVX512F-NEXT: vmovq %rax, %xmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: fptosi_8f32_to_4i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512VL-NEXT: vcvttss2si %xmm1, %rax +; AVX512VL-NEXT: vcvttss2si %xmm0, %rcx +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512VL-NEXT: vcvttss2si %xmm1, %rdx +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX512VL-NEXT: vcvttss2si %xmm0, %rsi +; AVX512VL-NEXT: vmovq %rsi, %xmm0 +; AVX512VL-NEXT: vmovq %rdx, %xmm1 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512VL-NEXT: vmovq %rcx, %xmm1 +; AVX512VL-NEXT: vmovq %rax, %xmm2 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: fptosi_8f32_to_4i64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: fptosi_8f32_to_4i64: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvttps2qq %ymm0, %zmm0 +; AVX512VLDQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512VLDQ-NEXT: retq + %cvt = fptosi <8 x float> %a to <8 x i64> + %shuf = shufflevector <8 x i64> %cvt, <8 x i64> undef, <4 x i32> + ret <4 x i64> %shuf +} + +; +; Float to Unsigned Integer +; + +define <2 x i32> @fptoui_2f32_to_2i32(<2 x float> %a) { +; SSE-LABEL: fptoui_2f32_to_2i32: +; SSE: # %bb.0: +; SSE-NEXT: movaps {{.*#+}} xmm2 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: cmpltps %xmm2, %xmm1 +; SSE-NEXT: cvttps2dq %xmm0, %xmm3 +; SSE-NEXT: subps %xmm2, %xmm0 +; SSE-NEXT: cvttps2dq %xmm0, %xmm0 +; SSE-NEXT: xorps {{.*}}(%rip), %xmm0 +; SSE-NEXT: andps %xmm1, %xmm3 +; SSE-NEXT: andnps %xmm0, %xmm1 +; SSE-NEXT: orps %xmm3, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: fptoui_2f32_to_2i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] +; AVX1-NEXT: vcmpltps %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vsubps %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 +; AVX1-NEXT: vxorps {{.*}}(%rip), %xmm1, %xmm1 +; 
AVX1-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptoui_2f32_to_2i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] +; AVX2-NEXT: vcmpltps %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vsubps %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vcvttps2dq %xmm1, %xmm1 +; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] +; AVX2-NEXT: vxorps %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX2-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: fptoui_2f32_to_2i32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vcvttps2udq %zmm0, %zmm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: fptoui_2f32_to_2i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvttps2udq %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: fptoui_2f32_to_2i32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512DQ-NEXT: vcvttps2udq %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: fptoui_2f32_to_2i32: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvttps2udq %xmm0, %xmm0 +; AVX512VLDQ-NEXT: retq + %cvt = fptoui <2 x float> %a to <2 x i32> + ret <2 x i32> %cvt +} + +define <4 x i32> @fptoui_4f32_to_4i32(<4 x float> %a) { +; SSE-LABEL: fptoui_4f32_to_4i32: +; SSE: # %bb.0: +; SSE-NEXT: movaps {{.*#+}} xmm2 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: cmpltps %xmm2, %xmm1 +; SSE-NEXT: cvttps2dq %xmm0, %xmm3 +; SSE-NEXT: subps %xmm2, %xmm0 +; SSE-NEXT: cvttps2dq %xmm0, %xmm0 +; SSE-NEXT: xorps {{.*}}(%rip), %xmm0 +; SSE-NEXT: andps %xmm1, %xmm3 +; SSE-NEXT: andnps %xmm0, %xmm1 +; SSE-NEXT: orps %xmm3, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: fptoui_4f32_to_4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] +; AVX1-NEXT: vcmpltps %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vsubps %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 +; AVX1-NEXT: vxorps {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptoui_4f32_to_4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] +; AVX2-NEXT: vcmpltps %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vsubps %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vcvttps2dq %xmm1, %xmm1 +; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] +; AVX2-NEXT: vxorps %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX2-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: fptoui_4f32_to_4i32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vcvttps2udq %zmm0, %zmm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: fptoui_4f32_to_4i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvttps2udq %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: fptoui_4f32_to_4i32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def 
$zmm0 +; AVX512DQ-NEXT: vcvttps2udq %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: fptoui_4f32_to_4i32: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvttps2udq %xmm0, %xmm0 +; AVX512VLDQ-NEXT: retq + %cvt = fptoui <4 x float> %a to <4 x i32> + ret <4 x i32> %cvt +} + +define <2 x i64> @fptoui_2f32_to_2i64(<4 x float> %a) { +; SSE-LABEL: fptoui_2f32_to_2i64: +; SSE: # %bb.0: +; SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: subss %xmm2, %xmm1 +; SSE-NEXT: cvttss2si %xmm1, %rax +; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; SSE-NEXT: xorq %rcx, %rax +; SSE-NEXT: cvttss2si %xmm0, %rdx +; SSE-NEXT: ucomiss %xmm2, %xmm0 +; SSE-NEXT: cmovaeq %rax, %rdx +; SSE-NEXT: movq %rdx, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: subss %xmm2, %xmm3 +; SSE-NEXT: cvttss2si %xmm3, %rax +; SSE-NEXT: xorq %rcx, %rax +; SSE-NEXT: cvttss2si %xmm0, %rcx +; SSE-NEXT: ucomiss %xmm2, %xmm0 +; SSE-NEXT: cmovaeq %rax, %rcx +; SSE-NEXT: movq %rcx, %xmm0 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; VEX-LABEL: fptoui_2f32_to_2i64: +; VEX: # %bb.0: +; VEX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; VEX-NEXT: vsubss %xmm1, %xmm0, %xmm2 +; VEX-NEXT: vcvttss2si %xmm2, %rax +; VEX-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; VEX-NEXT: xorq %rcx, %rax +; VEX-NEXT: vcvttss2si %xmm0, %rdx +; VEX-NEXT: vucomiss %xmm1, %xmm0 +; VEX-NEXT: cmovaeq %rax, %rdx +; VEX-NEXT: vmovq %rdx, %xmm2 +; VEX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; VEX-NEXT: vsubss %xmm1, %xmm0, %xmm3 +; VEX-NEXT: vcvttss2si %xmm3, %rax +; VEX-NEXT: xorq %rcx, %rax +; VEX-NEXT: vcvttss2si %xmm0, %rcx +; VEX-NEXT: vucomiss %xmm1, %xmm0 +; VEX-NEXT: cmovaeq %rax, %rcx +; VEX-NEXT: vmovq %rcx, %xmm0 +; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; VEX-NEXT: retq +; +; AVX512F-LABEL: fptoui_2f32_to_2i64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vcvttss2usi %xmm0, %rax +; AVX512F-NEXT: vmovq %rax, %xmm1 +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX512F-NEXT: vcvttss2usi %xmm0, %rax +; AVX512F-NEXT: vmovq %rax, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: fptoui_2f32_to_2i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvttss2usi %xmm0, %rax +; AVX512VL-NEXT: vmovq %rax, %xmm1 +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX512VL-NEXT: vcvttss2usi %xmm0, %rax +; AVX512VL-NEXT: vmovq %rax, %xmm0 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: fptoui_2f32_to_2i64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512DQ-NEXT: vcvttps2uqq %ymm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: fptoui_2f32_to_2i64: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvttps2uqq %xmm0, %xmm0 +; AVX512VLDQ-NEXT: retq + %shuf = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> + %cvt = fptoui <2 x float> %shuf to <2 x i64> + ret <2 x i64> %cvt +} + +define <2 x i64> @fptoui_4f32_to_2i64(<4 x float> %a) { +; SSE-LABEL: fptoui_4f32_to_2i64: +; SSE: # %bb.0: +; SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE-NEXT: movaps 
%xmm0, %xmm1 +; SSE-NEXT: subss %xmm2, %xmm1 +; SSE-NEXT: cvttss2si %xmm1, %rax +; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; SSE-NEXT: xorq %rcx, %rax +; SSE-NEXT: cvttss2si %xmm0, %rdx +; SSE-NEXT: ucomiss %xmm2, %xmm0 +; SSE-NEXT: cmovaeq %rax, %rdx +; SSE-NEXT: movq %rdx, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: subss %xmm2, %xmm3 +; SSE-NEXT: cvttss2si %xmm3, %rax +; SSE-NEXT: xorq %rcx, %rax +; SSE-NEXT: cvttss2si %xmm0, %rcx +; SSE-NEXT: ucomiss %xmm2, %xmm0 +; SSE-NEXT: cmovaeq %rax, %rcx +; SSE-NEXT: movq %rcx, %xmm0 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; VEX-LABEL: fptoui_4f32_to_2i64: +; VEX: # %bb.0: +; VEX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; VEX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; VEX-NEXT: vsubss %xmm2, %xmm1, %xmm3 +; VEX-NEXT: vcvttss2si %xmm3, %rax +; VEX-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; VEX-NEXT: xorq %rcx, %rax +; VEX-NEXT: vcvttss2si %xmm1, %rdx +; VEX-NEXT: vucomiss %xmm2, %xmm1 +; VEX-NEXT: cmovaeq %rax, %rdx +; VEX-NEXT: vsubss %xmm2, %xmm0, %xmm1 +; VEX-NEXT: vcvttss2si %xmm1, %rax +; VEX-NEXT: xorq %rcx, %rax +; VEX-NEXT: vcvttss2si %xmm0, %rcx +; VEX-NEXT: vucomiss %xmm2, %xmm0 +; VEX-NEXT: cmovaeq %rax, %rcx +; VEX-NEXT: vmovq %rcx, %xmm0 +; VEX-NEXT: vmovq %rdx, %xmm1 +; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; VEX-NEXT: retq +; +; AVX512F-LABEL: fptoui_4f32_to_2i64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512F-NEXT: vcvttss2usi %xmm1, %rax +; AVX512F-NEXT: vcvttss2usi %xmm0, %rcx +; AVX512F-NEXT: vmovq %rcx, %xmm0 +; AVX512F-NEXT: vmovq %rax, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: fptoui_4f32_to_2i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512VL-NEXT: vcvttss2usi %xmm1, %rax +; AVX512VL-NEXT: vcvttss2usi %xmm0, %rcx +; AVX512VL-NEXT: vmovq %rcx, %xmm0 +; AVX512VL-NEXT: vmovq %rax, %xmm1 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: fptoui_4f32_to_2i64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512DQ-NEXT: vcvttps2uqq %ymm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: fptoui_4f32_to_2i64: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvttps2uqq %xmm0, %ymm0 +; AVX512VLDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512VLDQ-NEXT: vzeroupper +; AVX512VLDQ-NEXT: retq + %cvt = fptoui <4 x float> %a to <4 x i64> + %shuf = shufflevector <4 x i64> %cvt, <4 x i64> undef, <2 x i32> + ret <2 x i64> %shuf +} + +define <8 x i32> @fptoui_8f32_to_8i32(<8 x float> %a) { +; SSE-LABEL: fptoui_8f32_to_8i32: +; SSE: # %bb.0: +; SSE-NEXT: movaps {{.*#+}} xmm4 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: cmpltps %xmm4, %xmm2 +; SSE-NEXT: cvttps2dq %xmm0, %xmm3 +; SSE-NEXT: subps %xmm4, %xmm0 +; SSE-NEXT: cvttps2dq %xmm0, %xmm0 +; SSE-NEXT: movaps {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] +; SSE-NEXT: xorps %xmm5, %xmm0 +; SSE-NEXT: andps %xmm2, %xmm3 +; SSE-NEXT: andnps %xmm0, %xmm2 +; SSE-NEXT: orps %xmm3, %xmm2 +; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: cmpltps %xmm4, %xmm3 +; 
SSE-NEXT: cvttps2dq %xmm1, %xmm0 +; SSE-NEXT: subps %xmm4, %xmm1 +; SSE-NEXT: cvttps2dq %xmm1, %xmm1 +; SSE-NEXT: xorps %xmm5, %xmm1 +; SSE-NEXT: andps %xmm3, %xmm0 +; SSE-NEXT: andnps %xmm1, %xmm3 +; SSE-NEXT: orps %xmm0, %xmm3 +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: retq +; +; AVX1-LABEL: fptoui_8f32_to_8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] +; AVX1-NEXT: vcmpltps %ymm1, %ymm0, %ymm2 +; AVX1-NEXT: vsubps %ymm1, %ymm0, %ymm1 +; AVX1-NEXT: vcvttps2dq %ymm1, %ymm1 +; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm1, %ymm1 +; AVX1-NEXT: vcvttps2dq %ymm0, %ymm0 +; AVX1-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptoui_8f32_to_8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] +; AVX2-NEXT: vcmpltps %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vsubps %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vcvttps2dq %ymm1, %ymm1 +; AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] +; AVX2-NEXT: vxorps %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vcvttps2dq %ymm0, %ymm0 +; AVX2-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: fptoui_8f32_to_8i32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512F-NEXT: vcvttps2udq %zmm0, %zmm0 +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: fptoui_8f32_to_8i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvttps2udq %ymm0, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: fptoui_8f32_to_8i32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512DQ-NEXT: vcvttps2udq %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: fptoui_8f32_to_8i32: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvttps2udq %ymm0, %ymm0 +; AVX512VLDQ-NEXT: retq + %cvt = fptoui <8 x float> %a to <8 x i32> + ret <8 x i32> %cvt +} + +define <4 x i64> @fptoui_4f32_to_4i64(<8 x float> %a) { +; SSE-LABEL: fptoui_4f32_to_4i64: +; SSE: # %bb.0: +; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: subss %xmm1, %xmm2 +; SSE-NEXT: cvttss2si %xmm2, %rcx +; SSE-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000 +; SSE-NEXT: xorq %rax, %rcx +; SSE-NEXT: cvttss2si %xmm0, %rdx +; SSE-NEXT: ucomiss %xmm1, %xmm0 +; SSE-NEXT: cmovaeq %rcx, %rdx +; SSE-NEXT: movq %rdx, %xmm2 +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[2,3] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: subss %xmm1, %xmm4 +; SSE-NEXT: cvttss2si %xmm4, %rcx +; SSE-NEXT: xorq %rax, %rcx +; SSE-NEXT: cvttss2si %xmm3, %rdx +; SSE-NEXT: ucomiss %xmm1, %xmm3 +; SSE-NEXT: cmovaeq %rcx, %rdx +; SSE-NEXT: movq %rdx, %xmm3 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[2,3] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: subss %xmm1, %xmm4 +; SSE-NEXT: cvttss2si %xmm4, %rcx +; SSE-NEXT: xorq %rax, %rcx +; SSE-NEXT: cvttss2si %xmm3, %rdx +; SSE-NEXT: ucomiss %xmm1, %xmm3 +; SSE-NEXT: cmovaeq %rcx, %rdx +; SSE-NEXT: movq %rdx, %xmm3 +; SSE-NEXT: movhlps {{.*#+}} xmm0 = 
xmm0[1,1] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: subss %xmm1, %xmm4 +; SSE-NEXT: cvttss2si %xmm4, %rcx +; SSE-NEXT: xorq %rax, %rcx +; SSE-NEXT: cvttss2si %xmm0, %rax +; SSE-NEXT: ucomiss %xmm1, %xmm0 +; SSE-NEXT: cmovaeq %rcx, %rax +; SSE-NEXT: movq %rax, %xmm1 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: fptoui_4f32_to_4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3] +; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: vsubss %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vcvttss2si %xmm3, %rax +; AVX1-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; AVX1-NEXT: xorq %rcx, %rax +; AVX1-NEXT: vcvttss2si %xmm2, %rdx +; AVX1-NEXT: vucomiss %xmm1, %xmm2 +; AVX1-NEXT: cmovaeq %rax, %rdx +; AVX1-NEXT: vmovq %rdx, %xmm2 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX1-NEXT: vsubss %xmm1, %xmm3, %xmm4 +; AVX1-NEXT: vcvttss2si %xmm4, %rax +; AVX1-NEXT: xorq %rcx, %rax +; AVX1-NEXT: vcvttss2si %xmm3, %rdx +; AVX1-NEXT: vucomiss %xmm1, %xmm3 +; AVX1-NEXT: cmovaeq %rax, %rdx +; AVX1-NEXT: vmovq %rdx, %xmm3 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm3 +; AVX1-NEXT: vcvttss2si %xmm3, %rax +; AVX1-NEXT: xorq %rcx, %rax +; AVX1-NEXT: vcvttss2si %xmm0, %rdx +; AVX1-NEXT: vucomiss %xmm1, %xmm0 +; AVX1-NEXT: cmovaeq %rax, %rdx +; AVX1-NEXT: vmovq %rdx, %xmm3 +; AVX1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm4 +; AVX1-NEXT: vcvttss2si %xmm4, %rax +; AVX1-NEXT: xorq %rcx, %rax +; AVX1-NEXT: vcvttss2si %xmm0, %rcx +; AVX1-NEXT: vucomiss %xmm1, %xmm0 +; AVX1-NEXT: cmovaeq %rax, %rcx +; AVX1-NEXT: vmovq %rcx, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptoui_4f32_to_4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3] +; AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX2-NEXT: vsubss %xmm1, %xmm2, %xmm3 +; AVX2-NEXT: vcvttss2si %xmm3, %rax +; AVX2-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; AVX2-NEXT: xorq %rcx, %rax +; AVX2-NEXT: vcvttss2si %xmm2, %rdx +; AVX2-NEXT: vucomiss %xmm1, %xmm2 +; AVX2-NEXT: cmovaeq %rax, %rdx +; AVX2-NEXT: vmovq %rdx, %xmm2 +; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX2-NEXT: vsubss %xmm1, %xmm3, %xmm4 +; AVX2-NEXT: vcvttss2si %xmm4, %rax +; AVX2-NEXT: xorq %rcx, %rax +; AVX2-NEXT: vcvttss2si %xmm3, %rdx +; AVX2-NEXT: vucomiss %xmm1, %xmm3 +; AVX2-NEXT: cmovaeq %rax, %rdx +; AVX2-NEXT: vmovq %rdx, %xmm3 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX2-NEXT: vsubss %xmm1, %xmm0, %xmm3 +; AVX2-NEXT: vcvttss2si %xmm3, %rax +; AVX2-NEXT: xorq %rcx, %rax +; AVX2-NEXT: vcvttss2si %xmm0, %rdx +; AVX2-NEXT: vucomiss %xmm1, %xmm0 +; AVX2-NEXT: cmovaeq %rax, %rdx +; AVX2-NEXT: vmovq %rdx, %xmm3 +; AVX2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX2-NEXT: vsubss %xmm1, %xmm0, %xmm4 +; AVX2-NEXT: vcvttss2si %xmm4, %rax +; AVX2-NEXT: xorq %rcx, %rax +; AVX2-NEXT: vcvttss2si %xmm0, %rcx +; AVX2-NEXT: vucomiss %xmm1, %xmm0 +; AVX2-NEXT: cmovaeq %rax, %rcx +; AVX2-NEXT: vmovq %rcx, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: fptoui_4f32_to_4i64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] +; AVX512F-NEXT: 
vcvttss2usi %xmm1, %rax +; AVX512F-NEXT: vmovq %rax, %xmm1 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512F-NEXT: vcvttss2usi %xmm2, %rax +; AVX512F-NEXT: vmovq %rax, %xmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512F-NEXT: vcvttss2usi %xmm0, %rax +; AVX512F-NEXT: vmovq %rax, %xmm2 +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX512F-NEXT: vcvttss2usi %xmm0, %rax +; AVX512F-NEXT: vmovq %rax, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: fptoui_4f32_to_4i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] +; AVX512VL-NEXT: vcvttss2usi %xmm1, %rax +; AVX512VL-NEXT: vmovq %rax, %xmm1 +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512VL-NEXT: vcvttss2usi %xmm2, %rax +; AVX512VL-NEXT: vmovq %rax, %xmm2 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512VL-NEXT: vcvttss2usi %xmm0, %rax +; AVX512VL-NEXT: vmovq %rax, %xmm2 +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX512VL-NEXT: vcvttss2usi %xmm0, %rax +; AVX512VL-NEXT: vmovq %rax, %xmm0 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: fptoui_4f32_to_4i64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvttps2uqq %ymm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: fptoui_4f32_to_4i64: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvttps2uqq %xmm0, %ymm0 +; AVX512VLDQ-NEXT: retq + %shuf = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> + %cvt = fptoui <4 x float> %shuf to <4 x i64> + ret <4 x i64> %cvt +} + +define <4 x i64> @fptoui_8f32_to_4i64(<8 x float> %a) { +; SSE-LABEL: fptoui_8f32_to_4i64: +; SSE: # %bb.0: +; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: subss %xmm1, %xmm2 +; SSE-NEXT: cvttss2si %xmm2, %rcx +; SSE-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000 +; SSE-NEXT: xorq %rax, %rcx +; SSE-NEXT: cvttss2si %xmm0, %rdx +; SSE-NEXT: ucomiss %xmm1, %xmm0 +; SSE-NEXT: cmovaeq %rcx, %rdx +; SSE-NEXT: movq %rdx, %xmm2 +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[2,3] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: subss %xmm1, %xmm4 +; SSE-NEXT: cvttss2si %xmm4, %rcx +; SSE-NEXT: xorq %rax, %rcx +; SSE-NEXT: cvttss2si %xmm3, %rdx +; SSE-NEXT: ucomiss %xmm1, %xmm3 +; SSE-NEXT: cmovaeq %rcx, %rdx +; SSE-NEXT: movq %rdx, %xmm3 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[2,3] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: subss %xmm1, %xmm4 +; SSE-NEXT: cvttss2si %xmm4, %rcx +; SSE-NEXT: xorq %rax, %rcx +; SSE-NEXT: cvttss2si %xmm3, %rdx +; SSE-NEXT: ucomiss %xmm1, %xmm3 +; SSE-NEXT: cmovaeq %rcx, %rdx +; SSE-NEXT: movq %rdx, %xmm3 +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: subss %xmm1, %xmm4 +; SSE-NEXT: cvttss2si %xmm4, %rcx +; SSE-NEXT: xorq %rax, %rcx +; SSE-NEXT: cvttss2si %xmm0, %rax +; SSE-NEXT: ucomiss %xmm1, %xmm0 +; SSE-NEXT: cmovaeq %rcx, %rax +; SSE-NEXT: movq %rax, %xmm1 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: fptoui_8f32_to_4i64: +; AVX1: # %bb.0: +; 
AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3] +; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: vsubss %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vcvttss2si %xmm3, %rax +; AVX1-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; AVX1-NEXT: xorq %rcx, %rax +; AVX1-NEXT: vcvttss2si %xmm2, %rdx +; AVX1-NEXT: vucomiss %xmm1, %xmm2 +; AVX1-NEXT: cmovaeq %rax, %rdx +; AVX1-NEXT: vmovq %rdx, %xmm2 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX1-NEXT: vsubss %xmm1, %xmm3, %xmm4 +; AVX1-NEXT: vcvttss2si %xmm4, %rax +; AVX1-NEXT: xorq %rcx, %rax +; AVX1-NEXT: vcvttss2si %xmm3, %rdx +; AVX1-NEXT: vucomiss %xmm1, %xmm3 +; AVX1-NEXT: cmovaeq %rax, %rdx +; AVX1-NEXT: vmovq %rdx, %xmm3 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm3 +; AVX1-NEXT: vcvttss2si %xmm3, %rax +; AVX1-NEXT: xorq %rcx, %rax +; AVX1-NEXT: vcvttss2si %xmm0, %rdx +; AVX1-NEXT: vucomiss %xmm1, %xmm0 +; AVX1-NEXT: cmovaeq %rax, %rdx +; AVX1-NEXT: vmovq %rdx, %xmm3 +; AVX1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm4 +; AVX1-NEXT: vcvttss2si %xmm4, %rax +; AVX1-NEXT: xorq %rcx, %rax +; AVX1-NEXT: vcvttss2si %xmm0, %rcx +; AVX1-NEXT: vucomiss %xmm1, %xmm0 +; AVX1-NEXT: cmovaeq %rax, %rcx +; AVX1-NEXT: vmovq %rcx, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptoui_8f32_to_4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3] +; AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX2-NEXT: vsubss %xmm1, %xmm2, %xmm3 +; AVX2-NEXT: vcvttss2si %xmm3, %rax +; AVX2-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; AVX2-NEXT: xorq %rcx, %rax +; AVX2-NEXT: vcvttss2si %xmm2, %rdx +; AVX2-NEXT: vucomiss %xmm1, %xmm2 +; AVX2-NEXT: cmovaeq %rax, %rdx +; AVX2-NEXT: vmovq %rdx, %xmm2 +; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX2-NEXT: vsubss %xmm1, %xmm3, %xmm4 +; AVX2-NEXT: vcvttss2si %xmm4, %rax +; AVX2-NEXT: xorq %rcx, %rax +; AVX2-NEXT: vcvttss2si %xmm3, %rdx +; AVX2-NEXT: vucomiss %xmm1, %xmm3 +; AVX2-NEXT: cmovaeq %rax, %rdx +; AVX2-NEXT: vmovq %rdx, %xmm3 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX2-NEXT: vsubss %xmm1, %xmm0, %xmm3 +; AVX2-NEXT: vcvttss2si %xmm3, %rax +; AVX2-NEXT: xorq %rcx, %rax +; AVX2-NEXT: vcvttss2si %xmm0, %rdx +; AVX2-NEXT: vucomiss %xmm1, %xmm0 +; AVX2-NEXT: cmovaeq %rax, %rdx +; AVX2-NEXT: vmovq %rdx, %xmm3 +; AVX2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX2-NEXT: vsubss %xmm1, %xmm0, %xmm4 +; AVX2-NEXT: vcvttss2si %xmm4, %rax +; AVX2-NEXT: xorq %rcx, %rax +; AVX2-NEXT: vcvttss2si %xmm0, %rcx +; AVX2-NEXT: vucomiss %xmm1, %xmm0 +; AVX2-NEXT: cmovaeq %rax, %rcx +; AVX2-NEXT: vmovq %rcx, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: fptoui_8f32_to_4i64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512F-NEXT: vcvttss2usi %xmm1, %rax +; AVX512F-NEXT: vcvttss2usi %xmm0, %rcx +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512F-NEXT: vcvttss2usi %xmm1, %rdx +; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX512F-NEXT: vcvttss2usi %xmm0, %rsi +; AVX512F-NEXT: vmovq %rsi, %xmm0 +; AVX512F-NEXT: vmovq %rdx, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512F-NEXT: vmovq %rcx, %xmm1 +; 
AVX512F-NEXT: vmovq %rax, %xmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: fptoui_8f32_to_4i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512VL-NEXT: vcvttss2usi %xmm1, %rax +; AVX512VL-NEXT: vcvttss2usi %xmm0, %rcx +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512VL-NEXT: vcvttss2usi %xmm1, %rdx +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX512VL-NEXT: vcvttss2usi %xmm0, %rsi +; AVX512VL-NEXT: vmovq %rsi, %xmm0 +; AVX512VL-NEXT: vmovq %rdx, %xmm1 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512VL-NEXT: vmovq %rcx, %xmm1 +; AVX512VL-NEXT: vmovq %rax, %xmm2 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: fptoui_8f32_to_4i64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvttps2uqq %ymm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: fptoui_8f32_to_4i64: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvttps2uqq %ymm0, %zmm0 +; AVX512VLDQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512VLDQ-NEXT: retq + %cvt = fptoui <8 x float> %a to <8 x i64> + %shuf = shufflevector <8 x i64> %cvt, <8 x i64> undef, <4 x i32> + ret <4 x i64> %shuf +} + +; +; Constant Folding +; + +define <2 x i64> @fptosi_2f64_to_2i64_const() { +; SSE-LABEL: fptosi_2f64_to_2i64_const: +; SSE: # %bb.0: +; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,18446744073709551615] +; SSE-NEXT: retq +; +; AVX-LABEL: fptosi_2f64_to_2i64_const: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,18446744073709551615] +; AVX-NEXT: retq + %cvt = fptosi <2 x double> to <2 x i64> + ret <2 x i64> %cvt +} + +define <4 x i32> @fptosi_2f64_to_2i32_const() { +; SSE-LABEL: fptosi_2f64_to_2i32_const: +; SSE: # %bb.0: +; SSE-NEXT: movaps {{.*#+}} xmm0 = <4294967295,1,u,u> +; SSE-NEXT: retq +; +; AVX-LABEL: fptosi_2f64_to_2i32_const: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = <4294967295,1,u,u> +; AVX-NEXT: retq + %cvt = fptosi <2 x double> to <2 x i32> + %ext = shufflevector <2 x i32> %cvt, <2 x i32> undef, <4 x i32> + ret <4 x i32> %ext +} + +define <4 x i64> @fptosi_4f64_to_4i64_const() { +; SSE-LABEL: fptosi_4f64_to_4i64_const: +; SSE: # %bb.0: +; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,18446744073709551615] +; SSE-NEXT: movaps {{.*#+}} xmm1 = [2,18446744073709551613] +; SSE-NEXT: retq +; +; AVX-LABEL: fptosi_4f64_to_4i64_const: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,18446744073709551615,2,18446744073709551613] +; AVX-NEXT: retq + %cvt = fptosi <4 x double> to <4 x i64> + ret <4 x i64> %cvt +} + +define <4 x i32> @fptosi_4f64_to_4i32_const() { +; SSE-LABEL: fptosi_4f64_to_4i32_const: +; SSE: # %bb.0: +; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967295,1,4294967294,3] +; SSE-NEXT: retq +; +; AVX-LABEL: fptosi_4f64_to_4i32_const: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,1,4294967294,3] +; AVX-NEXT: retq + %cvt = fptosi <4 x double> to <4 x i32> + ret <4 x i32> %cvt +} + +define <2 x i64> @fptoui_2f64_to_2i64_const() { +; SSE-LABEL: fptoui_2f64_to_2i64_const: +; SSE: # %bb.0: +; SSE-NEXT: movaps {{.*#+}} xmm0 = [2,4] +; SSE-NEXT: retq +; +; AVX-LABEL: fptoui_2f64_to_2i64_const: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [2,4] +; AVX-NEXT: retq + %cvt = fptoui <2 x double> to <2 x i64> + ret <2 x i64> %cvt 
+} + +define <4 x i32> @fptoui_2f64_to_2i32_const(<2 x double> %a) { +; SSE-LABEL: fptoui_2f64_to_2i32_const: +; SSE: # %bb.0: +; SSE-NEXT: movaps {{.*#+}} xmm0 = <2,4,u,u> +; SSE-NEXT: retq +; +; AVX-LABEL: fptoui_2f64_to_2i32_const: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = <2,4,u,u> +; AVX-NEXT: retq + %cvt = fptoui <2 x double> to <2 x i32> + %ext = shufflevector <2 x i32> %cvt, <2 x i32> undef, <4 x i32> + ret <4 x i32> %ext +} + +define <4 x i64> @fptoui_4f64_to_4i64_const(<4 x double> %a) { +; SSE-LABEL: fptoui_4f64_to_4i64_const: +; SSE: # %bb.0: +; SSE-NEXT: movaps {{.*#+}} xmm0 = [2,4] +; SSE-NEXT: movaps {{.*#+}} xmm1 = [6,8] +; SSE-NEXT: retq +; +; AVX-LABEL: fptoui_4f64_to_4i64_const: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [2,4,6,8] +; AVX-NEXT: retq + %cvt = fptoui <4 x double> to <4 x i64> + ret <4 x i64> %cvt +} + +define <4 x i32> @fptoui_4f64_to_4i32_const(<4 x double> %a) { +; SSE-LABEL: fptoui_4f64_to_4i32_const: +; SSE: # %bb.0: +; SSE-NEXT: movaps {{.*#+}} xmm0 = [2,4,6,8] +; SSE-NEXT: retq +; +; AVX-LABEL: fptoui_4f64_to_4i32_const: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [2,4,6,8] +; AVX-NEXT: retq + %cvt = fptoui <4 x double> to <4 x i32> + ret <4 x i32> %cvt +} + +define <4 x i32> @fptosi_4f32_to_4i32_const() { +; SSE-LABEL: fptosi_4f32_to_4i32_const: +; SSE: # %bb.0: +; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,4294967295,2,3] +; SSE-NEXT: retq +; +; AVX-LABEL: fptosi_4f32_to_4i32_const: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,4294967295,2,3] +; AVX-NEXT: retq + %cvt = fptosi <4 x float> to <4 x i32> + ret <4 x i32> %cvt +} + +define <4 x i64> @fptosi_4f32_to_4i64_const() { +; SSE-LABEL: fptosi_4f32_to_4i64_const: +; SSE: # %bb.0: +; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,18446744073709551615] +; SSE-NEXT: movaps {{.*#+}} xmm1 = [2,3] +; SSE-NEXT: retq +; +; AVX-LABEL: fptosi_4f32_to_4i64_const: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,18446744073709551615,2,3] +; AVX-NEXT: retq + %cvt = fptosi <4 x float> to <4 x i64> + ret <4 x i64> %cvt +} + +define <8 x i32> @fptosi_8f32_to_8i32_const(<8 x float> %a) { +; SSE-LABEL: fptosi_8f32_to_8i32_const: +; SSE: # %bb.0: +; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,4294967295,2,3] +; SSE-NEXT: movaps {{.*#+}} xmm1 = [6,4294967288,2,4294967295] +; SSE-NEXT: retq +; +; AVX-LABEL: fptosi_8f32_to_8i32_const: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,4294967295,2,3,6,4294967288,2,4294967295] +; AVX-NEXT: retq + %cvt = fptosi <8 x float> to <8 x i32> + ret <8 x i32> %cvt +} + +define <4 x i32> @fptoui_4f32_to_4i32_const(<4 x float> %a) { +; SSE-LABEL: fptoui_4f32_to_4i32_const: +; SSE: # %bb.0: +; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,2,4,6] +; SSE-NEXT: retq +; +; AVX-LABEL: fptoui_4f32_to_4i32_const: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,2,4,6] +; AVX-NEXT: retq + %cvt = fptoui <4 x float> to <4 x i32> + ret <4 x i32> %cvt +} + +define <4 x i64> @fptoui_4f32_to_4i64_const() { +; SSE-LABEL: fptoui_4f32_to_4i64_const: +; SSE: # %bb.0: +; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,2] +; SSE-NEXT: movaps {{.*#+}} xmm1 = [4,8] +; SSE-NEXT: retq +; +; AVX-LABEL: fptoui_4f32_to_4i64_const: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,4,8] +; AVX-NEXT: retq + %cvt = fptoui <4 x float> to <4 x i64> + ret <4 x i64> %cvt +} + +define <8 x i32> @fptoui_8f32_to_8i32_const(<8 x float> %a) { +; SSE-LABEL: fptoui_8f32_to_8i32_const: +; SSE: # %bb.0: +; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,2,4,6] +; SSE-NEXT: movaps {{.*#+}} xmm1 = [8,6,4,1] +; 
SSE-NEXT: retq +; +; AVX-LABEL: fptoui_8f32_to_8i32_const: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,4,6,8,6,4,1] +; AVX-NEXT: retq + %cvt = fptoui <8 x float> to <8 x i32> + ret <8 x i32> %cvt +} + +; +; Special Cases +; + +define <4 x i32> @fptosi_2f16_to_4i32(<2 x half> %a) nounwind { +; SSE-LABEL: fptosi_2f16_to_4i32: +; SSE: # %bb.0: +; SSE-NEXT: pushq %rax +; SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: callq __gnu_f2h_ieee +; SSE-NEXT: movzwl %ax, %edi +; SSE-NEXT: callq __gnu_h2f_ieee +; SSE-NEXT: movss %xmm0, (%rsp) # 4-byte Spill +; SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: callq __gnu_f2h_ieee +; SSE-NEXT: movzwl %ax, %edi +; SSE-NEXT: callq __gnu_h2f_ieee +; SSE-NEXT: cvttss2si %xmm0, %eax +; SSE-NEXT: cvttss2si (%rsp), %ecx # 4-byte Folded Reload +; SSE-NEXT: movd %ecx, %xmm0 +; SSE-NEXT: movd %eax, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero +; SSE-NEXT: popq %rax +; SSE-NEXT: retq +; +; VEX-LABEL: fptosi_2f16_to_4i32: +; VEX: # %bb.0: +; VEX-NEXT: pushq %rax +; VEX-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; VEX-NEXT: vmovaps %xmm1, %xmm0 +; VEX-NEXT: callq __gnu_f2h_ieee +; VEX-NEXT: movzwl %ax, %edi +; VEX-NEXT: callq __gnu_h2f_ieee +; VEX-NEXT: vmovss %xmm0, (%rsp) # 4-byte Spill +; VEX-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; VEX-NEXT: # xmm0 = mem[0],zero,zero,zero +; VEX-NEXT: callq __gnu_f2h_ieee +; VEX-NEXT: movzwl %ax, %edi +; VEX-NEXT: callq __gnu_h2f_ieee +; VEX-NEXT: vcvttss2si %xmm0, %eax +; VEX-NEXT: vcvttss2si (%rsp), %ecx # 4-byte Folded Reload +; VEX-NEXT: vmovd %ecx, %xmm0 +; VEX-NEXT: vmovd %eax, %xmm1 +; VEX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; VEX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; VEX-NEXT: popq %rax +; VEX-NEXT: retq +; +; AVX512-LABEL: fptosi_2f16_to_4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vcvttss2si %xmm0, %eax +; AVX512-NEXT: vcvttss2si %xmm1, %ecx +; AVX512-NEXT: vmovd %ecx, %xmm0 +; AVX512-NEXT: vmovd %eax, %xmm1 +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512-NEXT: retq + %cvt = fptosi <2 x half> %a to <2 x i32> + %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> + ret <4 x i32> %ext +} + +define <4 x i32> @fptosi_2f80_to_4i32(<2 x x86_fp80> %a) nounwind { +; SSE-LABEL: fptosi_2f80_to_4i32: +; SSE: # %bb.0: +; SSE-NEXT: fldt {{[0-9]+}}(%rsp) +; SSE-NEXT: fldt {{[0-9]+}}(%rsp) +; SSE-NEXT: fnstcw -{{[0-9]+}}(%rsp) +; SSE-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; SSE-NEXT: movw $3199, -{{[0-9]+}}(%rsp) # imm = 0xC7F +; SSE-NEXT: fldcw -{{[0-9]+}}(%rsp) +; SSE-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; SSE-NEXT: fistpl -{{[0-9]+}}(%rsp) +; SSE-NEXT: fldcw -{{[0-9]+}}(%rsp) +; SSE-NEXT: fnstcw -{{[0-9]+}}(%rsp) +; SSE-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; SSE-NEXT: movw $3199, -{{[0-9]+}}(%rsp) # imm = 0xC7F +; SSE-NEXT: fldcw -{{[0-9]+}}(%rsp) +; SSE-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; SSE-NEXT: fistpl -{{[0-9]+}}(%rsp) +; SSE-NEXT: fldcw -{{[0-9]+}}(%rsp) +; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero +; SSE-NEXT: retq +; +; AVX-LABEL: fptosi_2f80_to_4i32: +; AVX: # %bb.0: +; AVX-NEXT: fldt {{[0-9]+}}(%rsp) +; AVX-NEXT: fldt {{[0-9]+}}(%rsp) +; AVX-NEXT: fisttpl -{{[0-9]+}}(%rsp) +; AVX-NEXT: fisttpl -{{[0-9]+}}(%rsp) +; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX-NEXT: retq + %cvt = fptosi <2 x x86_fp80> %a to <2 x i32> + %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> + ret <4 x i32> %ext +} + +define <4 x i32> @fptosi_2f128_to_4i32(<2 x fp128> %a) nounwind { +; SSE-LABEL: fptosi_2f128_to_4i32: +; SSE: # %bb.0: +; SSE-NEXT: pushq %rbp +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: movq %rcx, %r14 +; SSE-NEXT: movq %rdx, %rbx +; SSE-NEXT: callq __fixtfsi +; SSE-NEXT: movl %eax, %ebp +; SSE-NEXT: movq %rbx, %rdi +; SSE-NEXT: movq %r14, %rsi +; SSE-NEXT: callq __fixtfsi +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: movd %ebp, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r14 +; SSE-NEXT: popq %rbp +; SSE-NEXT: retq +; +; AVX-LABEL: fptosi_2f128_to_4i32: +; AVX: # %bb.0: +; AVX-NEXT: pushq %rbp +; AVX-NEXT: pushq %r14 +; AVX-NEXT: pushq %rbx +; AVX-NEXT: movq %rcx, %r14 +; AVX-NEXT: movq %rdx, %rbx +; AVX-NEXT: callq __fixtfsi +; AVX-NEXT: movl %eax, %ebp +; AVX-NEXT: movq %rbx, %rdi +; AVX-NEXT: movq %r14, %rsi +; AVX-NEXT: callq __fixtfsi +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vmovd %ebp, %xmm1 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX-NEXT: popq %rbx +; AVX-NEXT: popq %r14 +; AVX-NEXT: popq %rbp +; AVX-NEXT: retq + %cvt = fptosi <2 x fp128> %a to <2 x i32> + %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> + ret <4 x i32> %ext +} + +define <2 x i8> @fptosi_2f32_to_2i8(<2 x float> %a) { +; SSE-LABEL: fptosi_2f32_to_2i8: +; SSE: # %bb.0: +; SSE-NEXT: cvttss2si %xmm0, %eax +; SSE-NEXT: movzbl %al, %eax +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE-NEXT: cvttss2si %xmm0, %ecx +; SSE-NEXT: shll $8, %ecx +; SSE-NEXT: orl %eax, %ecx +; SSE-NEXT: movd %ecx, %xmm0 +; SSE-NEXT: retq +; +; VEX-LABEL: fptosi_2f32_to_2i8: +; VEX: # %bb.0: +; VEX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; VEX-NEXT: vcvttss2si %xmm1, %eax +; VEX-NEXT: vcvttss2si %xmm0, %ecx +; VEX-NEXT: vmovd %ecx, %xmm0 +; VEX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; VEX-NEXT: retq +; +; AVX512-LABEL: fptosi_2f32_to_2i8: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512-NEXT: vcvttps2dq %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %cvt = fptosi <2 x float> %a to <2 x i8> + ret <2 x i8> %cvt +} + +define <2 x i16> @fptosi_2f32_to_2i16(<2 x float> %a) { +; SSE-LABEL: fptosi_2f32_to_2i16: +; SSE: # %bb.0: +; SSE-NEXT: cvttss2si %xmm0, %eax +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE-NEXT: cvttss2si %xmm0, %ecx +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: pinsrw $1, %ecx, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: fptosi_2f32_to_2i16: +; AVX1: # %bb.0: +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX1-NEXT: vcvttps2dq %ymm0, %ymm0 +; 
AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptosi_2f32_to_2i16: +; AVX2: # %bb.0: +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX2-NEXT: vcvttps2dq %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: fptosi_2f32_to_2i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512F-NEXT: vcvttps2dq %ymm0, %ymm0 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: fptosi_2f32_to_2i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512VL-NEXT: vcvttps2dq %ymm0, %ymm0 +; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: fptosi_2f32_to_2i16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512DQ-NEXT: vcvttps2dq %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: fptosi_2f32_to_2i16: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512VLDQ-NEXT: vcvttps2dq %ymm0, %ymm0 +; AVX512VLDQ-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512VLDQ-NEXT: vzeroupper +; AVX512VLDQ-NEXT: retq + %cvt = fptosi <2 x float> %a to <2 x i16> + ret <2 x i16> %cvt +} + +define <2 x i8> @fptoui_2f32_to_2i8(<2 x float> %a) { +; SSE-LABEL: fptoui_2f32_to_2i8: +; SSE: # %bb.0: +; SSE-NEXT: cvttss2si %xmm0, %eax +; SSE-NEXT: movzbl %al, %eax +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE-NEXT: cvttss2si %xmm0, %ecx +; SSE-NEXT: shll $8, %ecx +; SSE-NEXT: orl %eax, %ecx +; SSE-NEXT: movd %ecx, %xmm0 +; SSE-NEXT: retq +; +; VEX-LABEL: fptoui_2f32_to_2i8: +; VEX: # %bb.0: +; VEX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; VEX-NEXT: vcvttss2si %xmm1, %eax +; VEX-NEXT: vcvttss2si %xmm0, %ecx +; VEX-NEXT: vmovd %ecx, %xmm0 +; VEX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; VEX-NEXT: retq +; +; AVX512-LABEL: fptoui_2f32_to_2i8: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512-NEXT: vcvttps2dq %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %cvt = fptoui <2 x float> %a to <2 x i8> + ret <2 x i8> %cvt +} + +define <2 x i16> @fptoui_2f32_to_2i16(<2 x float> %a) { +; SSE-LABEL: fptoui_2f32_to_2i16: +; SSE: # %bb.0: +; SSE-NEXT: cvttss2si %xmm0, %eax +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE-NEXT: cvttss2si %xmm0, %ecx +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: pinsrw $1, %ecx, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: fptoui_2f32_to_2i16: +; AVX1: # %bb.0: +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX1-NEXT: vcvttps2dq %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptoui_2f32_to_2i16: +; AVX2: # %bb.0: +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX2-NEXT: vcvttps2dq %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: fptoui_2f32_to_2i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $xmm0 
killed $xmm0 def $ymm0 +; AVX512F-NEXT: vcvttps2dq %ymm0, %ymm0 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: fptoui_2f32_to_2i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512VL-NEXT: vcvttps2dq %ymm0, %ymm0 +; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: fptoui_2f32_to_2i16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512DQ-NEXT: vcvttps2dq %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: fptoui_2f32_to_2i16: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512VLDQ-NEXT: vcvttps2dq %ymm0, %ymm0 +; AVX512VLDQ-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512VLDQ-NEXT: vzeroupper +; AVX512VLDQ-NEXT: retq + %cvt = fptoui <2 x float> %a to <2 x i16> + ret <2 x i16> %cvt +} + +define <2 x i8> @fptosi_2f64_to_2i8(<2 x double> %a) { +; SSE-LABEL: fptosi_2f64_to_2i8: +; SSE: # %bb.0: +; SSE-NEXT: cvttsd2si %xmm0, %eax +; SSE-NEXT: movzbl %al, %eax +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: cvttsd2si %xmm0, %ecx +; SSE-NEXT: shll $8, %ecx +; SSE-NEXT: orl %eax, %ecx +; SSE-NEXT: movd %ecx, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: fptosi_2f64_to_2i8: +; AVX: # %bb.0: +; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vcvttsd2si %xmm1, %eax +; AVX-NEXT: vcvttsd2si %xmm0, %ecx +; AVX-NEXT: vmovd %ecx, %xmm0 +; AVX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX-NEXT: retq + %cvt = fptosi <2 x double> %a to <2 x i8> + ret <2 x i8> %cvt +} + +define <2 x i16> @fptosi_2f64_to_2i16(<2 x double> %a) { +; SSE-LABEL: fptosi_2f64_to_2i16: +; SSE: # %bb.0: +; SSE-NEXT: cvttsd2si %xmm0, %eax +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: cvttsd2si %xmm0, %ecx +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: pinsrw $1, %ecx, %xmm0 +; SSE-NEXT: retq +; +; VEX-LABEL: fptosi_2f64_to_2i16: +; VEX: # %bb.0: +; VEX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; VEX-NEXT: vcvttsd2si %xmm1, %eax +; VEX-NEXT: vcvttsd2si %xmm0, %ecx +; VEX-NEXT: vmovd %ecx, %xmm0 +; VEX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 +; VEX-NEXT: retq +; +; AVX512F-LABEL: fptosi_2f64_to_2i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vcvttpd2dq %zmm0, %ymm0 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: fptosi_2f64_to_2i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512VL-NEXT: vcvttpd2dq %zmm0, %ymm0 +; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: fptosi_2f64_to_2i16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512DQ-NEXT: vcvttpd2dq %zmm0, %ymm0 +; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: fptosi_2f64_to_2i16: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512VLDQ-NEXT: vcvttpd2dq %zmm0, %ymm0 +; AVX512VLDQ-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512VLDQ-NEXT: vzeroupper +; 
AVX512VLDQ-NEXT: retq + %cvt = fptosi <2 x double> %a to <2 x i16> + ret <2 x i16> %cvt +} + +define <2 x i8> @fptoui_2f64_to_2i8(<2 x double> %a) { +; SSE-LABEL: fptoui_2f64_to_2i8: +; SSE: # %bb.0: +; SSE-NEXT: cvttsd2si %xmm0, %eax +; SSE-NEXT: movzbl %al, %eax +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: cvttsd2si %xmm0, %ecx +; SSE-NEXT: shll $8, %ecx +; SSE-NEXT: orl %eax, %ecx +; SSE-NEXT: movd %ecx, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: fptoui_2f64_to_2i8: +; AVX: # %bb.0: +; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vcvttsd2si %xmm1, %eax +; AVX-NEXT: vcvttsd2si %xmm0, %ecx +; AVX-NEXT: vmovd %ecx, %xmm0 +; AVX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX-NEXT: retq + %cvt = fptoui <2 x double> %a to <2 x i8> + ret <2 x i8> %cvt +} + +define <2 x i16> @fptoui_2f64_to_2i16(<2 x double> %a) { +; SSE-LABEL: fptoui_2f64_to_2i16: +; SSE: # %bb.0: +; SSE-NEXT: cvttsd2si %xmm0, %eax +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: cvttsd2si %xmm0, %ecx +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: pinsrw $1, %ecx, %xmm0 +; SSE-NEXT: retq +; +; VEX-LABEL: fptoui_2f64_to_2i16: +; VEX: # %bb.0: +; VEX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; VEX-NEXT: vcvttsd2si %xmm1, %eax +; VEX-NEXT: vcvttsd2si %xmm0, %ecx +; VEX-NEXT: vmovd %ecx, %xmm0 +; VEX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 +; VEX-NEXT: retq +; +; AVX512F-LABEL: fptoui_2f64_to_2i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vcvttpd2dq %zmm0, %ymm0 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: fptoui_2f64_to_2i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512VL-NEXT: vcvttpd2dq %zmm0, %ymm0 +; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: fptoui_2f64_to_2i16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512DQ-NEXT: vcvttpd2dq %zmm0, %ymm0 +; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: fptoui_2f64_to_2i16: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512VLDQ-NEXT: vcvttpd2dq %zmm0, %ymm0 +; AVX512VLDQ-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512VLDQ-NEXT: vzeroupper +; AVX512VLDQ-NEXT: retq + %cvt = fptoui <2 x double> %a to <2 x i16> + ret <2 x i16> %cvt +} diff --git a/llvm/test/CodeGen/X86/vec_fp_to_int.ll b/llvm/test/CodeGen/X86/vec_fp_to_int.ll index 651c0e65aa05..fd8e4c41b253 100644 --- a/llvm/test/CodeGen/X86/vec_fp_to_int.ll +++ b/llvm/test/CodeGen/X86/vec_fp_to_int.ll @@ -6,8 +6,6 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512VL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512DQ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512VLDQ -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=WIDEN --check-prefix=WIDEN_SKX -; RUN: llc < %s 
-mtriple=x86_64-unknown-unknown -mattr=+avx512f -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=WIDEN --check-prefix=WIDEN_KNL ; ; 32-bit tests to make sure we're not doing anything stupid. ; RUN: llc < %s -mtriple=i686-unknown-unknown @@ -72,21 +70,6 @@ define <2 x i64> @fptosi_2f64_to_2i64(<2 x double> %a) { ; AVX512VLDQ: # %bb.0: ; AVX512VLDQ-NEXT: vcvttpd2qq %xmm0, %xmm0 ; AVX512VLDQ-NEXT: retq -; -; WIDEN_SKX-LABEL: fptosi_2f64_to_2i64: -; WIDEN_SKX: # %bb.0: -; WIDEN_SKX-NEXT: vcvttpd2qq %xmm0, %xmm0 -; WIDEN_SKX-NEXT: retq -; -; WIDEN_KNL-LABEL: fptosi_2f64_to_2i64: -; WIDEN_KNL: # %bb.0: -; WIDEN_KNL-NEXT: vcvttsd2si %xmm0, %rax -; WIDEN_KNL-NEXT: vmovq %rax, %xmm1 -; WIDEN_KNL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; WIDEN_KNL-NEXT: vcvttsd2si %xmm0, %rax -; WIDEN_KNL-NEXT: vmovq %rax, %xmm0 -; WIDEN_KNL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; WIDEN_KNL-NEXT: retq %cvt = fptosi <2 x double> %a to <2 x i64> ret <2 x i64> %cvt } @@ -101,11 +84,6 @@ define <4 x i32> @fptosi_2f64_to_4i32(<2 x double> %a) { ; AVX: # %bb.0: ; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 ; AVX-NEXT: retq -; -; WIDEN-LABEL: fptosi_2f64_to_4i32: -; WIDEN: # %bb.0: -; WIDEN-NEXT: vcvttpd2dq %xmm0, %xmm0 -; WIDEN-NEXT: retq %cvt = fptosi <2 x double> %a to <2 x i32> %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> ret <4 x i32> %ext @@ -123,11 +101,6 @@ define <2 x i32> @fptosi_2f64_to_2i32(<2 x double> %a) { ; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX-NEXT: retq -; -; WIDEN-LABEL: fptosi_2f64_to_2i32: -; WIDEN: # %bb.0: -; WIDEN-NEXT: vcvttpd2dq %xmm0, %xmm0 -; WIDEN-NEXT: retq %cvt = fptosi <2 x double> %a to <2 x i32> ret <2 x i32> %cvt } @@ -146,13 +119,6 @@ define <4 x i32> @fptosi_4f64_to_2i32(<2 x double> %a) { ; AVX-NEXT: vcvttpd2dq %ymm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq -; -; WIDEN-LABEL: fptosi_4f64_to_2i32: -; WIDEN: # %bb.0: -; WIDEN-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; WIDEN-NEXT: vcvttpd2dq %ymm0, %xmm0 -; WIDEN-NEXT: vzeroupper -; WIDEN-NEXT: retq %ext = shufflevector <2 x double> %a, <2 x double> undef, <4 x i32> %cvt = fptosi <4 x double> %ext to <4 x i32> ret <4 x i32> %cvt @@ -260,29 +226,6 @@ define <4 x i64> @fptosi_4f64_to_4i64(<4 x double> %a) { ; AVX512VLDQ: # %bb.0: ; AVX512VLDQ-NEXT: vcvttpd2qq %ymm0, %ymm0 ; AVX512VLDQ-NEXT: retq -; -; WIDEN_SKX-LABEL: fptosi_4f64_to_4i64: -; WIDEN_SKX: # %bb.0: -; WIDEN_SKX-NEXT: vcvttpd2qq %ymm0, %ymm0 -; WIDEN_SKX-NEXT: retq -; -; WIDEN_KNL-LABEL: fptosi_4f64_to_4i64: -; WIDEN_KNL: # %bb.0: -; WIDEN_KNL-NEXT: vextractf128 $1, %ymm0, %xmm1 -; WIDEN_KNL-NEXT: vcvttsd2si %xmm1, %rax -; WIDEN_KNL-NEXT: vmovq %rax, %xmm2 -; WIDEN_KNL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] -; WIDEN_KNL-NEXT: vcvttsd2si %xmm1, %rax -; WIDEN_KNL-NEXT: vmovq %rax, %xmm1 -; WIDEN_KNL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; WIDEN_KNL-NEXT: vcvttsd2si %xmm0, %rax -; WIDEN_KNL-NEXT: vmovq %rax, %xmm2 -; WIDEN_KNL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; WIDEN_KNL-NEXT: vcvttsd2si %xmm0, %rax -; WIDEN_KNL-NEXT: vmovq %rax, %xmm0 -; WIDEN_KNL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; WIDEN_KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; WIDEN_KNL-NEXT: retq %cvt = fptosi <4 x double> %a to <4 x i64> ret <4 x i64> %cvt } @@ -300,12 +243,6 @@ define <4 x i32> @fptosi_4f64_to_4i32(<4 x double> %a) { ; AVX-NEXT: vcvttpd2dq %ymm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq -; -; WIDEN-LABEL: 
fptosi_4f64_to_4i32: -; WIDEN: # %bb.0: -; WIDEN-NEXT: vcvttpd2dq %ymm0, %xmm0 -; WIDEN-NEXT: vzeroupper -; WIDEN-NEXT: retq %cvt = fptosi <4 x double> %a to <4 x i32> ret <4 x i32> %cvt } @@ -394,21 +331,6 @@ define <2 x i64> @fptoui_2f64_to_2i64(<2 x double> %a) { ; AVX512VLDQ: # %bb.0: ; AVX512VLDQ-NEXT: vcvttpd2uqq %xmm0, %xmm0 ; AVX512VLDQ-NEXT: retq -; -; WIDEN_SKX-LABEL: fptoui_2f64_to_2i64: -; WIDEN_SKX: # %bb.0: -; WIDEN_SKX-NEXT: vcvttpd2uqq %xmm0, %xmm0 -; WIDEN_SKX-NEXT: retq -; -; WIDEN_KNL-LABEL: fptoui_2f64_to_2i64: -; WIDEN_KNL: # %bb.0: -; WIDEN_KNL-NEXT: vcvttsd2usi %xmm0, %rax -; WIDEN_KNL-NEXT: vmovq %rax, %xmm1 -; WIDEN_KNL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; WIDEN_KNL-NEXT: vcvttsd2usi %xmm0, %rax -; WIDEN_KNL-NEXT: vmovq %rax, %xmm0 -; WIDEN_KNL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; WIDEN_KNL-NEXT: retq %cvt = fptoui <2 x double> %a to <2 x i64> ret <2 x i64> %cvt } @@ -489,19 +411,6 @@ define <4 x i32> @fptoui_2f64_to_4i32(<2 x double> %a) { ; AVX512VLDQ: # %bb.0: ; AVX512VLDQ-NEXT: vcvttpd2udq %xmm0, %xmm0 ; AVX512VLDQ-NEXT: retq -; -; WIDEN_SKX-LABEL: fptoui_2f64_to_4i32: -; WIDEN_SKX: # %bb.0: -; WIDEN_SKX-NEXT: vcvttpd2udq %xmm0, %xmm0 -; WIDEN_SKX-NEXT: retq -; -; WIDEN_KNL-LABEL: fptoui_2f64_to_4i32: -; WIDEN_KNL: # %bb.0: -; WIDEN_KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; WIDEN_KNL-NEXT: vcvttpd2udq %zmm0, %ymm0 -; WIDEN_KNL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; WIDEN_KNL-NEXT: vzeroupper -; WIDEN_KNL-NEXT: retq %cvt = fptoui <2 x double> %a to <2 x i32> %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> ret <4 x i32> %ext @@ -581,19 +490,6 @@ define <4 x i32> @fptoui_2f64_to_2i32(<2 x double> %a) { ; AVX512VLDQ: # %bb.0: ; AVX512VLDQ-NEXT: vcvttpd2udq %xmm0, %xmm0 ; AVX512VLDQ-NEXT: retq -; -; WIDEN_SKX-LABEL: fptoui_2f64_to_2i32: -; WIDEN_SKX: # %bb.0: -; WIDEN_SKX-NEXT: vcvttpd2udq %xmm0, %xmm0 -; WIDEN_SKX-NEXT: retq -; -; WIDEN_KNL-LABEL: fptoui_2f64_to_2i32: -; WIDEN_KNL: # %bb.0: -; WIDEN_KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; WIDEN_KNL-NEXT: vcvttpd2udq %zmm0, %ymm0 -; WIDEN_KNL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; WIDEN_KNL-NEXT: vzeroupper -; WIDEN_KNL-NEXT: retq %cvt = fptoui <2 x double> %a to <2 x i32> %ext = shufflevector <2 x i32> %cvt, <2 x i32> undef, <4 x i32> ret <4 x i32> %ext @@ -687,21 +583,6 @@ define <4 x i32> @fptoui_4f64_to_2i32(<2 x double> %a) { ; AVX512VLDQ-NEXT: vcvttpd2udq %ymm0, %xmm0 ; AVX512VLDQ-NEXT: vzeroupper ; AVX512VLDQ-NEXT: retq -; -; WIDEN_SKX-LABEL: fptoui_4f64_to_2i32: -; WIDEN_SKX: # %bb.0: -; WIDEN_SKX-NEXT: vmovaps %xmm0, %xmm0 -; WIDEN_SKX-NEXT: vcvttpd2udq %ymm0, %xmm0 -; WIDEN_SKX-NEXT: vzeroupper -; WIDEN_SKX-NEXT: retq -; -; WIDEN_KNL-LABEL: fptoui_4f64_to_2i32: -; WIDEN_KNL: # %bb.0: -; WIDEN_KNL-NEXT: vmovaps %xmm0, %xmm0 -; WIDEN_KNL-NEXT: vcvttpd2udq %zmm0, %ymm0 -; WIDEN_KNL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; WIDEN_KNL-NEXT: vzeroupper -; WIDEN_KNL-NEXT: retq %ext = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <4 x i32> %cvt = fptoui <4 x double> %ext to <4 x i32> ret <4 x i32> %cvt @@ -878,29 +759,6 @@ define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) { ; AVX512VLDQ: # %bb.0: ; AVX512VLDQ-NEXT: vcvttpd2uqq %ymm0, %ymm0 ; AVX512VLDQ-NEXT: retq -; -; WIDEN_SKX-LABEL: fptoui_4f64_to_4i64: -; WIDEN_SKX: # %bb.0: -; WIDEN_SKX-NEXT: vcvttpd2uqq %ymm0, %ymm0 -; WIDEN_SKX-NEXT: retq -; -; WIDEN_KNL-LABEL: fptoui_4f64_to_4i64: -; WIDEN_KNL: # %bb.0: -; WIDEN_KNL-NEXT: 
vextractf128 $1, %ymm0, %xmm1 -; WIDEN_KNL-NEXT: vcvttsd2usi %xmm1, %rax -; WIDEN_KNL-NEXT: vmovq %rax, %xmm2 -; WIDEN_KNL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] -; WIDEN_KNL-NEXT: vcvttsd2usi %xmm1, %rax -; WIDEN_KNL-NEXT: vmovq %rax, %xmm1 -; WIDEN_KNL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; WIDEN_KNL-NEXT: vcvttsd2usi %xmm0, %rax -; WIDEN_KNL-NEXT: vmovq %rax, %xmm2 -; WIDEN_KNL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; WIDEN_KNL-NEXT: vcvttsd2usi %xmm0, %rax -; WIDEN_KNL-NEXT: vmovq %rax, %xmm0 -; WIDEN_KNL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; WIDEN_KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; WIDEN_KNL-NEXT: retq %cvt = fptoui <4 x double> %a to <4 x i64> ret <4 x i64> %cvt } @@ -1006,20 +864,6 @@ define <4 x i32> @fptoui_4f64_to_4i32(<4 x double> %a) { ; AVX512VLDQ-NEXT: vcvttpd2udq %ymm0, %xmm0 ; AVX512VLDQ-NEXT: vzeroupper ; AVX512VLDQ-NEXT: retq -; -; WIDEN_SKX-LABEL: fptoui_4f64_to_4i32: -; WIDEN_SKX: # %bb.0: -; WIDEN_SKX-NEXT: vcvttpd2udq %ymm0, %xmm0 -; WIDEN_SKX-NEXT: vzeroupper -; WIDEN_SKX-NEXT: retq -; -; WIDEN_KNL-LABEL: fptoui_4f64_to_4i32: -; WIDEN_KNL: # %bb.0: -; WIDEN_KNL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; WIDEN_KNL-NEXT: vcvttpd2udq %zmm0, %ymm0 -; WIDEN_KNL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; WIDEN_KNL-NEXT: vzeroupper -; WIDEN_KNL-NEXT: retq %cvt = fptoui <4 x double> %a to <4 x i32> ret <4 x i32> %cvt } @@ -1040,11 +884,6 @@ define <2 x i32> @fptosi_2f32_to_2i32(<2 x float> %a) { ; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX-NEXT: retq -; -; WIDEN-LABEL: fptosi_2f32_to_2i32: -; WIDEN: # %bb.0: -; WIDEN-NEXT: vcvttps2dq %xmm0, %xmm0 -; WIDEN-NEXT: retq %cvt = fptosi <2 x float> %a to <2 x i32> ret <2 x i32> %cvt } @@ -1059,11 +898,6 @@ define <4 x i32> @fptosi_4f32_to_4i32(<4 x float> %a) { ; AVX: # %bb.0: ; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 ; AVX-NEXT: retq -; -; WIDEN-LABEL: fptosi_4f32_to_4i32: -; WIDEN: # %bb.0: -; WIDEN-NEXT: vcvttps2dq %xmm0, %xmm0 -; WIDEN-NEXT: retq %cvt = fptosi <4 x float> %a to <4 x i32> ret <4 x i32> %cvt } @@ -1122,21 +956,6 @@ define <2 x i64> @fptosi_2f32_to_2i64(<4 x float> %a) { ; AVX512VLDQ: # %bb.0: ; AVX512VLDQ-NEXT: vcvttps2qq %xmm0, %xmm0 ; AVX512VLDQ-NEXT: retq -; -; WIDEN_SKX-LABEL: fptosi_2f32_to_2i64: -; WIDEN_SKX: # %bb.0: -; WIDEN_SKX-NEXT: vcvttps2qq %xmm0, %xmm0 -; WIDEN_SKX-NEXT: retq -; -; WIDEN_KNL-LABEL: fptosi_2f32_to_2i64: -; WIDEN_KNL: # %bb.0: -; WIDEN_KNL-NEXT: vcvttss2si %xmm0, %rax -; WIDEN_KNL-NEXT: vmovq %rax, %xmm1 -; WIDEN_KNL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; WIDEN_KNL-NEXT: vcvttss2si %xmm0, %rax -; WIDEN_KNL-NEXT: vmovq %rax, %xmm0 -; WIDEN_KNL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; WIDEN_KNL-NEXT: retq %shuf = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> %cvt = fptosi <2 x float> %shuf to <2 x i64> ret <2 x i64> %cvt @@ -1198,23 +1017,6 @@ define <2 x i64> @fptosi_4f32_to_2i64(<4 x float> %a) { ; AVX512VLDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512VLDQ-NEXT: vzeroupper ; AVX512VLDQ-NEXT: retq -; -; WIDEN_SKX-LABEL: fptosi_4f32_to_2i64: -; WIDEN_SKX: # %bb.0: -; WIDEN_SKX-NEXT: vcvttps2qq %xmm0, %ymm0 -; WIDEN_SKX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; WIDEN_SKX-NEXT: vzeroupper -; WIDEN_SKX-NEXT: retq -; -; WIDEN_KNL-LABEL: fptosi_4f32_to_2i64: -; WIDEN_KNL: # %bb.0: -; WIDEN_KNL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; WIDEN_KNL-NEXT: vcvttss2si %xmm1, %rax -; WIDEN_KNL-NEXT: vcvttss2si 
%xmm0, %rcx -; WIDEN_KNL-NEXT: vmovq %rcx, %xmm0 -; WIDEN_KNL-NEXT: vmovq %rax, %xmm1 -; WIDEN_KNL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; WIDEN_KNL-NEXT: retq %cvt = fptosi <4 x float> %a to <4 x i64> %shuf = shufflevector <4 x i64> %cvt, <4 x i64> undef, <2 x i32> ret <2 x i64> %shuf @@ -1231,11 +1033,6 @@ define <8 x i32> @fptosi_8f32_to_8i32(<8 x float> %a) { ; AVX: # %bb.0: ; AVX-NEXT: vcvttps2dq %ymm0, %ymm0 ; AVX-NEXT: retq -; -; WIDEN-LABEL: fptosi_8f32_to_8i32: -; WIDEN: # %bb.0: -; WIDEN-NEXT: vcvttps2dq %ymm0, %ymm0 -; WIDEN-NEXT: retq %cvt = fptosi <8 x float> %a to <8 x i32> ret <8 x i32> %cvt } @@ -1343,29 +1140,6 @@ define <4 x i64> @fptosi_4f32_to_4i64(<8 x float> %a) { ; AVX512VLDQ: # %bb.0: ; AVX512VLDQ-NEXT: vcvttps2qq %xmm0, %ymm0 ; AVX512VLDQ-NEXT: retq -; -; WIDEN_SKX-LABEL: fptosi_4f32_to_4i64: -; WIDEN_SKX: # %bb.0: -; WIDEN_SKX-NEXT: vcvttps2qq %xmm0, %ymm0 -; WIDEN_SKX-NEXT: retq -; -; WIDEN_KNL-LABEL: fptosi_4f32_to_4i64: -; WIDEN_KNL: # %bb.0: -; WIDEN_KNL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; WIDEN_KNL-NEXT: vcvttss2si %xmm1, %rax -; WIDEN_KNL-NEXT: vmovq %rax, %xmm1 -; WIDEN_KNL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; WIDEN_KNL-NEXT: vcvttss2si %xmm2, %rax -; WIDEN_KNL-NEXT: vmovq %rax, %xmm2 -; WIDEN_KNL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; WIDEN_KNL-NEXT: vcvttss2si %xmm0, %rax -; WIDEN_KNL-NEXT: vmovq %rax, %xmm2 -; WIDEN_KNL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; WIDEN_KNL-NEXT: vcvttss2si %xmm0, %rax -; WIDEN_KNL-NEXT: vmovq %rax, %xmm0 -; WIDEN_KNL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; WIDEN_KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; WIDEN_KNL-NEXT: retq %shuf = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> %cvt = fptosi <4 x float> %shuf to <4 x i64> ret <4 x i64> %cvt @@ -1475,30 +1249,6 @@ define <4 x i64> @fptosi_8f32_to_4i64(<8 x float> %a) { ; AVX512VLDQ-NEXT: vcvttps2qq %ymm0, %zmm0 ; AVX512VLDQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512VLDQ-NEXT: retq -; -; WIDEN_SKX-LABEL: fptosi_8f32_to_4i64: -; WIDEN_SKX: # %bb.0: -; WIDEN_SKX-NEXT: vcvttps2qq %ymm0, %zmm0 -; WIDEN_SKX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; WIDEN_SKX-NEXT: retq -; -; WIDEN_KNL-LABEL: fptosi_8f32_to_4i64: -; WIDEN_KNL: # %bb.0: -; WIDEN_KNL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; WIDEN_KNL-NEXT: vcvttss2si %xmm1, %rax -; WIDEN_KNL-NEXT: vcvttss2si %xmm0, %rcx -; WIDEN_KNL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; WIDEN_KNL-NEXT: vcvttss2si %xmm1, %rdx -; WIDEN_KNL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] -; WIDEN_KNL-NEXT: vcvttss2si %xmm0, %rsi -; WIDEN_KNL-NEXT: vmovq %rsi, %xmm0 -; WIDEN_KNL-NEXT: vmovq %rdx, %xmm1 -; WIDEN_KNL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; WIDEN_KNL-NEXT: vmovq %rcx, %xmm1 -; WIDEN_KNL-NEXT: vmovq %rax, %xmm2 -; WIDEN_KNL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; WIDEN_KNL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; WIDEN_KNL-NEXT: retq %cvt = fptosi <8 x float> %a to <8 x i64> %shuf = shufflevector <8 x i64> %cvt, <8 x i64> undef, <4 x i32> ret <4 x i64> %shuf @@ -1583,19 +1333,6 @@ define <2 x i32> @fptoui_2f32_to_2i32(<2 x float> %a) { ; AVX512VLDQ-NEXT: vcvttps2udq %xmm0, %xmm0 ; AVX512VLDQ-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX512VLDQ-NEXT: retq -; -; WIDEN_SKX-LABEL: fptoui_2f32_to_2i32: -; WIDEN_SKX: # %bb.0: -; WIDEN_SKX-NEXT: vcvttps2udq %xmm0, %xmm0 -; WIDEN_SKX-NEXT: retq -; -; WIDEN_KNL-LABEL: fptoui_2f32_to_2i32: -; WIDEN_KNL: # %bb.0: -; 
WIDEN_KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; WIDEN_KNL-NEXT: vcvttps2udq %zmm0, %zmm0 -; WIDEN_KNL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; WIDEN_KNL-NEXT: vzeroupper -; WIDEN_KNL-NEXT: retq %cvt = fptoui <2 x float> %a to <2 x i32> ret <2 x i32> %cvt } @@ -1664,19 +1401,6 @@ define <4 x i32> @fptoui_4f32_to_4i32(<4 x float> %a) { ; AVX512VLDQ: # %bb.0: ; AVX512VLDQ-NEXT: vcvttps2udq %xmm0, %xmm0 ; AVX512VLDQ-NEXT: retq -; -; WIDEN_SKX-LABEL: fptoui_4f32_to_4i32: -; WIDEN_SKX: # %bb.0: -; WIDEN_SKX-NEXT: vcvttps2udq %xmm0, %xmm0 -; WIDEN_SKX-NEXT: retq -; -; WIDEN_KNL-LABEL: fptoui_4f32_to_4i32: -; WIDEN_KNL: # %bb.0: -; WIDEN_KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; WIDEN_KNL-NEXT: vcvttps2udq %zmm0, %zmm0 -; WIDEN_KNL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; WIDEN_KNL-NEXT: vzeroupper -; WIDEN_KNL-NEXT: retq %cvt = fptoui <4 x float> %a to <4 x i32> ret <4 x i32> %cvt } @@ -1761,21 +1485,6 @@ define <2 x i64> @fptoui_2f32_to_2i64(<4 x float> %a) { ; AVX512VLDQ: # %bb.0: ; AVX512VLDQ-NEXT: vcvttps2uqq %xmm0, %xmm0 ; AVX512VLDQ-NEXT: retq -; -; WIDEN_SKX-LABEL: fptoui_2f32_to_2i64: -; WIDEN_SKX: # %bb.0: -; WIDEN_SKX-NEXT: vcvttps2uqq %xmm0, %xmm0 -; WIDEN_SKX-NEXT: retq -; -; WIDEN_KNL-LABEL: fptoui_2f32_to_2i64: -; WIDEN_KNL: # %bb.0: -; WIDEN_KNL-NEXT: vcvttss2usi %xmm0, %rax -; WIDEN_KNL-NEXT: vmovq %rax, %xmm1 -; WIDEN_KNL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; WIDEN_KNL-NEXT: vcvttss2usi %xmm0, %rax -; WIDEN_KNL-NEXT: vmovq %rax, %xmm0 -; WIDEN_KNL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; WIDEN_KNL-NEXT: retq %shuf = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> %cvt = fptoui <2 x float> %shuf to <2 x i64> ret <2 x i64> %cvt @@ -1863,23 +1572,6 @@ define <2 x i64> @fptoui_4f32_to_2i64(<4 x float> %a) { ; AVX512VLDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512VLDQ-NEXT: vzeroupper ; AVX512VLDQ-NEXT: retq -; -; WIDEN_SKX-LABEL: fptoui_4f32_to_2i64: -; WIDEN_SKX: # %bb.0: -; WIDEN_SKX-NEXT: vcvttps2uqq %xmm0, %ymm0 -; WIDEN_SKX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; WIDEN_SKX-NEXT: vzeroupper -; WIDEN_SKX-NEXT: retq -; -; WIDEN_KNL-LABEL: fptoui_4f32_to_2i64: -; WIDEN_KNL: # %bb.0: -; WIDEN_KNL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; WIDEN_KNL-NEXT: vcvttss2usi %xmm1, %rax -; WIDEN_KNL-NEXT: vcvttss2usi %xmm0, %rcx -; WIDEN_KNL-NEXT: vmovq %rcx, %xmm0 -; WIDEN_KNL-NEXT: vmovq %rax, %xmm1 -; WIDEN_KNL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; WIDEN_KNL-NEXT: retq %cvt = fptoui <4 x float> %a to <4 x i64> %shuf = shufflevector <4 x i64> %cvt, <4 x i64> undef, <2 x i32> ret <2 x i64> %shuf @@ -1958,18 +1650,6 @@ define <8 x i32> @fptoui_8f32_to_8i32(<8 x float> %a) { ; AVX512VLDQ: # %bb.0: ; AVX512VLDQ-NEXT: vcvttps2udq %ymm0, %ymm0 ; AVX512VLDQ-NEXT: retq -; -; WIDEN_SKX-LABEL: fptoui_8f32_to_8i32: -; WIDEN_SKX: # %bb.0: -; WIDEN_SKX-NEXT: vcvttps2udq %ymm0, %ymm0 -; WIDEN_SKX-NEXT: retq -; -; WIDEN_KNL-LABEL: fptoui_8f32_to_8i32: -; WIDEN_KNL: # %bb.0: -; WIDEN_KNL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; WIDEN_KNL-NEXT: vcvttps2udq %zmm0, %zmm0 -; WIDEN_KNL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; WIDEN_KNL-NEXT: retq %cvt = fptoui <8 x float> %a to <8 x i32> ret <8 x i32> %cvt } @@ -2147,29 +1827,6 @@ define <4 x i64> @fptoui_4f32_to_4i64(<8 x float> %a) { ; AVX512VLDQ: # %bb.0: ; AVX512VLDQ-NEXT: vcvttps2uqq %xmm0, %ymm0 ; AVX512VLDQ-NEXT: retq -; -; WIDEN_SKX-LABEL: fptoui_4f32_to_4i64: -; WIDEN_SKX: # %bb.0: -; WIDEN_SKX-NEXT: 
vcvttps2uqq %xmm0, %ymm0 -; WIDEN_SKX-NEXT: retq -; -; WIDEN_KNL-LABEL: fptoui_4f32_to_4i64: -; WIDEN_KNL: # %bb.0: -; WIDEN_KNL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; WIDEN_KNL-NEXT: vcvttss2usi %xmm1, %rax -; WIDEN_KNL-NEXT: vmovq %rax, %xmm1 -; WIDEN_KNL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; WIDEN_KNL-NEXT: vcvttss2usi %xmm2, %rax -; WIDEN_KNL-NEXT: vmovq %rax, %xmm2 -; WIDEN_KNL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; WIDEN_KNL-NEXT: vcvttss2usi %xmm0, %rax -; WIDEN_KNL-NEXT: vmovq %rax, %xmm2 -; WIDEN_KNL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; WIDEN_KNL-NEXT: vcvttss2usi %xmm0, %rax -; WIDEN_KNL-NEXT: vmovq %rax, %xmm0 -; WIDEN_KNL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; WIDEN_KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; WIDEN_KNL-NEXT: retq %shuf = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> %cvt = fptoui <4 x float> %shuf to <4 x i64> ret <4 x i64> %cvt @@ -2349,30 +2006,6 @@ define <4 x i64> @fptoui_8f32_to_4i64(<8 x float> %a) { ; AVX512VLDQ-NEXT: vcvttps2uqq %ymm0, %zmm0 ; AVX512VLDQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512VLDQ-NEXT: retq -; -; WIDEN_SKX-LABEL: fptoui_8f32_to_4i64: -; WIDEN_SKX: # %bb.0: -; WIDEN_SKX-NEXT: vcvttps2uqq %ymm0, %zmm0 -; WIDEN_SKX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; WIDEN_SKX-NEXT: retq -; -; WIDEN_KNL-LABEL: fptoui_8f32_to_4i64: -; WIDEN_KNL: # %bb.0: -; WIDEN_KNL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; WIDEN_KNL-NEXT: vcvttss2usi %xmm1, %rax -; WIDEN_KNL-NEXT: vcvttss2usi %xmm0, %rcx -; WIDEN_KNL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; WIDEN_KNL-NEXT: vcvttss2usi %xmm1, %rdx -; WIDEN_KNL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] -; WIDEN_KNL-NEXT: vcvttss2usi %xmm0, %rsi -; WIDEN_KNL-NEXT: vmovq %rsi, %xmm0 -; WIDEN_KNL-NEXT: vmovq %rdx, %xmm1 -; WIDEN_KNL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; WIDEN_KNL-NEXT: vmovq %rcx, %xmm1 -; WIDEN_KNL-NEXT: vmovq %rax, %xmm2 -; WIDEN_KNL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; WIDEN_KNL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; WIDEN_KNL-NEXT: retq %cvt = fptoui <8 x float> %a to <8 x i64> %shuf = shufflevector <8 x i64> %cvt, <8 x i64> undef, <4 x i32> ret <4 x i64> %shuf @@ -2392,11 +2025,6 @@ define <2 x i64> @fptosi_2f64_to_2i64_const() { ; AVX: # %bb.0: ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,18446744073709551615] ; AVX-NEXT: retq -; -; WIDEN-LABEL: fptosi_2f64_to_2i64_const: -; WIDEN: # %bb.0: -; WIDEN-NEXT: vmovaps {{.*#+}} xmm0 = [1,18446744073709551615] -; WIDEN-NEXT: retq %cvt = fptosi <2 x double> to <2 x i64> ret <2 x i64> %cvt } @@ -2411,11 +2039,6 @@ define <4 x i32> @fptosi_2f64_to_2i32_const() { ; AVX: # %bb.0: ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = <4294967295,1,u,u> ; AVX-NEXT: retq -; -; WIDEN-LABEL: fptosi_2f64_to_2i32_const: -; WIDEN: # %bb.0: -; WIDEN-NEXT: vmovaps {{.*#+}} xmm0 = <4294967295,1,u,u> -; WIDEN-NEXT: retq %cvt = fptosi <2 x double> to <2 x i32> %ext = shufflevector <2 x i32> %cvt, <2 x i32> undef, <4 x i32> ret <4 x i32> %ext @@ -2432,11 +2055,6 @@ define <4 x i64> @fptosi_4f64_to_4i64_const() { ; AVX: # %bb.0: ; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,18446744073709551615,2,18446744073709551613] ; AVX-NEXT: retq -; -; WIDEN-LABEL: fptosi_4f64_to_4i64_const: -; WIDEN: # %bb.0: -; WIDEN-NEXT: vmovaps {{.*#+}} ymm0 = [1,18446744073709551615,2,18446744073709551613] -; WIDEN-NEXT: retq %cvt = fptosi <4 x double> to <4 x i64> ret <4 x i64> %cvt } @@ -2451,11 +2069,6 @@ define <4 x i32> @fptosi_4f64_to_4i32_const() { ; AVX: # %bb.0: ; 
AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,1,4294967294,3] ; AVX-NEXT: retq -; -; WIDEN-LABEL: fptosi_4f64_to_4i32_const: -; WIDEN: # %bb.0: -; WIDEN-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,1,4294967294,3] -; WIDEN-NEXT: retq %cvt = fptosi <4 x double> to <4 x i32> ret <4 x i32> %cvt } @@ -2470,11 +2083,6 @@ define <2 x i64> @fptoui_2f64_to_2i64_const() { ; AVX: # %bb.0: ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [2,4] ; AVX-NEXT: retq -; -; WIDEN-LABEL: fptoui_2f64_to_2i64_const: -; WIDEN: # %bb.0: -; WIDEN-NEXT: vmovaps {{.*#+}} xmm0 = [2,4] -; WIDEN-NEXT: retq %cvt = fptoui <2 x double> to <2 x i64> ret <2 x i64> %cvt } @@ -2489,11 +2097,6 @@ define <4 x i32> @fptoui_2f64_to_2i32_const(<2 x double> %a) { ; AVX: # %bb.0: ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = <2,4,u,u> ; AVX-NEXT: retq -; -; WIDEN-LABEL: fptoui_2f64_to_2i32_const: -; WIDEN: # %bb.0: -; WIDEN-NEXT: vmovaps {{.*#+}} xmm0 = <2,4,u,u> -; WIDEN-NEXT: retq %cvt = fptoui <2 x double> to <2 x i32> %ext = shufflevector <2 x i32> %cvt, <2 x i32> undef, <4 x i32> ret <4 x i32> %ext @@ -2510,11 +2113,6 @@ define <4 x i64> @fptoui_4f64_to_4i64_const(<4 x double> %a) { ; AVX: # %bb.0: ; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [2,4,6,8] ; AVX-NEXT: retq -; -; WIDEN-LABEL: fptoui_4f64_to_4i64_const: -; WIDEN: # %bb.0: -; WIDEN-NEXT: vmovaps {{.*#+}} ymm0 = [2,4,6,8] -; WIDEN-NEXT: retq %cvt = fptoui <4 x double> to <4 x i64> ret <4 x i64> %cvt } @@ -2529,11 +2127,6 @@ define <4 x i32> @fptoui_4f64_to_4i32_const(<4 x double> %a) { ; AVX: # %bb.0: ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [2,4,6,8] ; AVX-NEXT: retq -; -; WIDEN-LABEL: fptoui_4f64_to_4i32_const: -; WIDEN: # %bb.0: -; WIDEN-NEXT: vmovaps {{.*#+}} xmm0 = [2,4,6,8] -; WIDEN-NEXT: retq %cvt = fptoui <4 x double> to <4 x i32> ret <4 x i32> %cvt } @@ -2548,11 +2141,6 @@ define <4 x i32> @fptosi_4f32_to_4i32_const() { ; AVX: # %bb.0: ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,4294967295,2,3] ; AVX-NEXT: retq -; -; WIDEN-LABEL: fptosi_4f32_to_4i32_const: -; WIDEN: # %bb.0: -; WIDEN-NEXT: vmovaps {{.*#+}} xmm0 = [1,4294967295,2,3] -; WIDEN-NEXT: retq %cvt = fptosi <4 x float> to <4 x i32> ret <4 x i32> %cvt } @@ -2568,11 +2156,6 @@ define <4 x i64> @fptosi_4f32_to_4i64_const() { ; AVX: # %bb.0: ; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,18446744073709551615,2,3] ; AVX-NEXT: retq -; -; WIDEN-LABEL: fptosi_4f32_to_4i64_const: -; WIDEN: # %bb.0: -; WIDEN-NEXT: vmovaps {{.*#+}} ymm0 = [1,18446744073709551615,2,3] -; WIDEN-NEXT: retq %cvt = fptosi <4 x float> to <4 x i64> ret <4 x i64> %cvt } @@ -2588,11 +2171,6 @@ define <8 x i32> @fptosi_8f32_to_8i32_const(<8 x float> %a) { ; AVX: # %bb.0: ; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,4294967295,2,3,6,4294967288,2,4294967295] ; AVX-NEXT: retq -; -; WIDEN-LABEL: fptosi_8f32_to_8i32_const: -; WIDEN: # %bb.0: -; WIDEN-NEXT: vmovaps {{.*#+}} ymm0 = [1,4294967295,2,3,6,4294967288,2,4294967295] -; WIDEN-NEXT: retq %cvt = fptosi <8 x float> to <8 x i32> ret <8 x i32> %cvt } @@ -2607,11 +2185,6 @@ define <4 x i32> @fptoui_4f32_to_4i32_const(<4 x float> %a) { ; AVX: # %bb.0: ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,2,4,6] ; AVX-NEXT: retq -; -; WIDEN-LABEL: fptoui_4f32_to_4i32_const: -; WIDEN: # %bb.0: -; WIDEN-NEXT: vmovaps {{.*#+}} xmm0 = [1,2,4,6] -; WIDEN-NEXT: retq %cvt = fptoui <4 x float> to <4 x i32> ret <4 x i32> %cvt } @@ -2627,11 +2200,6 @@ define <4 x i64> @fptoui_4f32_to_4i64_const() { ; AVX: # %bb.0: ; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,4,8] ; AVX-NEXT: retq -; -; WIDEN-LABEL: fptoui_4f32_to_4i64_const: -; WIDEN: # %bb.0: -; WIDEN-NEXT: vmovaps {{.*#+}} ymm0 = 
[1,2,4,8] -; WIDEN-NEXT: retq %cvt = fptoui <4 x float> to <4 x i64> ret <4 x i64> %cvt } @@ -2647,11 +2215,6 @@ define <8 x i32> @fptoui_8f32_to_8i32_const(<8 x float> %a) { ; AVX: # %bb.0: ; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,4,6,8,6,4,1] ; AVX-NEXT: retq -; -; WIDEN-LABEL: fptoui_8f32_to_8i32_const: -; WIDEN: # %bb.0: -; WIDEN-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,4,6,8,6,4,1] -; WIDEN-NEXT: retq %cvt = fptoui <8 x float> to <8 x i32> ret <8 x i32> %cvt } @@ -2719,20 +2282,6 @@ define <4 x i32> @fptosi_2f16_to_4i32(<2 x half> %a) nounwind { ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero ; AVX512-NEXT: retq -; -; WIDEN-LABEL: fptosi_2f16_to_4i32: -; WIDEN: # %bb.0: -; WIDEN-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; WIDEN-NEXT: vcvtph2ps %xmm1, %xmm1 -; WIDEN-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; WIDEN-NEXT: vcvtph2ps %xmm0, %xmm0 -; WIDEN-NEXT: vcvttss2si %xmm0, %eax -; WIDEN-NEXT: vcvttss2si %xmm1, %ecx -; WIDEN-NEXT: vmovd %ecx, %xmm0 -; WIDEN-NEXT: vmovd %eax, %xmm1 -; WIDEN-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; WIDEN-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; WIDEN-NEXT: retq %cvt = fptosi <2 x half> %a to <2 x i32> %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> ret <4 x i32> %ext @@ -2775,18 +2324,6 @@ define <4 x i32> @fptosi_2f80_to_4i32(<2 x x86_fp80> %a) nounwind { ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero ; AVX-NEXT: retq -; -; WIDEN-LABEL: fptosi_2f80_to_4i32: -; WIDEN: # %bb.0: -; WIDEN-NEXT: fldt {{[0-9]+}}(%rsp) -; WIDEN-NEXT: fldt {{[0-9]+}}(%rsp) -; WIDEN-NEXT: fisttpl -{{[0-9]+}}(%rsp) -; WIDEN-NEXT: fisttpl -{{[0-9]+}}(%rsp) -; WIDEN-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; WIDEN-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; WIDEN-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; WIDEN-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; WIDEN-NEXT: retq %cvt = fptosi <2 x x86_fp80> %a to <2 x i32> %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> ret <4 x i32> %ext @@ -2841,27 +2378,6 @@ define <4 x i32> @fptosi_2f128_to_4i32(<2 x fp128> %a) nounwind { ; AVX-NEXT: popq %rbx ; AVX-NEXT: popq %r14 ; AVX-NEXT: retq -; -; WIDEN-LABEL: fptosi_2f128_to_4i32: -; WIDEN: # %bb.0: -; WIDEN-NEXT: pushq %rbp -; WIDEN-NEXT: pushq %r14 -; WIDEN-NEXT: pushq %rbx -; WIDEN-NEXT: movq %rcx, %r14 -; WIDEN-NEXT: movq %rdx, %rbx -; WIDEN-NEXT: callq __fixtfsi -; WIDEN-NEXT: movl %eax, %ebp -; WIDEN-NEXT: movq %rbx, %rdi -; WIDEN-NEXT: movq %r14, %rsi -; WIDEN-NEXT: callq __fixtfsi -; WIDEN-NEXT: vmovd %eax, %xmm0 -; WIDEN-NEXT: vmovd %ebp, %xmm1 -; WIDEN-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; WIDEN-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; WIDEN-NEXT: popq %rbx -; WIDEN-NEXT: popq %r14 -; WIDEN-NEXT: popq %rbp -; WIDEN-NEXT: retq %cvt = fptosi <2 x fp128> %a to <2 x i32> %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> ret <4 x i32> %ext @@ -2906,14 +2422,6 @@ define <2 x i8> @fptosi_2f32_to_2i8(<2 x float> %a) { ; AVX512VLDQ: # %bb.0: ; AVX512VLDQ-NEXT: vcvttps2qq %xmm0, %xmm0 ; AVX512VLDQ-NEXT: retq -; -; WIDEN-LABEL: fptosi_2f32_to_2i8: -; WIDEN: # %bb.0: -; WIDEN-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; WIDEN-NEXT: vcvttps2dq %zmm0, %zmm0 -; WIDEN-NEXT: vpmovdb %zmm0, %xmm0 -; WIDEN-NEXT: vzeroupper -; WIDEN-NEXT: retq %cvt = fptosi <2 x float> %a to <2 x i8> ret <2 x i8> %cvt } @@ 
-2957,23 +2465,6 @@ define <2 x i16> @fptosi_2f32_to_2i16(<2 x float> %a) { ; AVX512VLDQ: # %bb.0: ; AVX512VLDQ-NEXT: vcvttps2qq %xmm0, %xmm0 ; AVX512VLDQ-NEXT: retq -; -; WIDEN_SKX-LABEL: fptosi_2f32_to_2i16: -; WIDEN_SKX: # %bb.0: -; WIDEN_SKX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; WIDEN_SKX-NEXT: vcvttps2dq %ymm0, %ymm0 -; WIDEN_SKX-NEXT: vpmovdw %ymm0, %xmm0 -; WIDEN_SKX-NEXT: vzeroupper -; WIDEN_SKX-NEXT: retq -; -; WIDEN_KNL-LABEL: fptosi_2f32_to_2i16: -; WIDEN_KNL: # %bb.0: -; WIDEN_KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; WIDEN_KNL-NEXT: vcvttps2dq %ymm0, %ymm0 -; WIDEN_KNL-NEXT: vpmovdw %zmm0, %ymm0 -; WIDEN_KNL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; WIDEN_KNL-NEXT: vzeroupper -; WIDEN_KNL-NEXT: retq %cvt = fptosi <2 x float> %a to <2 x i16> ret <2 x i16> %cvt } @@ -3016,14 +2507,6 @@ define <2 x i8> @fptoui_2f32_to_2i8(<2 x float> %a) { ; AVX512VLDQ: # %bb.0: ; AVX512VLDQ-NEXT: vcvttps2uqq %xmm0, %xmm0 ; AVX512VLDQ-NEXT: retq -; -; WIDEN-LABEL: fptoui_2f32_to_2i8: -; WIDEN: # %bb.0: -; WIDEN-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; WIDEN-NEXT: vcvttps2dq %zmm0, %zmm0 -; WIDEN-NEXT: vpmovdb %zmm0, %xmm0 -; WIDEN-NEXT: vzeroupper -; WIDEN-NEXT: retq %cvt = fptoui <2 x float> %a to <2 x i8> ret <2 x i8> %cvt } @@ -3066,23 +2549,6 @@ define <2 x i16> @fptoui_2f32_to_2i16(<2 x float> %a) { ; AVX512VLDQ: # %bb.0: ; AVX512VLDQ-NEXT: vcvttps2uqq %xmm0, %xmm0 ; AVX512VLDQ-NEXT: retq -; -; WIDEN_SKX-LABEL: fptoui_2f32_to_2i16: -; WIDEN_SKX: # %bb.0: -; WIDEN_SKX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; WIDEN_SKX-NEXT: vcvttps2dq %ymm0, %ymm0 -; WIDEN_SKX-NEXT: vpmovdw %ymm0, %xmm0 -; WIDEN_SKX-NEXT: vzeroupper -; WIDEN_SKX-NEXT: retq -; -; WIDEN_KNL-LABEL: fptoui_2f32_to_2i16: -; WIDEN_KNL: # %bb.0: -; WIDEN_KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; WIDEN_KNL-NEXT: vcvttps2dq %ymm0, %ymm0 -; WIDEN_KNL-NEXT: vpmovdw %zmm0, %ymm0 -; WIDEN_KNL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; WIDEN_KNL-NEXT: vzeroupper -; WIDEN_KNL-NEXT: retq %cvt = fptoui <2 x float> %a to <2 x i16> ret <2 x i16> %cvt } @@ -3126,15 +2592,6 @@ define <2 x i8> @fptosi_2f64_to_2i8(<2 x double> %a) { ; AVX512VLDQ: # %bb.0: ; AVX512VLDQ-NEXT: vcvttpd2qq %xmm0, %xmm0 ; AVX512VLDQ-NEXT: retq -; -; WIDEN-LABEL: fptosi_2f64_to_2i8: -; WIDEN: # %bb.0: -; WIDEN-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; WIDEN-NEXT: vcvttsd2si %xmm1, %eax -; WIDEN-NEXT: vcvttsd2si %xmm0, %ecx -; WIDEN-NEXT: vmovd %ecx, %xmm0 -; WIDEN-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 -; WIDEN-NEXT: retq %cvt = fptosi <2 x double> %a to <2 x i8> ret <2 x i8> %cvt } @@ -3178,23 +2635,6 @@ define <2 x i16> @fptosi_2f64_to_2i16(<2 x double> %a) { ; AVX512VLDQ: # %bb.0: ; AVX512VLDQ-NEXT: vcvttpd2qq %xmm0, %xmm0 ; AVX512VLDQ-NEXT: retq -; -; WIDEN_SKX-LABEL: fptosi_2f64_to_2i16: -; WIDEN_SKX: # %bb.0: -; WIDEN_SKX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; WIDEN_SKX-NEXT: vcvttpd2dq %zmm0, %ymm0 -; WIDEN_SKX-NEXT: vpmovdw %ymm0, %xmm0 -; WIDEN_SKX-NEXT: vzeroupper -; WIDEN_SKX-NEXT: retq -; -; WIDEN_KNL-LABEL: fptosi_2f64_to_2i16: -; WIDEN_KNL: # %bb.0: -; WIDEN_KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; WIDEN_KNL-NEXT: vcvttpd2dq %zmm0, %ymm0 -; WIDEN_KNL-NEXT: vpmovdw %zmm0, %ymm0 -; WIDEN_KNL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; WIDEN_KNL-NEXT: vzeroupper -; WIDEN_KNL-NEXT: retq %cvt = fptosi <2 x double> %a to <2 x i16> ret <2 x i16> %cvt } @@ -3237,15 +2677,6 @@ define <2 x i8> @fptoui_2f64_to_2i8(<2 x double> %a) { ; AVX512VLDQ: # %bb.0: ; 
AVX512VLDQ-NEXT: vcvttpd2uqq %xmm0, %xmm0 ; AVX512VLDQ-NEXT: retq -; -; WIDEN-LABEL: fptoui_2f64_to_2i8: -; WIDEN: # %bb.0: -; WIDEN-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; WIDEN-NEXT: vcvttsd2si %xmm1, %eax -; WIDEN-NEXT: vcvttsd2si %xmm0, %ecx -; WIDEN-NEXT: vmovd %ecx, %xmm0 -; WIDEN-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 -; WIDEN-NEXT: retq %cvt = fptoui <2 x double> %a to <2 x i8> ret <2 x i8> %cvt } @@ -3288,23 +2719,6 @@ define <2 x i16> @fptoui_2f64_to_2i16(<2 x double> %a) { ; AVX512VLDQ: # %bb.0: ; AVX512VLDQ-NEXT: vcvttpd2uqq %xmm0, %xmm0 ; AVX512VLDQ-NEXT: retq -; -; WIDEN_SKX-LABEL: fptoui_2f64_to_2i16: -; WIDEN_SKX: # %bb.0: -; WIDEN_SKX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; WIDEN_SKX-NEXT: vcvttpd2dq %zmm0, %ymm0 -; WIDEN_SKX-NEXT: vpmovdw %ymm0, %xmm0 -; WIDEN_SKX-NEXT: vzeroupper -; WIDEN_SKX-NEXT: retq -; -; WIDEN_KNL-LABEL: fptoui_2f64_to_2i16: -; WIDEN_KNL: # %bb.0: -; WIDEN_KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; WIDEN_KNL-NEXT: vcvttpd2dq %zmm0, %ymm0 -; WIDEN_KNL-NEXT: vpmovdw %zmm0, %ymm0 -; WIDEN_KNL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; WIDEN_KNL-NEXT: vzeroupper -; WIDEN_KNL-NEXT: retq %cvt = fptoui <2 x double> %a to <2 x i16> ret <2 x i16> %cvt } diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp-widen.ll b/llvm/test/CodeGen/X86/vec_int_to_fp-widen.ll new file mode 100644 index 000000000000..97a5e5a96ac1 --- /dev/null +++ b/llvm/test/CodeGen/X86/vec_int_to_fp-widen.ll @@ -0,0 +1,5565 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=ALL,SSE,SSE2 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=ALL,SSE,SSE41 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=ALL,AVX,VEX,AVX1 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,VEX,AVX2 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512F +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512VL +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512DQ +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512VLDQ +; +; 32-bit tests to make sure we're not doing anything stupid. 
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse2 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse4.1 + +; +; Signed Integer to Double +; + +define <2 x double> @sitofp_2i64_to_2f64(<2 x i64> %a) { +; SSE2-LABEL: sitofp_2i64_to_2f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: cvtsi2sdq %rax, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2sdq %rax, %xmm0 +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_2i64_to_2f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: cvtsi2sdq %rax, %xmm1 +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2sdq %rax, %xmm0 +; SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE41-NEXT: retq +; +; VEX-LABEL: sitofp_2i64_to_2f64: +; VEX: # %bb.0: +; VEX-NEXT: vpextrq $1, %xmm0, %rax +; VEX-NEXT: vcvtsi2sdq %rax, %xmm1, %xmm1 +; VEX-NEXT: vmovq %xmm0, %rax +; VEX-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm0 +; VEX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; VEX-NEXT: retq +; +; AVX512F-LABEL: sitofp_2i64_to_2f64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm0 +; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: sitofp_2i64_to_2f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax +; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm1, %xmm1 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm0 +; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: sitofp_2i64_to_2f64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512DQ-NEXT: vcvtqq2pd %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: sitofp_2i64_to_2f64: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtqq2pd %xmm0, %xmm0 +; AVX512VLDQ-NEXT: retq + %cvt = sitofp <2 x i64> %a to <2 x double> + ret <2 x double> %cvt +} + +define <2 x double> @sitofp_2i32_to_2f64(<4 x i32> %a) { +; SSE-LABEL: sitofp_2i32_to_2f64: +; SSE: # %bb.0: +; SSE-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sitofp_2i32_to_2f64: +; AVX: # %bb.0: +; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 +; AVX-NEXT: retq + %shuf = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> + %cvt = sitofp <2 x i32> %shuf to <2 x double> + ret <2 x double> %cvt +} + +define <2 x double> @sitofp_4i32_to_2f64(<4 x i32> %a) { +; SSE-LABEL: sitofp_4i32_to_2f64: +; SSE: # %bb.0: +; SSE-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sitofp_4i32_to_2f64: +; AVX: # %bb.0: +; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 +; AVX-NEXT: retq + %cvt = sitofp <4 x i32> %a to <4 x double> + %shuf = shufflevector <4 x double> %cvt, <4 x double> undef, <2 x i32> + ret <2 x double> %shuf +} + +define <2 x double> @sitofp_2i16_to_2f64(<8 x i16> %a) { +; SSE2-LABEL: sitofp_2i16_to_2f64: +; SSE2: # %bb.0: 
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_2i16_to_2f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxwd %xmm0, %xmm0 +; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: sitofp_2i16_to_2f64: +; AVX: # %bb.0: +; AVX-NEXT: vpmovsxwd %xmm0, %xmm0 +; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 +; AVX-NEXT: retq + %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <2 x i32> + %cvt = sitofp <2 x i16> %shuf to <2 x double> + ret <2 x double> %cvt +} + +define <2 x double> @sitofp_8i16_to_2f64(<8 x i16> %a) { +; SSE2-LABEL: sitofp_8i16_to_2f64: +; SSE2: # %bb.0: +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_8i16_to_2f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxwd %xmm0, %xmm0 +; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; VEX-LABEL: sitofp_8i16_to_2f64: +; VEX: # %bb.0: +; VEX-NEXT: vpmovsxwd %xmm0, %xmm0 +; VEX-NEXT: vcvtdq2pd %xmm0, %xmm0 +; VEX-NEXT: retq +; +; AVX512-LABEL: sitofp_8i16_to_2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %cvt = sitofp <8 x i16> %a to <8 x double> + %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <2 x i32> + ret <2 x double> %shuf +} + +define <2 x double> @sitofp_2i8_to_2f64(<16 x i8> %a) { +; SSE2-LABEL: sitofp_2i8_to_2f64: +; SSE2: # %bb.0: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm0 +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_2i8_to_2f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxbd %xmm0, %xmm0 +; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: sitofp_2i8_to_2f64: +; AVX: # %bb.0: +; AVX-NEXT: vpmovsxbd %xmm0, %xmm0 +; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 +; AVX-NEXT: retq + %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <2 x i32> + %cvt = sitofp <2 x i8> %shuf to <2 x double> + ret <2 x double> %cvt +} + +define <2 x double> @sitofp_16i8_to_2f64(<16 x i8> %a) { +; SSE2-LABEL: sitofp_16i8_to_2f64: +; SSE2: # %bb.0: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm0 +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_16i8_to_2f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxbd %xmm0, %xmm0 +; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; VEX-LABEL: sitofp_16i8_to_2f64: +; VEX: # %bb.0: +; VEX-NEXT: vpmovsxbd %xmm0, %xmm0 +; VEX-NEXT: vcvtdq2pd %xmm0, %xmm0 +; VEX-NEXT: retq +; +; AVX512-LABEL: sitofp_16i8_to_2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %cvt = sitofp <16 x i8> %a to <16 x double> + %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <2 x i32> + ret <2 x double> %shuf +} + +define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) { +; SSE2-LABEL: sitofp_4i64_to_4f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: cvtsi2sdq %rax, %xmm2 +; SSE2-NEXT: pshufd 
{{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2sdq %rax, %xmm0 +; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: cvtsi2sdq %rax, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2sdq %rax, %xmm0 +; SSE2-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm3, %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_4i64_to_4f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: cvtsi2sdq %rax, %xmm2 +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2sdq %rax, %xmm0 +; SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE41-NEXT: pextrq $1, %xmm1, %rax +; SSE41-NEXT: xorps %xmm2, %xmm2 +; SSE41-NEXT: cvtsi2sdq %rax, %xmm2 +; SSE41-NEXT: movq %xmm1, %rax +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: cvtsi2sdq %rax, %xmm1 +; SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE41-NEXT: retq +; +; AVX1-LABEL: sitofp_4i64_to_4f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpextrq $1, %xmm1, %rax +; AVX1-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 +; AVX1-NEXT: vmovq %xmm1, %rax +; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: sitofp_4i64_to_4f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 +; AVX2-NEXT: vmovq %xmm1, %rax +; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: sitofp_4i64_to_4f64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpextrq $1, %xmm1, %rax +; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 +; AVX512F-NEXT: vmovq %xmm1, %rax +; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 +; AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2 +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0 +; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: sitofp_4i64_to_4f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax +; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 +; AVX512VL-NEXT: vmovq %xmm1, %rax +; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 +; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax +; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0 +; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX512VL-NEXT: 
vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: sitofp_4i64_to_4f64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512DQ-NEXT: vcvtqq2pd %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: sitofp_4i64_to_4f64: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtqq2pd %ymm0, %ymm0 +; AVX512VLDQ-NEXT: retq + %cvt = sitofp <4 x i64> %a to <4 x double> + ret <4 x double> %cvt +} + +define <4 x double> @sitofp_4i32_to_4f64(<4 x i32> %a) { +; SSE-LABEL: sitofp_4i32_to_4f64: +; SSE: # %bb.0: +; SSE-NEXT: cvtdq2pd %xmm0, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE-NEXT: cvtdq2pd %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sitofp_4i32_to_4f64: +; AVX: # %bb.0: +; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0 +; AVX-NEXT: retq + %cvt = sitofp <4 x i32> %a to <4 x double> + ret <4 x double> %cvt +} + +define <4 x double> @sitofp_4i16_to_4f64(<8 x i16> %a) { +; SSE2-LABEL: sitofp_4i16_to_4f64: +; SSE2: # %bb.0: +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_4i16_to_4f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxwd %xmm0, %xmm1 +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE41-NEXT: retq +; +; AVX-LABEL: sitofp_4i16_to_4f64: +; AVX: # %bb.0: +; AVX-NEXT: vpmovsxwd %xmm0, %xmm0 +; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0 +; AVX-NEXT: retq + %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> + %cvt = sitofp <4 x i16> %shuf to <4 x double> + ret <4 x double> %cvt +} + +define <4 x double> @sitofp_8i16_to_4f64(<8 x i16> %a) { +; SSE2-LABEL: sitofp_8i16_to_4f64: +; SSE2: # %bb.0: +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_8i16_to_4f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxwd %xmm0, %xmm1 +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE41-NEXT: retq +; +; VEX-LABEL: sitofp_8i16_to_4f64: +; VEX: # %bb.0: +; VEX-NEXT: vpmovsxwd %xmm0, %xmm0 +; VEX-NEXT: vcvtdq2pd %xmm0, %ymm0 +; VEX-NEXT: retq +; +; AVX512-LABEL: sitofp_8i16_to_4f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0 +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512-NEXT: retq + %cvt = sitofp <8 x i16> %a to <8 x double> + %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <4 x i32> + ret <4 x double> %shuf +} + +define <4 x double> @sitofp_4i8_to_4f64(<16 x i8> %a) { +; SSE2-LABEL: sitofp_4i8_to_4f64: +; SSE2: # %bb.0: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: psrad $24, %xmm1 +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_4i8_to_4f64: +; 
SSE41: # %bb.0: +; SSE41-NEXT: pmovsxbd %xmm0, %xmm1 +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE41-NEXT: retq +; +; AVX-LABEL: sitofp_4i8_to_4f64: +; AVX: # %bb.0: +; AVX-NEXT: vpmovsxbd %xmm0, %xmm0 +; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0 +; AVX-NEXT: retq + %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> + %cvt = sitofp <4 x i8> %shuf to <4 x double> + ret <4 x double> %cvt +} + +define <4 x double> @sitofp_16i8_to_4f64(<16 x i8> %a) { +; SSE2-LABEL: sitofp_16i8_to_4f64: +; SSE2: # %bb.0: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: psrad $24, %xmm1 +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_16i8_to_4f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxbd %xmm0, %xmm1 +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE41-NEXT: retq +; +; VEX-LABEL: sitofp_16i8_to_4f64: +; VEX: # %bb.0: +; VEX-NEXT: vpmovsxbd %xmm0, %xmm0 +; VEX-NEXT: vcvtdq2pd %xmm0, %ymm0 +; VEX-NEXT: retq +; +; AVX512-LABEL: sitofp_16i8_to_4f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0 +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512-NEXT: retq + %cvt = sitofp <16 x i8> %a to <16 x double> + %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <4 x i32> + ret <4 x double> %shuf +} + +; +; Unsigned Integer to Double +; + +define <2 x double> @uitofp_2i64_to_2f64(<2 x i64> %a) { +; SSE2-LABEL: uitofp_2i64_to_2f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,4294967295] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: por {{.*}}(%rip), %xmm1 +; SSE2-NEXT: psrlq $32, %xmm0 +; SSE2-NEXT: por {{.*}}(%rip), %xmm0 +; SSE2-NEXT: subpd {{.*}}(%rip), %xmm0 +; SSE2-NEXT: addpd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_2i64_to_2f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; SSE41-NEXT: por {{.*}}(%rip), %xmm1 +; SSE41-NEXT: psrlq $32, %xmm0 +; SSE41-NEXT: por {{.*}}(%rip), %xmm0 +; SSE41-NEXT: subpd {{.*}}(%rip), %xmm0 +; SSE41-NEXT: addpd %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: uitofp_2i64_to_2f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; AVX1-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: uitofp_2i64_to_2f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX2-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: uitofp_2i64_to_2f64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX512F-NEXT: vpor 
{{.*}}(%rip), %xmm1, %xmm1 +; AVX512F-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: uitofp_2i64_to_2f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1 +; AVX512VL-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1 +; AVX512VL-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX512VL-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: uitofp_2i64_to_2f64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: uitofp_2i64_to_2f64: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtuqq2pd %xmm0, %xmm0 +; AVX512VLDQ-NEXT: retq + %cvt = uitofp <2 x i64> %a to <2 x double> + ret <2 x double> %cvt +} + +define <2 x double> @uitofp_2i32_to_2f64(<4 x i32> %a) { +; SSE2-LABEL: uitofp_2i32_to_2f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE2-NEXT: mulpd {{.*}}(%rip), %xmm0 +; SSE2-NEXT: addpd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_2i32_to_2f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE41-NEXT: mulpd {{.*}}(%rip), %xmm0 +; SSE41-NEXT: addpd %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; VEX-LABEL: uitofp_2i32_to_2f64: +; VEX: # %bb.0: +; VEX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; VEX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; VEX-NEXT: vcvtdq2pd %xmm1, %xmm1 +; VEX-NEXT: vpsrld $16, %xmm0, %xmm0 +; VEX-NEXT: vcvtdq2pd %xmm0, %xmm0 +; VEX-NEXT: vmulpd {{.*}}(%rip), %xmm0, %xmm0 +; VEX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; VEX-NEXT: retq +; +; AVX512F-LABEL: uitofp_2i32_to_2f64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: uitofp_2i32_to_2f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvtudq2pd %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: uitofp_2i32_to_2f64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: uitofp_2i32_to_2f64: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtudq2pd %xmm0, %xmm0 +; AVX512VLDQ-NEXT: retq + %shuf = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> + %cvt = uitofp <2 x i32> %shuf to <2 x double> + ret <2 x double> %cvt +} + +define <2 x double> @uitofp_4i32_to_2f64(<4 x i32> %a) { +; SSE2-LABEL: uitofp_4i32_to_2f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE2-NEXT: 
psrld $16, %xmm0 +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE2-NEXT: mulpd {{.*}}(%rip), %xmm0 +; SSE2-NEXT: addpd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_4i32_to_2f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE41-NEXT: mulpd {{.*}}(%rip), %xmm0 +; SSE41-NEXT: addpd %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; VEX-LABEL: uitofp_4i32_to_2f64: +; VEX: # %bb.0: +; VEX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; VEX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; VEX-NEXT: vpsrld $16, %xmm0, %xmm0 +; VEX-NEXT: vcvtdq2pd %xmm1, %xmm1 +; VEX-NEXT: vcvtdq2pd %xmm0, %xmm0 +; VEX-NEXT: vmulpd {{.*}}(%rip), %xmm0, %xmm0 +; VEX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; VEX-NEXT: retq +; +; AVX512F-LABEL: uitofp_4i32_to_2f64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: uitofp_4i32_to_2f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvtudq2pd %xmm0, %ymm0 +; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: uitofp_4i32_to_2f64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: uitofp_4i32_to_2f64: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtudq2pd %xmm0, %ymm0 +; AVX512VLDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512VLDQ-NEXT: vzeroupper +; AVX512VLDQ-NEXT: retq + %cvt = uitofp <4 x i32> %a to <4 x double> + %shuf = shufflevector <4 x double> %cvt, <4 x double> undef, <2 x i32> + ret <2 x double> %shuf +} + +define <2 x double> @uitofp_2i16_to_2f64(<8 x i16> %a) { +; SSE2-LABEL: uitofp_2i16_to_2f64: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_2i16_to_2f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: uitofp_2i16_to_2f64: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 +; AVX-NEXT: retq + %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <2 x i32> + %cvt = uitofp <2 x i16> %shuf to <2 x double> + ret <2 x double> %cvt +} + +define <2 x double> @uitofp_8i16_to_2f64(<8 x i16> %a) { +; SSE2-LABEL: uitofp_8i16_to_2f64: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_8i16_to_2f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; VEX-LABEL: uitofp_8i16_to_2f64: +; VEX: # %bb.0: +; VEX-NEXT: vpmovzxwd 
{{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; VEX-NEXT: vcvtdq2pd %xmm0, %xmm0 +; VEX-NEXT: retq +; +; AVX512-LABEL: uitofp_8i16_to_2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %cvt = uitofp <8 x i16> %a to <8 x double> + %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <2 x i32> + ret <2 x double> %shuf +} + +define <2 x double> @uitofp_2i8_to_2f64(<16 x i8> %a) { +; SSE2-LABEL: uitofp_2i8_to_2f64: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_2i8_to_2f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: uitofp_2i8_to_2f64: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 +; AVX-NEXT: retq + %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <2 x i32> + %cvt = uitofp <2 x i8> %shuf to <2 x double> + ret <2 x double> %cvt +} + +define <2 x double> @uitofp_16i8_to_2f64(<16 x i8> %a) { +; SSE2-LABEL: uitofp_16i8_to_2f64: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_16i8_to_2f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; VEX-LABEL: uitofp_16i8_to_2f64: +; VEX: # %bb.0: +; VEX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; VEX-NEXT: vcvtdq2pd %xmm0, %xmm0 +; VEX-NEXT: retq +; +; AVX512-LABEL: uitofp_16i8_to_2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %cvt = uitofp <16 x i8> %a to <16 x double> + %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <2 x i32> + ret <2 x double> %shuf +} + +define <4 x double> @uitofp_4i64_to_4f64(<4 x i64> %a) { +; SSE2-LABEL: uitofp_4i64_to_4f64: +; 
SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200] +; SSE2-NEXT: por %xmm4, %xmm3 +; SSE2-NEXT: psrlq $32, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072] +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25] +; SSE2-NEXT: subpd %xmm6, %xmm0 +; SSE2-NEXT: addpd %xmm3, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: psrlq $32, %xmm1 +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: subpd %xmm6, %xmm1 +; SSE2-NEXT: addpd %xmm2, %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_4i64_to_4f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200] +; SSE41-NEXT: por %xmm4, %xmm3 +; SSE41-NEXT: psrlq $32, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072] +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25] +; SSE41-NEXT: subpd %xmm6, %xmm0 +; SSE41-NEXT: addpd %xmm3, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; SSE41-NEXT: por %xmm4, %xmm2 +; SSE41-NEXT: psrlq $32, %xmm1 +; SSE41-NEXT: por %xmm5, %xmm1 +; SSE41-NEXT: subpd %xmm6, %xmm1 +; SSE41-NEXT: addpd %xmm2, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: uitofp_4i64_to_4f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX1-NEXT: vorps {{.*}}(%rip), %ymm1, %ymm1 +; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT: vorpd {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vsubpd {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: uitofp_4i64_to_4f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200] +; AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072] +; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25] +; AVX2-NEXT: vsubpd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: uitofp_4i64_to_4f64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200] +; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = 
[4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072] +; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25] +; AVX512F-NEXT: vsubpd %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vaddpd %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: uitofp_4i64_to_4f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to4}, %ymm0, %ymm1 +; AVX512VL-NEXT: vporq {{.*}}(%rip){1to4}, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX512VL-NEXT: vporq {{.*}}(%rip){1to4}, %ymm0, %ymm0 +; AVX512VL-NEXT: vsubpd {{.*}}(%rip){1to4}, %ymm0, %ymm0 +; AVX512VL-NEXT: vaddpd %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: uitofp_4i64_to_4f64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: uitofp_4i64_to_4f64: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtuqq2pd %ymm0, %ymm0 +; AVX512VLDQ-NEXT: retq + %cvt = uitofp <4 x i64> %a to <4 x double> + ret <4 x double> %cvt +} + +define <4 x double> @uitofp_4i32_to_4f64(<4 x i32> %a) { +; SSE2-LABEL: uitofp_4i32_to_4f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE2-NEXT: movapd {{.*#+}} xmm2 = [6.5536E+4,6.5536E+4] +; SSE2-NEXT: mulpd %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE2-NEXT: addpd %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm4 +; SSE2-NEXT: mulpd %xmm2, %xmm4 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm3 +; SSE2-NEXT: cvtdq2pd %xmm3, %xmm1 +; SSE2-NEXT: addpd %xmm4, %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_4i32_to_4f64: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE41-NEXT: movapd {{.*#+}} xmm2 = [6.5536E+4,6.5536E+4] +; SSE41-NEXT: mulpd %xmm2, %xmm1 +; SSE41-NEXT: pxor %xmm3, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7] +; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE41-NEXT: addpd %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3,4,5,6,7] +; SSE41-NEXT: psrld $16, %xmm4 +; SSE41-NEXT: cvtdq2pd %xmm4, %xmm4 +; SSE41-NEXT: mulpd %xmm2, %xmm4 +; SSE41-NEXT: cvtdq2pd %xmm3, %xmm1 +; SSE41-NEXT: addpd %xmm4, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: uitofp_4i32_to_4f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; AVX1-NEXT: vcvtdq2pd %xmm1, %ymm1 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0 +; AVX1-NEXT: vmulpd {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: uitofp_4i32_to_4f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vcvtdq2pd %xmm1, %ymm1 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4] +; AVX2-NEXT: vmulpd %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] +; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0 +; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: uitofp_4i32_to_4f64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0 +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: uitofp_4i32_to_4f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvtudq2pd %xmm0, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: uitofp_4i32_to_4f64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: uitofp_4i32_to_4f64: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtudq2pd %xmm0, %ymm0 +; AVX512VLDQ-NEXT: retq + %cvt = uitofp <4 x i32> %a to <4 x double> + ret <4 x double> %cvt +} + +define <4 x double> @uitofp_4i16_to_4f64(<8 x i16> %a) { +; SSE2-LABEL: uitofp_4i16_to_4f64: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_4i16_to_4f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE41-NEXT: retq +; +; AVX-LABEL: uitofp_4i16_to_4f64: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0 +; AVX-NEXT: retq + %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> + %cvt = uitofp <4 x i16> %shuf to <4 x double> + ret <4 x double> %cvt +} + +define <4 x double> @uitofp_8i16_to_4f64(<8 x i16> %a) { +; SSE2-LABEL: uitofp_8i16_to_4f64: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_8i16_to_4f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE41-NEXT: retq +; +; VEX-LABEL: uitofp_8i16_to_4f64: +; VEX: # %bb.0: +; VEX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; VEX-NEXT: vcvtdq2pd %xmm0, %ymm0 +; VEX-NEXT: retq +; +; AVX512-LABEL: uitofp_8i16_to_4f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0 +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512-NEXT: retq + %cvt = uitofp <8 x i16> %a to <8 x double> + %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <4 x i32> + ret <4 x double> %shuf +} + +define <4 x double> @uitofp_4i8_to_4f64(<16 x i8> %a) { +; SSE2-LABEL: uitofp_4i8_to_4f64: +; 
SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_4i8_to_4f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE41-NEXT: retq +; +; AVX-LABEL: uitofp_4i8_to_4f64: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0 +; AVX-NEXT: retq + %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> + %cvt = uitofp <4 x i8> %shuf to <4 x double> + ret <4 x double> %cvt +} + +define <4 x double> @uitofp_16i8_to_4f64(<16 x i8> %a) { +; SSE2-LABEL: uitofp_16i8_to_4f64: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_16i8_to_4f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE41-NEXT: retq +; +; VEX-LABEL: uitofp_16i8_to_4f64: +; VEX: # %bb.0: +; VEX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; VEX-NEXT: vcvtdq2pd %xmm0, %ymm0 +; VEX-NEXT: retq +; +; AVX512-LABEL: uitofp_16i8_to_4f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0 +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512-NEXT: retq + %cvt = uitofp <16 x i8> %a to <16 x double> + %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <4 x i32> + ret <4 x double> %shuf +} + +; +; Signed Integer to Float +; + +define <4 x float> @sitofp_2i64_to_4f32(<2 x i64> %a) { +; SSE2-LABEL: sitofp_2i64_to_4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_2i64_to_4f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; SSE41-NEXT: retq +; +; VEX-LABEL: sitofp_2i64_to_4f32: +; VEX: # %bb.0: +; VEX-NEXT: vpextrq $1, %xmm0, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; VEX-NEXT: vmovq %xmm0, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1 +; VEX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0] +; VEX-NEXT: retq +; +; AVX512F-LABEL: sitofp_2i64_to_4f32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1 +; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0] +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: sitofp_2i64_to_4f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1 +; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0] +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: sitofp_2i64_to_4f32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: sitofp_2i64_to_4f32: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtqq2ps %xmm0, %xmm0 +; AVX512VLDQ-NEXT: retq + %cvt = sitofp <2 x i64> %a to <2 x float> + %ext = shufflevector <2 x float> %cvt, <2 x float> undef, <4 x i32> + ret <4 x float> %ext +} + +define <4 x float> @sitofp_2i64_to_4f32_zero(<2 x i64> %a) { +; SSE2-LABEL: sitofp_2i64_to_4f32_zero: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_2i64_to_4f32_zero: +; SSE41: # %bb.0: +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm0[0],zero,zero +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; VEX-LABEL: sitofp_2i64_to_4f32_zero: +; VEX: # %bb.0: +; VEX-NEXT: vmovq %xmm0, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; VEX-NEXT: vpextrq $1, %xmm0, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero +; VEX-NEXT: retq +; +; AVX512F-LABEL: sitofp_2i64_to_4f32_zero: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 
+; AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: sitofp_2i64_to_4f32_zero: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: sitofp_2i64_to_4f32_zero: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0 +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: sitofp_2i64_to_4f32_zero: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtqq2ps %xmm0, %xmm0 +; AVX512VLDQ-NEXT: retq + %cvt = sitofp <2 x i64> %a to <2 x float> + %ext = shufflevector <2 x float> %cvt, <2 x float> zeroinitializer, <4 x i32> + ret <4 x float> %ext +} + +define <4 x float> @sitofp_4i64_to_4f32_undef(<2 x i64> %a) { +; SSE2-LABEL: sitofp_4i64_to_4f32_undef: +; SSE2: # %bb.0: +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,0] +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_4i64_to_4f32_undef: +; SSE41: # %bb.0: +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0] +; SSE41-NEXT: retq +; +; VEX-LABEL: sitofp_4i64_to_4f32_undef: +; VEX: # %bb.0: +; VEX-NEXT: vpextrq $1, %xmm0, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; VEX-NEXT: vmovq %xmm0, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1 +; VEX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0] +; VEX-NEXT: retq +; +; AVX512F-LABEL: sitofp_4i64_to_4f32_undef: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1 +; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0] +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: sitofp_4i64_to_4f32_undef: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1 +; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0] +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: sitofp_4i64_to_4f32_undef: +; AVX512DQ: # %bb.0: +; 
AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: sitofp_4i64_to_4f32_undef: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512VLDQ-NEXT: vcvtqq2ps %ymm0, %xmm0 +; AVX512VLDQ-NEXT: vzeroupper +; AVX512VLDQ-NEXT: retq + %ext = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> + %cvt = sitofp <4 x i64> %ext to <4 x float> + ret <4 x float> %cvt +} + +define <4 x float> @sitofp_4i32_to_4f32(<4 x i32> %a) { +; SSE-LABEL: sitofp_4i32_to_4f32: +; SSE: # %bb.0: +; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sitofp_4i32_to_4f32: +; AVX: # %bb.0: +; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 +; AVX-NEXT: retq + %cvt = sitofp <4 x i32> %a to <4 x float> + ret <4 x float> %cvt +} + +define <4 x float> @sitofp_4i16_to_4f32(<8 x i16> %a) { +; SSE2-LABEL: sitofp_4i16_to_4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_4i16_to_4f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxwd %xmm0, %xmm0 +; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: sitofp_4i16_to_4f32: +; AVX: # %bb.0: +; AVX-NEXT: vpmovsxwd %xmm0, %xmm0 +; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 +; AVX-NEXT: retq + %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> + %cvt = sitofp <4 x i16> %shuf to <4 x float> + ret <4 x float> %cvt +} + +define <4 x float> @sitofp_8i16_to_4f32(<8 x i16> %a) { +; SSE2-LABEL: sitofp_8i16_to_4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_8i16_to_4f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxwd %xmm0, %xmm0 +; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: sitofp_8i16_to_4f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: sitofp_8i16_to_4f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: sitofp_8i16_to_4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %cvt = sitofp <8 x i16> %a to <8 x float> + %shuf = shufflevector <8 x float> %cvt, <8 x float> undef, <4 x i32> + ret <4 x float> %shuf +} + +define <4 x float> @sitofp_4i8_to_4f32(<16 x i8> %a) { +; SSE2-LABEL: sitofp_4i8_to_4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm0 +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_4i8_to_4f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxbd %xmm0, %xmm0 +; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: 
sitofp_4i8_to_4f32: +; AVX: # %bb.0: +; AVX-NEXT: vpmovsxbd %xmm0, %xmm0 +; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 +; AVX-NEXT: retq + %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> + %cvt = sitofp <4 x i8> %shuf to <4 x float> + ret <4 x float> %cvt +} + +define <4 x float> @sitofp_16i8_to_4f32(<16 x i8> %a) { +; SSE2-LABEL: sitofp_16i8_to_4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm0 +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_16i8_to_4f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxbd %xmm0, %xmm0 +; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: sitofp_16i8_to_4f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovsxbd %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: sitofp_16i8_to_4f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 +; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: sitofp_16i8_to_4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512-NEXT: vcvtdq2ps %zmm0, %zmm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %cvt = sitofp <16 x i8> %a to <16 x float> + %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <4 x i32> + ret <4 x float> %shuf +} + +define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) { +; SSE2-LABEL: sitofp_4i64_to_4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: cvtsi2ssq %rax, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_4i64_to_4f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: cvtsi2ssq %rax, %xmm2 +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] +; SSE41-NEXT: movq %xmm1, %rax +; SSE41-NEXT: xorps %xmm2, %xmm2 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm2 +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; SSE41-NEXT: pextrq $1, %xmm1, %rax +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; SSE41-NEXT: retq +; +; AVX1-LABEL: sitofp_4i64_to_4f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX1-NEXT: vextractf128 
$1, %ymm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: sitofp_4i64_to_4f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 +; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: sitofp_4i64_to_4f32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: sitofp_4i64_to_4f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: sitofp_4i64_to_4f32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: sitofp_4i64_to_4f32: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtqq2ps %ymm0, %xmm0 +; AVX512VLDQ-NEXT: vzeroupper +; AVX512VLDQ-NEXT: retq + %cvt = sitofp <4 x i64> %a to <4 x float> + ret <4 x float> %cvt +} + +define <8 x float> @sitofp_8i32_to_8f32(<8 x i32> %a) { +; SSE-LABEL: sitofp_8i32_to_8f32: +; SSE: # %bb.0: +; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE-NEXT: cvtdq2ps %xmm1, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: sitofp_8i32_to_8f32: +; AVX: # %bb.0: +; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX-NEXT: retq + %cvt = sitofp <8 x i32> %a to <8 x float> + ret <8 x float> %cvt +} + +define <8 x float> @sitofp_8i16_to_8f32(<8 x i16> %a) { +; SSE2-LABEL: sitofp_8i16_to_8f32: +; SSE2: # %bb.0: +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: psrad $16, 
%xmm1 +; SSE2-NEXT: cvtdq2ps %xmm1, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_8i16_to_8f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxwd %xmm0, %xmm1 +; SSE41-NEXT: cvtdq2ps %xmm1, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovsxwd %xmm0, %xmm0 +; SSE41-NEXT: cvtdq2ps %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: sitofp_8i16_to_8f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: sitofp_8i16_to_8f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: sitofp_8i16_to_8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX512-NEXT: retq + %cvt = sitofp <8 x i16> %a to <8 x float> + ret <8 x float> %cvt +} + +define <8 x float> @sitofp_8i8_to_8f32(<16 x i8> %a) { +; SSE2-LABEL: sitofp_8i8_to_8f32: +; SSE2: # %bb.0: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: psrad $24, %xmm1 +; SSE2-NEXT: cvtdq2ps %xmm1, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad $24, %xmm0 +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_8i8_to_8f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxbd %xmm0, %xmm1 +; SSE41-NEXT: cvtdq2ps %xmm1, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE41-NEXT: pmovsxbd %xmm0, %xmm0 +; SSE41-NEXT: cvtdq2ps %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: sitofp_8i8_to_8f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovsxbd %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: sitofp_8i8_to_8f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 +; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: sitofp_8i8_to_8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbd %xmm0, %ymm0 +; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX512-NEXT: retq + %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> + %cvt = sitofp <8 x i8> %shuf to <8 x float> + ret <8 x float> %cvt +} + +define <8 x float> @sitofp_16i8_to_8f32(<16 x i8> %a) { +; SSE2-LABEL: sitofp_16i8_to_8f32: +; SSE2: # %bb.0: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: psrad $24, %xmm1 +; SSE2-NEXT: cvtdq2ps %xmm1, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad $24, %xmm0 +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_16i8_to_8f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxbd %xmm0, %xmm1 +; SSE41-NEXT: cvtdq2ps %xmm1, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE41-NEXT: pmovsxbd %xmm0, 
%xmm0 +; SSE41-NEXT: cvtdq2ps %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: sitofp_16i8_to_8f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovsxbd %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: sitofp_16i8_to_8f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 +; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: sitofp_16i8_to_8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512-NEXT: vcvtdq2ps %zmm0, %zmm0 +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512-NEXT: retq + %cvt = sitofp <16 x i8> %a to <16 x float> + %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <8 x i32> + ret <8 x float> %shuf +} + +; +; Unsigned Integer to Float +; + +define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) { +; SSE2-LABEL: uitofp_2i64_to_4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB39_1 +; SSE2-NEXT: # %bb.2: +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE2-NEXT: jmp .LBB39_3 +; SSE2-NEXT: .LBB39_1: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE2-NEXT: addss %xmm0, %xmm0 +; SSE2-NEXT: .LBB39_3: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB39_4 +; SSE2-NEXT: # %bb.5: +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: retq +; SSE2-NEXT: .LBB39_4: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE2-NEXT: addss %xmm1, %xmm1 +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_2i64_to_4f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB39_1 +; SSE41-NEXT: # %bb.2: +; SSE41-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE41-NEXT: jmp .LBB39_3 +; SSE41-NEXT: .LBB39_1: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE41-NEXT: addss %xmm1, %xmm1 +; SSE41-NEXT: .LBB39_3: +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB39_4 +; SSE41-NEXT: # %bb.5: +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; SSE41-NEXT: retq +; SSE41-NEXT: .LBB39_4: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE41-NEXT: addss %xmm0, %xmm0 +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; SSE41-NEXT: retq +; +; VEX-LABEL: uitofp_2i64_to_4f32: +; VEX: # %bb.0: +; VEX-NEXT: vpextrq $1, %xmm0, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB39_1 +; VEX-NEXT: # %bb.2: +; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; VEX-NEXT: jmp .LBB39_3 +; VEX-NEXT: .LBB39_1: +; 
VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; VEX-NEXT: vaddss %xmm1, %xmm1, %xmm1 +; VEX-NEXT: .LBB39_3: +; VEX-NEXT: vmovq %xmm0, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB39_4 +; VEX-NEXT: # %bb.5: +; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 +; VEX-NEXT: jmp .LBB39_6 +; VEX-NEXT: .LBB39_4: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 +; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0 +; VEX-NEXT: .LBB39_6: +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; VEX-NEXT: js .LBB39_8 +; VEX-NEXT: # %bb.7: +; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1 +; VEX-NEXT: .LBB39_8: +; VEX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0] +; VEX-NEXT: retq +; +; AVX512F-LABEL: uitofp_2i64_to_4f32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm0 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm1 +; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0] +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: uitofp_2i64_to_4f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm0 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm1 +; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0] +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: uitofp_2i64_to_4f32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: uitofp_2i64_to_4f32: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtuqq2ps %xmm0, %xmm0 +; AVX512VLDQ-NEXT: retq + %cvt = uitofp <2 x i64> %a to <2 x float> + %ext = shufflevector <2 x float> %cvt, <2 x float> undef, <4 x i32> + ret <4 x float> %ext +} + +define <4 x float> @uitofp_2i64_to_2f32(<2 x i64> %a) { +; SSE2-LABEL: uitofp_2i64_to_2f32: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB40_1 +; SSE2-NEXT: # %bb.2: +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE2-NEXT: jmp .LBB40_3 +; SSE2-NEXT: .LBB40_1: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE2-NEXT: addss %xmm1, %xmm1 +; SSE2-NEXT: .LBB40_3: +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB40_4 +; SSE2-NEXT: # %bb.5: +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE2-NEXT: jmp .LBB40_6 +; SSE2-NEXT: .LBB40_4: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE2-NEXT: addss %xmm0, %xmm0 +; SSE2-NEXT: .LBB40_6: +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 
+; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_2i64_to_2f32: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB40_1 +; SSE41-NEXT: # %bb.2: +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE41-NEXT: jmp .LBB40_3 +; SSE41-NEXT: .LBB40_1: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE41-NEXT: addss %xmm0, %xmm0 +; SSE41-NEXT: .LBB40_3: +; SSE41-NEXT: pextrq $1, %xmm1, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB40_4 +; SSE41-NEXT: # %bb.5: +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero +; SSE41-NEXT: retq +; SSE41-NEXT: .LBB40_4: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE41-NEXT: addss %xmm1, %xmm1 +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero +; SSE41-NEXT: retq +; +; VEX-LABEL: uitofp_2i64_to_2f32: +; VEX: # %bb.0: +; VEX-NEXT: vmovq %xmm0, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB40_1 +; VEX-NEXT: # %bb.2: +; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; VEX-NEXT: jmp .LBB40_3 +; VEX-NEXT: .LBB40_1: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; VEX-NEXT: vaddss %xmm1, %xmm1, %xmm1 +; VEX-NEXT: .LBB40_3: +; VEX-NEXT: vpextrq $1, %xmm0, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB40_4 +; VEX-NEXT: # %bb.5: +; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero +; VEX-NEXT: retq +; VEX-NEXT: .LBB40_4: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 +; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0 +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero +; VEX-NEXT: retq +; +; AVX512F-LABEL: uitofp_2i64_to_2f32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm0 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: uitofp_2i64_to_2f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1 +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm0 +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: uitofp_2i64_to_2f32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0 +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: uitofp_2i64_to_2f32: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtuqq2ps %xmm0, %xmm0 +; AVX512VLDQ-NEXT: retq + %cvt = uitofp <2 x i64> %a to <2 x float> + %ext = shufflevector <2 x float> %cvt, <2 x float> zeroinitializer, <4 x i32> + ret <4 x float> %ext +} + +define <4 x 
float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) { +; SSE2-LABEL: uitofp_4i64_to_4f32_undef: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB41_1 +; SSE2-NEXT: # %bb.2: +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE2-NEXT: jmp .LBB41_3 +; SSE2-NEXT: .LBB41_1: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE2-NEXT: addss %xmm0, %xmm0 +; SSE2-NEXT: .LBB41_3: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB41_4 +; SSE2-NEXT: # %bb.5: +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE2-NEXT: jmp .LBB41_6 +; SSE2-NEXT: .LBB41_4: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE2-NEXT: addss %xmm1, %xmm1 +; SSE2-NEXT: .LBB41_6: +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: js .LBB41_8 +; SSE2-NEXT: # %bb.7: +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE2-NEXT: .LBB41_8: +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0] +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_4i64_to_4f32_undef: +; SSE41: # %bb.0: +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB41_1 +; SSE41-NEXT: # %bb.2: +; SSE41-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE41-NEXT: jmp .LBB41_3 +; SSE41-NEXT: .LBB41_1: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE41-NEXT: addss %xmm1, %xmm1 +; SSE41-NEXT: .LBB41_3: +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB41_4 +; SSE41-NEXT: # %bb.5: +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE41-NEXT: jmp .LBB41_6 +; SSE41-NEXT: .LBB41_4: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE41-NEXT: addss %xmm0, %xmm0 +; SSE41-NEXT: .LBB41_6: +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: js .LBB41_8 +; SSE41-NEXT: # %bb.7: +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE41-NEXT: .LBB41_8: +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0] +; SSE41-NEXT: retq +; +; VEX-LABEL: uitofp_4i64_to_4f32_undef: +; VEX: # %bb.0: +; VEX-NEXT: vpextrq $1, %xmm0, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB41_1 +; VEX-NEXT: # %bb.2: +; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; VEX-NEXT: jmp .LBB41_3 +; VEX-NEXT: .LBB41_1: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; VEX-NEXT: vaddss %xmm1, %xmm1, %xmm1 +; VEX-NEXT: .LBB41_3: +; VEX-NEXT: vmovq %xmm0, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB41_4 +; VEX-NEXT: # %bb.5: +; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 +; VEX-NEXT: jmp .LBB41_6 +; VEX-NEXT: .LBB41_4: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; 
VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 +; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0 +; VEX-NEXT: .LBB41_6: +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; VEX-NEXT: js .LBB41_8 +; VEX-NEXT: # %bb.7: +; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1 +; VEX-NEXT: .LBB41_8: +; VEX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0] +; VEX-NEXT: retq +; +; AVX512F-LABEL: uitofp_4i64_to_4f32_undef: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm0 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm1 +; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0] +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: uitofp_4i64_to_4f32_undef: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm0 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm1 +; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0] +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: uitofp_4i64_to_4f32_undef: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: uitofp_4i64_to_4f32_undef: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512VLDQ-NEXT: vcvtuqq2ps %ymm0, %xmm0 +; AVX512VLDQ-NEXT: vzeroupper +; AVX512VLDQ-NEXT: retq + %ext = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> + %cvt = uitofp <4 x i64> %ext to <4 x float> + ret <4 x float> %cvt +} + +define <4 x float> @uitofp_4i32_to_4f32(<4 x i32> %a) { +; SSE2-LABEL: uitofp_4i32_to_4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: por {{.*}}(%rip), %xmm1 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: por {{.*}}(%rip), %xmm0 +; SSE2-NEXT: addps {{.*}}(%rip), %xmm0 +; SSE2-NEXT: addps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_4i32_to_4f32: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1258291200,1258291200,1258291200,1258291200] +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] +; SSE41-NEXT: addps {{.*}}(%rip), %xmm0 +; SSE41-NEXT: addps %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: uitofp_4i32_to_4f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] +; AVX1-NEXT: vaddps {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: uitofp_4i32_to_4f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1258291200,1258291200,1258291200,1258291200] +; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1392508928,1392508928,1392508928,1392508928] +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] +; AVX2-NEXT: vbroadcastss {{.*#+}} xmm2 = [-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11] +; AVX2-NEXT: vaddps %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: uitofp_4i32_to_4f32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: uitofp_4i32_to_4f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvtudq2ps %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: uitofp_4i32_to_4f32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: uitofp_4i32_to_4f32: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtudq2ps %xmm0, %xmm0 +; AVX512VLDQ-NEXT: retq + %cvt = uitofp <4 x i32> %a to <4 x float> + ret <4 x float> %cvt +} + +define <4 x float> @uitofp_4i16_to_4f32(<8 x i16> %a) { +; SSE2-LABEL: uitofp_4i16_to_4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_4i16_to_4f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: uitofp_4i16_to_4f32: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 +; AVX-NEXT: retq + %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> + %cvt = uitofp <4 x i16> %shuf to <4 x float> + ret <4 x float> %cvt +} + +define <4 x float> @uitofp_8i16_to_4f32(<8 x i16> %a) { +; SSE2-LABEL: uitofp_8i16_to_4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_8i16_to_4f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: uitofp_8i16_to_4f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: uitofp_8i16_to_4f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: 
vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: uitofp_8i16_to_4f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %cvt = uitofp <8 x i16> %a to <8 x float>
+ %shuf = shufflevector <8 x float> %cvt, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x float> %shuf
+}
+
+define <4 x float> @uitofp_4i8_to_4f32(<16 x i8> %a) {
+; SSE2-LABEL: uitofp_4i8_to_4f32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: uitofp_4i8_to_4f32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: uitofp_4i8_to_4f32:
+; AVX: # %bb.0:
+; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
+; AVX-NEXT: retq
+ %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %cvt = uitofp <4 x i8> %shuf to <4 x float>
+ ret <4 x float> %cvt
+}
+
+define <4 x float> @uitofp_16i8_to_4f32(<16 x i8> %a) {
+; SSE2-LABEL: uitofp_16i8_to_4f32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: uitofp_16i8_to_4f32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: uitofp_16i8_to_4f32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: uitofp_16i8_to_4f32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: uitofp_16i8_to_4f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 =
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512-NEXT: vcvtdq2ps %zmm0, %zmm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %cvt = uitofp <16 x i8> %a to <16 x float> + %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <4 x i32> + ret <4 x float> %shuf +} + +define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) { +; SSE2-LABEL: uitofp_4i64_to_4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB47_1 +; SSE2-NEXT: # %bb.2: +; SSE2-NEXT: cvtsi2ssq %rax, %xmm2 +; SSE2-NEXT: jmp .LBB47_3 +; SSE2-NEXT: .LBB47_1: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: cvtsi2ssq %rax, %xmm2 +; SSE2-NEXT: addss %xmm2, %xmm2 +; SSE2-NEXT: .LBB47_3: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB47_4 +; SSE2-NEXT: # %bb.5: +; SSE2-NEXT: cvtsi2ssq %rax, %xmm3 +; SSE2-NEXT: jmp .LBB47_6 +; SSE2-NEXT: .LBB47_4: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: cvtsi2ssq %rax, %xmm3 +; SSE2-NEXT: addss %xmm3, %xmm3 +; SSE2-NEXT: .LBB47_6: +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB47_7 +; SSE2-NEXT: # %bb.8: +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE2-NEXT: jmp .LBB47_9 +; SSE2-NEXT: .LBB47_7: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE2-NEXT: addss %xmm1, %xmm1 +; SSE2-NEXT: .LBB47_9: +; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB47_10 +; SSE2-NEXT: # %bb.11: +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE2-NEXT: jmp .LBB47_12 +; SSE2-NEXT: .LBB47_10: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE2-NEXT: addss %xmm0, %xmm0 +; SSE2-NEXT: .LBB47_12: +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_4i64_to_4f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB47_1 +; SSE41-NEXT: # %bb.2: +; SSE41-NEXT: cvtsi2ssq %rax, %xmm2 +; SSE41-NEXT: jmp .LBB47_3 +; SSE41-NEXT: .LBB47_1: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: cvtsi2ssq %rax, %xmm2 +; SSE41-NEXT: addss %xmm2, %xmm2 +; SSE41-NEXT: .LBB47_3: +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB47_4 +; SSE41-NEXT: # %bb.5: +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm0 +; 
SSE41-NEXT: jmp .LBB47_6 +; SSE41-NEXT: .LBB47_4: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE41-NEXT: addss %xmm0, %xmm0 +; SSE41-NEXT: .LBB47_6: +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] +; SSE41-NEXT: movq %xmm1, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB47_7 +; SSE41-NEXT: # %bb.8: +; SSE41-NEXT: xorps %xmm2, %xmm2 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm2 +; SSE41-NEXT: jmp .LBB47_9 +; SSE41-NEXT: .LBB47_7: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: xorps %xmm2, %xmm2 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm2 +; SSE41-NEXT: addss %xmm2, %xmm2 +; SSE41-NEXT: .LBB47_9: +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; SSE41-NEXT: pextrq $1, %xmm1, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB47_10 +; SSE41-NEXT: # %bb.11: +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; SSE41-NEXT: retq +; SSE41-NEXT: .LBB47_10: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE41-NEXT: addss %xmm1, %xmm1 +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; SSE41-NEXT: retq +; +; AVX1-LABEL: uitofp_4i64_to_4f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: js .LBB47_1 +; AVX1-NEXT: # %bb.2: +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; AVX1-NEXT: jmp .LBB47_3 +; AVX1-NEXT: .LBB47_1: +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: .LBB47_3: +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: js .LBB47_4 +; AVX1-NEXT: # %bb.5: +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 +; AVX1-NEXT: jmp .LBB47_6 +; AVX1-NEXT: .LBB47_4: +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 +; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: .LBB47_6: +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: js .LBB47_7 +; AVX1-NEXT: # %bb.8: +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 +; AVX1-NEXT: jmp .LBB47_9 +; AVX1-NEXT: .LBB47_7: +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 +; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: .LBB47_9: +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: js .LBB47_10 +; AVX1-NEXT: # %bb.11: +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB47_10: +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 +; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} 
xmm0 = xmm1[0,1,2],xmm0[0] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: uitofp_4i64_to_4f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: testq %rax, %rax +; AVX2-NEXT: js .LBB47_1 +; AVX2-NEXT: # %bb.2: +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; AVX2-NEXT: jmp .LBB47_3 +; AVX2-NEXT: .LBB47_1: +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shrq %rcx +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: orq %rcx, %rax +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: .LBB47_3: +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: testq %rax, %rax +; AVX2-NEXT: js .LBB47_4 +; AVX2-NEXT: # %bb.5: +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 +; AVX2-NEXT: jmp .LBB47_6 +; AVX2-NEXT: .LBB47_4: +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shrq %rcx +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: orq %rcx, %rax +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 +; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: .LBB47_6: +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: testq %rax, %rax +; AVX2-NEXT: js .LBB47_7 +; AVX2-NEXT: # %bb.8: +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 +; AVX2-NEXT: jmp .LBB47_9 +; AVX2-NEXT: .LBB47_7: +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shrq %rcx +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: orq %rcx, %rax +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 +; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: .LBB47_9: +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: testq %rax, %rax +; AVX2-NEXT: js .LBB47_10 +; AVX2-NEXT: # %bb.11: +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 +; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; AVX2-NEXT: .LBB47_10: +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shrq %rcx +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: orq %rcx, %rax +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 +; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: uitofp_4i64_to_4f32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: uitofp_4i64_to_4f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = 
xmm1[0,1,2],xmm0[0] +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: uitofp_4i64_to_4f32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: uitofp_4i64_to_4f32: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtuqq2ps %ymm0, %xmm0 +; AVX512VLDQ-NEXT: vzeroupper +; AVX512VLDQ-NEXT: retq + %cvt = uitofp <4 x i64> %a to <4 x float> + ret <4 x float> %cvt +} + +define <8 x float> @uitofp_8i32_to_8f32(<8 x i32> %a) { +; SSE2-LABEL: uitofp_8i32_to_8f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1258291200,1258291200,1258291200,1258291200] +; SSE2-NEXT: por %xmm4, %xmm3 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1392508928,1392508928,1392508928,1392508928] +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: movaps {{.*#+}} xmm6 = [-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11] +; SSE2-NEXT: addps %xmm6, %xmm0 +; SSE2-NEXT: addps %xmm3, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: addps %xmm6, %xmm1 +; SSE2-NEXT: addps %xmm2, %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_8i32_to_8f32: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1258291200,1258291200,1258291200,1258291200] +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1392508928,1392508928,1392508928,1392508928] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7] +; SSE41-NEXT: movaps {{.*#+}} xmm5 = [-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11] +; SSE41-NEXT: addps %xmm5, %xmm0 +; SSE41-NEXT: addps %xmm3, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7] +; SSE41-NEXT: addps %xmm5, %xmm1 +; SSE41-NEXT: addps %xmm2, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: uitofp_8i32_to_8f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1 +; AVX1-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: uitofp_8i32_to_8f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200] +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] +; AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11] +; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: uitofp_8i32_to_8f32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0 +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: uitofp_8i32_to_8f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvtudq2ps %ymm0, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: uitofp_8i32_to_8f32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: uitofp_8i32_to_8f32: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtudq2ps %ymm0, %ymm0 +; AVX512VLDQ-NEXT: retq + %cvt = uitofp <8 x i32> %a to <8 x float> + ret <8 x float> %cvt +} + +define <8 x float> @uitofp_8i16_to_8f32(<8 x i16> %a) { +; SSE2-LABEL: uitofp_8i16_to_8f32: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: cvtdq2ps %xmm2, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_8i16_to_8f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: cvtdq2ps %xmm1, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: cvtdq2ps %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: uitofp_8i16_to_8f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: uitofp_8i16_to_8f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: uitofp_8i16_to_8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX512-NEXT: retq + %cvt = uitofp <8 x i16> %a to <8 x float> + ret <8 x float> %cvt +} + +define <8 x float> @uitofp_8i8_to_8f32(<16 x i8> %a) { +; SSE2-LABEL: uitofp_8i8_to_8f32: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 
= xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: cvtdq2ps %xmm2, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_8i8_to_8f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE41-NEXT: cvtdq2ps %xmm1, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE41-NEXT: cvtdq2ps %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: uitofp_8i8_to_8f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: uitofp_8i8_to_8f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: uitofp_8i8_to_8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX512-NEXT: retq + %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> + %cvt = uitofp <8 x i8> %shuf to <8 x float> + ret <8 x float> %cvt +} + +define <8 x float> @uitofp_16i8_to_8f32(<16 x i8> %a) { +; SSE2-LABEL: uitofp_16i8_to_8f32: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: cvtdq2ps %xmm2, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_16i8_to_8f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE41-NEXT: cvtdq2ps %xmm1, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE41-NEXT: cvtdq2ps %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: uitofp_16i8_to_8f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} 
xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: uitofp_16i8_to_8f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: uitofp_16i8_to_8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512-NEXT: vcvtdq2ps %zmm0, %zmm0 +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512-NEXT: retq + %cvt = uitofp <16 x i8> %a to <16 x float> + %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <8 x i32> + ret <8 x float> %shuf +} + +; +; Load Signed Integer to Double +; + +define <2 x double> @sitofp_load_2i64_to_2f64(<2 x i64> *%a) { +; SSE2-LABEL: sitofp_load_2i64_to_2f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm1 +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: cvtsi2sdq %rax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: cvtsi2sdq %rax, %xmm1 +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_load_2i64_to_2f64: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa (%rdi), %xmm0 +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: cvtsi2sdq %rax, %xmm1 +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2sdq %rax, %xmm0 +; SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE41-NEXT: retq +; +; VEX-LABEL: sitofp_load_2i64_to_2f64: +; VEX: # %bb.0: +; VEX-NEXT: vmovdqa (%rdi), %xmm0 +; VEX-NEXT: vpextrq $1, %xmm0, %rax +; VEX-NEXT: vcvtsi2sdq %rax, %xmm1, %xmm1 +; VEX-NEXT: vmovq %xmm0, %rax +; VEX-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm0 +; VEX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; VEX-NEXT: retq +; +; AVX512F-LABEL: sitofp_load_2i64_to_2f64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm0 +; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: sitofp_load_2i64_to_2f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax +; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm1, %xmm1 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm0 +; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: sitofp_load_2i64_to_2f64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0 +; AVX512DQ-NEXT: vcvtqq2pd %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: sitofp_load_2i64_to_2f64: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtqq2pd (%rdi), 
%xmm0
+; AVX512VLDQ-NEXT: retq
+ %ld = load <2 x i64>, <2 x i64> *%a
+ %cvt = sitofp <2 x i64> %ld to <2 x double>
+ ret <2 x double> %cvt
+}
+
+define <2 x double> @sitofp_load_2i32_to_2f64(<2 x i32> *%a) {
+; SSE-LABEL: sitofp_load_2i32_to_2f64:
+; SSE: # %bb.0:
+; SSE-NEXT: cvtdq2pd (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sitofp_load_2i32_to_2f64:
+; AVX: # %bb.0:
+; AVX-NEXT: vcvtdq2pd (%rdi), %xmm0
+; AVX-NEXT: retq
+ %ld = load <2 x i32>, <2 x i32> *%a
+ %cvt = sitofp <2 x i32> %ld to <2 x double>
+ ret <2 x double> %cvt
+}
+
+define <2 x double> @sitofp_load_2i16_to_2f64(<2 x i16> *%a) {
+; SSE2-LABEL: sitofp_load_2i16_to_2f64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
+; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: sitofp_load_2i16_to_2f64:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movswl 2(%rdi), %eax
+; SSE41-NEXT: movswl (%rdi), %ecx
+; SSE41-NEXT: movd %ecx, %xmm0
+; SSE41-NEXT: pinsrd $1, %eax, %xmm0
+; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: sitofp_load_2i16_to_2f64:
+; AVX: # %bb.0:
+; AVX-NEXT: movswl 2(%rdi), %eax
+; AVX-NEXT: movswl (%rdi), %ecx
+; AVX-NEXT: vmovd %ecx, %xmm0
+; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
+; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
+; AVX-NEXT: retq
+ %ld = load <2 x i16>, <2 x i16> *%a
+ %cvt = sitofp <2 x i16> %ld to <2 x double>
+ ret <2 x double> %cvt
+}
+
+define <2 x double> @sitofp_load_2i8_to_2f64(<2 x i8> *%a) {
+; SSE2-LABEL: sitofp_load_2i8_to_2f64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movzwl (%rdi), %eax
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: psrad $24, %xmm0
+; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: sitofp_load_2i8_to_2f64:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movsbl 1(%rdi), %eax
+; SSE41-NEXT: movsbl (%rdi), %ecx
+; SSE41-NEXT: movd %ecx, %xmm0
+; SSE41-NEXT: pinsrd $1, %eax, %xmm0
+; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: sitofp_load_2i8_to_2f64:
+; AVX: # %bb.0:
+; AVX-NEXT: movsbl 1(%rdi), %eax
+; AVX-NEXT: movsbl (%rdi), %ecx
+; AVX-NEXT: vmovd %ecx, %xmm0
+; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
+; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
+; AVX-NEXT: retq
+ %ld = load <2 x i8>, <2 x i8> *%a
+ %cvt = sitofp <2 x i8> %ld to <2 x double>
+ ret <2 x double> %cvt
+}
+
+define <4 x double> @sitofp_load_4i64_to_4f64(<4 x i64> *%a) {
+; SSE2-LABEL: sitofp_load_4i64_to_4f64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa (%rdi), %xmm1
+; SSE2-NEXT: movdqa 16(%rdi), %xmm2
+; SSE2-NEXT: movq %xmm1, %rax
+; SSE2-NEXT: cvtsi2sdq %rax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: movq %xmm1, %rax
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: cvtsi2sdq %rax, %xmm1
+; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: movq %xmm2, %rax
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: cvtsi2sdq %rax, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE2-NEXT: movq %xmm2, %rax
+; SSE2-NEXT: xorps %xmm2, %xmm2
+; SSE2-NEXT: cvtsi2sdq %rax, %xmm2
+; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: sitofp_load_4i64_to_4f64:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa (%rdi), %xmm0
+; SSE41-NEXT: movdqa 16(%rdi), %xmm1
+; SSE41-NEXT: pextrq $1, %xmm0, %rax
+; SSE41-NEXT: cvtsi2sdq %rax, %xmm2
+;
SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2sdq %rax, %xmm0 +; SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE41-NEXT: pextrq $1, %xmm1, %rax +; SSE41-NEXT: xorps %xmm2, %xmm2 +; SSE41-NEXT: cvtsi2sdq %rax, %xmm2 +; SSE41-NEXT: movq %xmm1, %rax +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: cvtsi2sdq %rax, %xmm1 +; SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE41-NEXT: retq +; +; VEX-LABEL: sitofp_load_4i64_to_4f64: +; VEX: # %bb.0: +; VEX-NEXT: vmovdqa (%rdi), %xmm0 +; VEX-NEXT: vmovdqa 16(%rdi), %xmm1 +; VEX-NEXT: vpextrq $1, %xmm1, %rax +; VEX-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 +; VEX-NEXT: vmovq %xmm1, %rax +; VEX-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 +; VEX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; VEX-NEXT: vpextrq $1, %xmm0, %rax +; VEX-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2 +; VEX-NEXT: vmovq %xmm0, %rax +; VEX-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0 +; VEX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; VEX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; VEX-NEXT: retq +; +; AVX512F-LABEL: sitofp_load_4i64_to_4f64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512F-NEXT: vpextrq $1, %xmm1, %rax +; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 +; AVX512F-NEXT: vmovq %xmm1, %rax +; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 +; AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2 +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0 +; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: sitofp_load_4i64_to_4f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax +; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 +; AVX512VL-NEXT: vmovq %xmm1, %rax +; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 +; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax +; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0 +; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: sitofp_load_4i64_to_4f64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vmovaps (%rdi), %ymm0 +; AVX512DQ-NEXT: vcvtqq2pd %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: sitofp_load_4i64_to_4f64: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtqq2pd (%rdi), %ymm0 +; AVX512VLDQ-NEXT: retq + %ld = load <4 x i64>, <4 x i64> *%a + %cvt = sitofp <4 x i64> %ld to <4 x double> + ret <4 x double> %cvt +} + +define <4 x double> @sitofp_load_4i32_to_4f64(<4 x i32> *%a) { +; SSE-LABEL: sitofp_load_4i32_to_4f64: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: cvtdq2pd %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: sitofp_load_4i32_to_4f64: +; AVX: # %bb.0: +; AVX-NEXT: vcvtdq2pd (%rdi), %ymm0 +; AVX-NEXT: retq + %ld = load <4 x i32>, <4 x i32> *%a + %cvt = sitofp <4 x i32> %ld to <4 x double> + ret <4 x double> %cvt +} + +define <4 x double> @sitofp_load_4i16_to_4f64(<4 x i16> *%a) { +; SSE2-LABEL: sitofp_load_4i16_to_4f64: +; 
SSE2: # %bb.0: +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_load_4i16_to_4f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxwd (%rdi), %xmm1 +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE41-NEXT: retq +; +; AVX-LABEL: sitofp_load_4i16_to_4f64: +; AVX: # %bb.0: +; AVX-NEXT: vpmovsxwd (%rdi), %xmm0 +; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0 +; AVX-NEXT: retq + %ld = load <4 x i16>, <4 x i16> *%a + %cvt = sitofp <4 x i16> %ld to <4 x double> + ret <4 x double> %cvt +} + +define <4 x double> @sitofp_load_4i8_to_4f64(<4 x i8> *%a) { +; SSE2-LABEL: sitofp_load_4i8_to_4f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: psrad $24, %xmm1 +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_load_4i8_to_4f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxbd (%rdi), %xmm1 +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE41-NEXT: retq +; +; AVX-LABEL: sitofp_load_4i8_to_4f64: +; AVX: # %bb.0: +; AVX-NEXT: vpmovsxbd (%rdi), %xmm0 +; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0 +; AVX-NEXT: retq + %ld = load <4 x i8>, <4 x i8> *%a + %cvt = sitofp <4 x i8> %ld to <4 x double> + ret <4 x double> %cvt +} + +; +; Load Unsigned Integer to Double +; + +define <2 x double> @uitofp_load_2i64_to_2f64(<2 x i64> *%a) { +; SSE2-LABEL: uitofp_load_2i64_to_2f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,4294967295] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: por {{.*}}(%rip), %xmm1 +; SSE2-NEXT: psrlq $32, %xmm0 +; SSE2-NEXT: por {{.*}}(%rip), %xmm0 +; SSE2-NEXT: subpd {{.*}}(%rip), %xmm0 +; SSE2-NEXT: addpd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_load_2i64_to_2f64: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa (%rdi), %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; SSE41-NEXT: por {{.*}}(%rip), %xmm1 +; SSE41-NEXT: psrlq $32, %xmm0 +; SSE41-NEXT: por {{.*}}(%rip), %xmm0 +; SSE41-NEXT: subpd {{.*}}(%rip), %xmm0 +; SSE41-NEXT: addpd %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: uitofp_load_2i64_to_2f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; AVX1-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: uitofp_load_2i64_to_2f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX2-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0 +; 
AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: uitofp_load_2i64_to_2f64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1 +; AVX512F-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: uitofp_load_2i64_to_2f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1 +; AVX512VL-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1 +; AVX512VL-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX512VL-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: uitofp_load_2i64_to_2f64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0 +; AVX512DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: uitofp_load_2i64_to_2f64: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtuqq2pd (%rdi), %xmm0 +; AVX512VLDQ-NEXT: retq + %ld = load <2 x i64>, <2 x i64> *%a + %cvt = uitofp <2 x i64> %ld to <2 x double> + ret <2 x double> %cvt +} + +define <2 x double> @uitofp_load_2i32_to_2f64(<2 x i32> *%a) { +; SSE2-LABEL: uitofp_load_2i32_to_2f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,0,0] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE2-NEXT: mulpd {{.*}}(%rip), %xmm0 +; SSE2-NEXT: addpd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_load_2i32_to_2f64: +; SSE41: # %bb.0: +; SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3,4,5,6,7] +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE41-NEXT: mulpd {{.*}}(%rip), %xmm0 +; SSE41-NEXT: addpd %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; VEX-LABEL: uitofp_load_2i32_to_2f64: +; VEX: # %bb.0: +; VEX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; VEX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; VEX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3,4,5,6,7] +; VEX-NEXT: vcvtdq2pd %xmm1, %xmm1 +; VEX-NEXT: vpsrld $16, %xmm0, %xmm0 +; VEX-NEXT: vcvtdq2pd %xmm0, %xmm0 +; VEX-NEXT: vmulpd {{.*}}(%rip), %xmm0, %xmm0 +; VEX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; VEX-NEXT: retq +; +; AVX512F-LABEL: uitofp_load_2i32_to_2f64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: uitofp_load_2i32_to_2f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvtudq2pd (%rdi), %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: uitofp_load_2i32_to_2f64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; 
AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: uitofp_load_2i32_to_2f64: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtudq2pd (%rdi), %xmm0 +; AVX512VLDQ-NEXT: retq + %ld = load <2 x i32>, <2 x i32> *%a + %cvt = uitofp <2 x i32> %ld to <2 x double> + ret <2 x double> %cvt +} + +define <2 x double> @uitofp_load_2i16_to_2f64(<2 x i16> *%a) { +; SSE2-LABEL: uitofp_load_2i16_to_2f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_load_2i16_to_2f64: +; SSE41: # %bb.0: +; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: uitofp_load_2i16_to_2f64: +; AVX: # %bb.0: +; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 +; AVX-NEXT: retq + %ld = load <2 x i16>, <2 x i16> *%a + %cvt = uitofp <2 x i16> %ld to <2 x double> + ret <2 x double> %cvt +} + +define <2 x double> @uitofp_load_2i8_to_2f64(<2 x i8> *%a) { +; SSE2-LABEL: uitofp_load_2i8_to_2f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movzwl (%rdi), %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_load_2i8_to_2f64: +; SSE41: # %bb.0: +; SSE41-NEXT: movzbl 1(%rdi), %eax +; SSE41-NEXT: movzbl (%rdi), %ecx +; SSE41-NEXT: movd %ecx, %xmm0 +; SSE41-NEXT: pinsrd $1, %eax, %xmm0 +; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: uitofp_load_2i8_to_2f64: +; AVX: # %bb.0: +; AVX-NEXT: movzbl 1(%rdi), %eax +; AVX-NEXT: movzbl (%rdi), %ecx +; AVX-NEXT: vmovd %ecx, %xmm0 +; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 +; AVX-NEXT: retq + %ld = load <2 x i8>, <2 x i8> *%a + %cvt = uitofp <2 x i8> %ld to <2 x double> + ret <2 x double> %cvt +} + +define <4 x double> @uitofp_load_4i64_to_4f64(<4 x i64> *%a) { +; SSE2-LABEL: uitofp_load_4i64_to_4f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa 16(%rdi), %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200] +; SSE2-NEXT: por %xmm4, %xmm3 +; SSE2-NEXT: psrlq $32, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072] +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25] +; SSE2-NEXT: subpd %xmm6, %xmm0 +; SSE2-NEXT: addpd %xmm3, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: psrlq $32, %xmm1 +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: subpd %xmm6, %xmm1 +; SSE2-NEXT: addpd %xmm2, %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_load_4i64_to_4f64: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa (%rdi), %xmm0 +; SSE41-NEXT: movdqa 
16(%rdi), %xmm1 +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200] +; SSE41-NEXT: por %xmm4, %xmm3 +; SSE41-NEXT: psrlq $32, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072] +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25] +; SSE41-NEXT: subpd %xmm6, %xmm0 +; SSE41-NEXT: addpd %xmm3, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; SSE41-NEXT: por %xmm4, %xmm2 +; SSE41-NEXT: psrlq $32, %xmm1 +; SSE41-NEXT: por %xmm5, %xmm1 +; SSE41-NEXT: subpd %xmm6, %xmm1 +; SSE41-NEXT: addpd %xmm2, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: uitofp_load_4i64_to_4f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = mem[0],ymm0[1],mem[2],ymm0[3],mem[4],ymm0[5],mem[6],ymm0[7] +; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vmovdqa (%rdi), %xmm1 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vorpd {{.*}}(%rip), %ymm1, %ymm1 +; AVX1-NEXT: vsubpd {{.*}}(%rip), %ymm1, %ymm1 +; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: uitofp_load_4i64_to_4f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200] +; AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072] +; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25] +; AVX2-NEXT: vsubpd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: uitofp_load_4i64_to_4f64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200] +; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072] +; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25] +; AVX512F-NEXT: vsubpd %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vaddpd %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: uitofp_load_4i64_to_4f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to4}, %ymm0, %ymm1 +; AVX512VL-NEXT: vporq {{.*}}(%rip){1to4}, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX512VL-NEXT: vporq {{.*}}(%rip){1to4}, %ymm0, %ymm0 +; AVX512VL-NEXT: vsubpd {{.*}}(%rip){1to4}, %ymm0, 
%ymm0 +; AVX512VL-NEXT: vaddpd %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: uitofp_load_4i64_to_4f64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vmovaps (%rdi), %ymm0 +; AVX512DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: uitofp_load_4i64_to_4f64: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtuqq2pd (%rdi), %ymm0 +; AVX512VLDQ-NEXT: retq + %ld = load <4 x i64>, <4 x i64> *%a + %cvt = uitofp <4 x i64> %ld to <4 x double> + ret <4 x double> %cvt +} + +define <4 x double> @uitofp_load_4i32_to_4f64(<4 x i32> *%a) { +; SSE2-LABEL: uitofp_load_4i32_to_4f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE2-NEXT: movapd {{.*#+}} xmm2 = [6.5536E+4,6.5536E+4] +; SSE2-NEXT: mulpd %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE2-NEXT: addpd %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm4 +; SSE2-NEXT: mulpd %xmm2, %xmm4 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm3 +; SSE2-NEXT: cvtdq2pd %xmm3, %xmm1 +; SSE2-NEXT: addpd %xmm4, %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_load_4i32_to_4f64: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa (%rdi), %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE41-NEXT: movapd {{.*#+}} xmm2 = [6.5536E+4,6.5536E+4] +; SSE41-NEXT: mulpd %xmm2, %xmm1 +; SSE41-NEXT: pxor %xmm3, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7] +; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE41-NEXT: addpd %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3,4,5,6,7] +; SSE41-NEXT: psrld $16, %xmm4 +; SSE41-NEXT: cvtdq2pd %xmm4, %xmm4 +; SSE41-NEXT: mulpd %xmm2, %xmm4 +; SSE41-NEXT: cvtdq2pd %xmm3, %xmm1 +; SSE41-NEXT: addpd %xmm4, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: uitofp_load_4i32_to_4f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; AVX1-NEXT: vcvtdq2pd %xmm1, %ymm1 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0 +; AVX1-NEXT: vmulpd {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: uitofp_load_4i32_to_4f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vcvtdq2pd %xmm1, %ymm1 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4] +; AVX2-NEXT: vmulpd %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] +; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0 +; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: uitofp_load_4i32_to_4f64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps (%rdi), %xmm0 +; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0 +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: uitofp_load_4i32_to_4f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvtudq2pd (%rdi), %ymm0 +; AVX512VL-NEXT: retq +; +; 
AVX512DQ-LABEL: uitofp_load_4i32_to_4f64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0 +; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: uitofp_load_4i32_to_4f64: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtudq2pd (%rdi), %ymm0 +; AVX512VLDQ-NEXT: retq + %ld = load <4 x i32>, <4 x i32> *%a + %cvt = uitofp <4 x i32> %ld to <4 x double> + ret <4 x double> %cvt +} + +define <4 x double> @uitofp_load_4i16_to_4f64(<4 x i16> *%a) { +; SSE2-LABEL: uitofp_load_4i16_to_4f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_load_4i16_to_4f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE41-NEXT: retq +; +; AVX-LABEL: uitofp_load_4i16_to_4f64: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0 +; AVX-NEXT: retq + %ld = load <4 x i16>, <4 x i16> *%a + %cvt = uitofp <4 x i16> %ld to <4 x double> + ret <4 x double> %cvt +} + +define <4 x double> @uitofp_load_4i8_to_4f64(<4 x i8> *%a) { +; SSE2-LABEL: uitofp_load_4i8_to_4f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_load_4i8_to_4f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE41-NEXT: retq +; +; AVX-LABEL: uitofp_load_4i8_to_4f64: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0 +; AVX-NEXT: retq + %ld = load <4 x i8>, <4 x i8> *%a + %cvt = uitofp <4 x i8> %ld to <4 x double> + ret <4 x double> %cvt +} + +; +; Load Signed Integer to Float +; + +define <4 x float> @sitofp_load_4i64_to_4f32(<4 x i64> *%a) { +; SSE2-LABEL: sitofp_load_4i64_to_4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm1 +; SSE2-NEXT: movdqa 16(%rdi), %xmm0 +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: cvtsi2ssq %rax, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: movq 
%xmm1, %rax +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_load_4i64_to_4f32: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa (%rdi), %xmm0 +; SSE41-NEXT: movdqa 16(%rdi), %xmm1 +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: cvtsi2ssq %rax, %xmm2 +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] +; SSE41-NEXT: movq %xmm1, %rax +; SSE41-NEXT: xorps %xmm2, %xmm2 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm2 +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; SSE41-NEXT: pextrq $1, %xmm1, %rax +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; SSE41-NEXT: retq +; +; VEX-LABEL: sitofp_load_4i64_to_4f32: +; VEX: # %bb.0: +; VEX-NEXT: vmovdqa (%rdi), %xmm0 +; VEX-NEXT: vmovdqa 16(%rdi), %xmm1 +; VEX-NEXT: vpextrq $1, %xmm0, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 +; VEX-NEXT: vmovq %xmm0, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] +; VEX-NEXT: vmovq %xmm1, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; VEX-NEXT: vpextrq $1, %xmm1, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1 +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; VEX-NEXT: retq +; +; AVX512F-LABEL: sitofp_load_4i64_to_4f32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] +; AVX512F-NEXT: vmovq %xmm1, %rax +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; AVX512F-NEXT: vpextrq $1, %xmm1, %rax +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: sitofp_load_4i64_to_4f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] +; AVX512VL-NEXT: vmovq %xmm1, %rax +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: sitofp_load_4i64_to_4f32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vmovaps (%rdi), %ymm0 +; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: sitofp_load_4i64_to_4f32: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtqq2psy (%rdi), %xmm0 +; AVX512VLDQ-NEXT: retq + %ld = load <4 x i64>, <4 x i64> *%a + %cvt = sitofp <4 x i64> %ld to <4 x float> + ret <4 x 
float> %cvt +} + +define <4 x float> @sitofp_load_4i32_to_4f32(<4 x i32> *%a) { +; SSE-LABEL: sitofp_load_4i32_to_4f32: +; SSE: # %bb.0: +; SSE-NEXT: cvtdq2ps (%rdi), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sitofp_load_4i32_to_4f32: +; AVX: # %bb.0: +; AVX-NEXT: vcvtdq2ps (%rdi), %xmm0 +; AVX-NEXT: retq + %ld = load <4 x i32>, <4 x i32> *%a + %cvt = sitofp <4 x i32> %ld to <4 x float> + ret <4 x float> %cvt +} + +define <4 x float> @sitofp_load_4i16_to_4f32(<4 x i16> *%a) { +; SSE2-LABEL: sitofp_load_4i16_to_4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_load_4i16_to_4f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxwd (%rdi), %xmm0 +; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: sitofp_load_4i16_to_4f32: +; AVX: # %bb.0: +; AVX-NEXT: vpmovsxwd (%rdi), %xmm0 +; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 +; AVX-NEXT: retq + %ld = load <4 x i16>, <4 x i16> *%a + %cvt = sitofp <4 x i16> %ld to <4 x float> + ret <4 x float> %cvt +} + +define <4 x float> @sitofp_load_4i8_to_4f32(<4 x i8> *%a) { +; SSE2-LABEL: sitofp_load_4i8_to_4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm0 +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_load_4i8_to_4f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxbd (%rdi), %xmm0 +; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: sitofp_load_4i8_to_4f32: +; AVX: # %bb.0: +; AVX-NEXT: vpmovsxbd (%rdi), %xmm0 +; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 +; AVX-NEXT: retq + %ld = load <4 x i8>, <4 x i8> *%a + %cvt = sitofp <4 x i8> %ld to <4 x float> + ret <4 x float> %cvt +} + +define <8 x float> @sitofp_load_8i64_to_8f32(<8 x i64> *%a) { +; SSE2-LABEL: sitofp_load_8i64_to_8f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm1 +; SSE2-NEXT: movdqa 16(%rdi), %xmm0 +; SSE2-NEXT: movdqa 32(%rdi), %xmm2 +; SSE2-NEXT: movdqa 48(%rdi), %xmm3 +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: cvtsi2ssq %rax, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE2-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; SSE2-NEXT: movq %xmm3, %rax +; SSE2-NEXT: xorps %xmm4, %xmm4 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE2-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm2 +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; 
SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_load_8i64_to_8f32: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa (%rdi), %xmm0 +; SSE41-NEXT: movdqa 16(%rdi), %xmm1 +; SSE41-NEXT: movdqa 32(%rdi), %xmm2 +; SSE41-NEXT: movdqa 48(%rdi), %xmm3 +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: cvtsi2ssq %rax, %xmm4 +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3] +; SSE41-NEXT: movq %xmm1, %rax +; SSE41-NEXT: xorps %xmm4, %xmm4 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm4 +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0],xmm0[3] +; SSE41-NEXT: pextrq $1, %xmm1, %rax +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; SSE41-NEXT: pextrq $1, %xmm2, %rax +; SSE41-NEXT: xorps %xmm4, %xmm4 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm4 +; SSE41-NEXT: movq %xmm2, %rax +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2,3] +; SSE41-NEXT: movq %xmm3, %rax +; SSE41-NEXT: xorps %xmm2, %xmm2 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm2 +; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; SSE41-NEXT: pextrq $1, %xmm3, %rax +; SSE41-NEXT: xorps %xmm2, %xmm2 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm2 +; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] +; SSE41-NEXT: retq +; +; VEX-LABEL: sitofp_load_8i64_to_8f32: +; VEX: # %bb.0: +; VEX-NEXT: vmovdqa (%rdi), %xmm0 +; VEX-NEXT: vmovdqa 16(%rdi), %xmm1 +; VEX-NEXT: vmovdqa 32(%rdi), %xmm2 +; VEX-NEXT: vmovdqa 48(%rdi), %xmm3 +; VEX-NEXT: vpextrq $1, %xmm2, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4 +; VEX-NEXT: vmovq %xmm2, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm2 +; VEX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3] +; VEX-NEXT: vmovq %xmm3, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm4 +; VEX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] +; VEX-NEXT: vpextrq $1, %xmm3, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 +; VEX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] +; VEX-NEXT: vpextrq $1, %xmm0, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 +; VEX-NEXT: vmovq %xmm0, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm0 +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3] +; VEX-NEXT: vmovq %xmm1, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3] +; VEX-NEXT: vpextrq $1, %xmm1, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm1 +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; VEX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; VEX-NEXT: retq +; +; AVX512F-LABEL: sitofp_load_8i64_to_8f32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512F-NEXT: vpextrq $1, %xmm2, %rax +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4 +; AVX512F-NEXT: vmovq %xmm2, %rax +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm2 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3] +; AVX512F-NEXT: vmovq %xmm3, %rax +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm4 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] +; AVX512F-NEXT: vpextrq $1, %xmm3, %rax +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] +; 
AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm0 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3] +; AVX512F-NEXT: vmovq %xmm1, %rax +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3] +; AVX512F-NEXT: vpextrq $1, %xmm1, %rax +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm1 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: sitofp_load_8i64_to_8f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512VL-NEXT: vpextrq $1, %xmm2, %rax +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4 +; AVX512VL-NEXT: vmovq %xmm2, %rax +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm2 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3] +; AVX512VL-NEXT: vmovq %xmm3, %rax +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm4 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] +; AVX512VL-NEXT: vpextrq $1, %xmm3, %rax +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm0 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3] +; AVX512VL-NEXT: vmovq %xmm1, %rax +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3] +; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm1 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: sitofp_load_8i64_to_8f32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvtqq2ps (%rdi), %ymm0 +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: sitofp_load_8i64_to_8f32: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtqq2ps (%rdi), %ymm0 +; AVX512VLDQ-NEXT: retq + %ld = load <8 x i64>, <8 x i64> *%a + %cvt = sitofp <8 x i64> %ld to <8 x float> + ret <8 x float> %cvt +} + +define <8 x float> @sitofp_load_8i32_to_8f32(<8 x i32> *%a) { +; SSE-LABEL: sitofp_load_8i32_to_8f32: +; SSE: # %bb.0: +; SSE-NEXT: cvtdq2ps (%rdi), %xmm0 +; SSE-NEXT: cvtdq2ps 16(%rdi), %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: sitofp_load_8i32_to_8f32: +; AVX: # %bb.0: +; AVX-NEXT: vcvtdq2ps (%rdi), %ymm0 +; AVX-NEXT: retq + %ld = load <8 x i32>, <8 x i32> *%a + %cvt = sitofp <8 x i32> %ld to <8 x float> + ret <8 x float> %cvt +} + +define <8 x float> @sitofp_load_8i16_to_8f32(<8 x i16> *%a) { +; SSE2-LABEL: sitofp_load_8i16_to_8f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE2-NEXT: cvtdq2ps %xmm1, %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_load_8i16_to_8f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxwd 8(%rdi), %xmm1 +; SSE41-NEXT: pmovsxwd (%rdi), %xmm0 +; SSE41-NEXT: 
cvtdq2ps %xmm0, %xmm0 +; SSE41-NEXT: cvtdq2ps %xmm1, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: sitofp_load_8i16_to_8f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovsxwd 8(%rdi), %xmm0 +; AVX1-NEXT: vpmovsxwd (%rdi), %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: sitofp_load_8i16_to_8f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovsxwd (%rdi), %ymm0 +; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: sitofp_load_8i16_to_8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxwd (%rdi), %ymm0 +; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX512-NEXT: retq + %ld = load <8 x i16>, <8 x i16> *%a + %cvt = sitofp <8 x i16> %ld to <8 x float> + ret <8 x float> %cvt +} + +define <8 x float> @sitofp_load_8i8_to_8f32(<8 x i8> *%a) { +; SSE2-LABEL: sitofp_load_8i8_to_8f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: psrad $24, %xmm1 +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm0 +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE2-NEXT: cvtdq2ps %xmm1, %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_load_8i8_to_8f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxbd 4(%rdi), %xmm1 +; SSE41-NEXT: pmovsxbd (%rdi), %xmm0 +; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE41-NEXT: cvtdq2ps %xmm1, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: sitofp_load_8i8_to_8f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovsxbw (%rdi), %xmm0 +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: sitofp_load_8i8_to_8f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovsxbd (%rdi), %ymm0 +; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: sitofp_load_8i8_to_8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbd (%rdi), %ymm0 +; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX512-NEXT: retq + %ld = load <8 x i8>, <8 x i8> *%a + %cvt = sitofp <8 x i8> %ld to <8 x float> + ret <8 x float> %cvt +} + +; +; Load Unsigned Integer to Float +; + +define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) { +; SSE2-LABEL: uitofp_load_4i64_to_4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm2 +; SSE2-NEXT: movdqa 16(%rdi), %xmm0 +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB76_1 +; SSE2-NEXT: # %bb.2: +; SSE2-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE2-NEXT: jmp .LBB76_3 +; SSE2-NEXT: .LBB76_1: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE2-NEXT: addss %xmm1, %xmm1 +; SSE2-NEXT: .LBB76_3: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB76_4 +; SSE2-NEXT: # %bb.5: +; SSE2-NEXT: cvtsi2ssq %rax, %xmm3 +; SSE2-NEXT: jmp .LBB76_6 +; SSE2-NEXT: .LBB76_4: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: cvtsi2ssq %rax, %xmm3 +; SSE2-NEXT: addss %xmm3, %xmm3 +; SSE2-NEXT: .LBB76_6: +; 
SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB76_7 +; SSE2-NEXT: # %bb.8: +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE2-NEXT: jmp .LBB76_9 +; SSE2-NEXT: .LBB76_7: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE2-NEXT: addss %xmm0, %xmm0 +; SSE2-NEXT: .LBB76_9: +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB76_10 +; SSE2-NEXT: # %bb.11: +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm2 +; SSE2-NEXT: jmp .LBB76_12 +; SSE2-NEXT: .LBB76_10: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm2 +; SSE2-NEXT: addss %xmm2, %xmm2 +; SSE2-NEXT: .LBB76_12: +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_load_4i64_to_4f32: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa (%rdi), %xmm0 +; SSE41-NEXT: movdqa 16(%rdi), %xmm1 +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB76_1 +; SSE41-NEXT: # %bb.2: +; SSE41-NEXT: cvtsi2ssq %rax, %xmm2 +; SSE41-NEXT: jmp .LBB76_3 +; SSE41-NEXT: .LBB76_1: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: cvtsi2ssq %rax, %xmm2 +; SSE41-NEXT: addss %xmm2, %xmm2 +; SSE41-NEXT: .LBB76_3: +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB76_4 +; SSE41-NEXT: # %bb.5: +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE41-NEXT: jmp .LBB76_6 +; SSE41-NEXT: .LBB76_4: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE41-NEXT: addss %xmm0, %xmm0 +; SSE41-NEXT: .LBB76_6: +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] +; SSE41-NEXT: movq %xmm1, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB76_7 +; SSE41-NEXT: # %bb.8: +; SSE41-NEXT: xorps %xmm2, %xmm2 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm2 +; SSE41-NEXT: jmp .LBB76_9 +; SSE41-NEXT: .LBB76_7: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: xorps %xmm2, %xmm2 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm2 +; SSE41-NEXT: addss %xmm2, %xmm2 +; SSE41-NEXT: .LBB76_9: +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; SSE41-NEXT: pextrq $1, %xmm1, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB76_10 +; SSE41-NEXT: # %bb.11: +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; SSE41-NEXT: retq +; SSE41-NEXT: .LBB76_10: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE41-NEXT: addss %xmm1, %xmm1 +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; SSE41-NEXT: retq +; +; VEX-LABEL: uitofp_load_4i64_to_4f32: +; VEX: # %bb.0: +; VEX-NEXT: vmovdqa (%rdi), 
%xmm2 +; VEX-NEXT: vmovdqa 16(%rdi), %xmm0 +; VEX-NEXT: vpextrq $1, %xmm2, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB76_1 +; VEX-NEXT: # %bb.2: +; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; VEX-NEXT: jmp .LBB76_3 +; VEX-NEXT: .LBB76_1: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; VEX-NEXT: vaddss %xmm1, %xmm1, %xmm1 +; VEX-NEXT: .LBB76_3: +; VEX-NEXT: vmovq %xmm2, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB76_4 +; VEX-NEXT: # %bb.5: +; VEX-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 +; VEX-NEXT: jmp .LBB76_6 +; VEX-NEXT: .LBB76_4: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 +; VEX-NEXT: vaddss %xmm2, %xmm2, %xmm2 +; VEX-NEXT: .LBB76_6: +; VEX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; VEX-NEXT: vmovq %xmm0, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB76_7 +; VEX-NEXT: # %bb.8: +; VEX-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 +; VEX-NEXT: jmp .LBB76_9 +; VEX-NEXT: .LBB76_7: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 +; VEX-NEXT: vaddss %xmm2, %xmm2, %xmm2 +; VEX-NEXT: .LBB76_9: +; VEX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; VEX-NEXT: vpextrq $1, %xmm0, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB76_10 +; VEX-NEXT: # %bb.11: +; VEX-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; VEX-NEXT: retq +; VEX-NEXT: .LBB76_10: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 +; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0 +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; VEX-NEXT: retq +; +; AVX512F-LABEL: uitofp_load_4i64_to_4f32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2 +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] +; AVX512F-NEXT: vmovq %xmm1, %rax +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; AVX512F-NEXT: vpextrq $1, %xmm1, %rax +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm1 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: uitofp_load_4i64_to_4f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] +; AVX512VL-NEXT: vmovq %xmm1, %rax +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm1 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: uitofp_load_4i64_to_4f32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vmovaps (%rdi), %ymm0 +; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0 +; 
AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: uitofp_load_4i64_to_4f32: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtuqq2psy (%rdi), %xmm0 +; AVX512VLDQ-NEXT: retq + %ld = load <4 x i64>, <4 x i64> *%a + %cvt = uitofp <4 x i64> %ld to <4 x float> + ret <4 x float> %cvt +} + +define <4 x float> @uitofp_load_4i32_to_4f32(<4 x i32> *%a) { +; SSE2-LABEL: uitofp_load_4i32_to_4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: por {{.*}}(%rip), %xmm1 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: por {{.*}}(%rip), %xmm0 +; SSE2-NEXT: addps {{.*}}(%rip), %xmm0 +; SSE2-NEXT: addps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_load_4i32_to_4f32: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa (%rdi), %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1258291200,1258291200,1258291200,1258291200] +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] +; SSE41-NEXT: addps {{.*}}(%rip), %xmm0 +; SSE41-NEXT: addps %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: uitofp_load_4i32_to_4f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] +; AVX1-NEXT: vaddps {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: uitofp_load_4i32_to_4f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1258291200,1258291200,1258291200,1258291200] +; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1392508928,1392508928,1392508928,1392508928] +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] +; AVX2-NEXT: vbroadcastss {{.*#+}} xmm2 = [-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11] +; AVX2-NEXT: vaddps %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: uitofp_load_4i32_to_4f32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps (%rdi), %xmm0 +; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: uitofp_load_4i32_to_4f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvtudq2ps (%rdi), %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: uitofp_load_4i32_to_4f32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0 +; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: uitofp_load_4i32_to_4f32: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtudq2ps (%rdi), %xmm0 +; AVX512VLDQ-NEXT: retq + %ld = load <4 x i32>, <4 x i32> *%a + %cvt = uitofp <4 x i32> %ld to <4 x float> + ret <4 x float> %cvt +} + +define <4 x float> @uitofp_load_4i16_to_4f32(<4 x i16> *%a) { +; SSE2-LABEL: uitofp_load_4i16_to_4f32: +; SSE2: # %bb.0: +; 
SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_load_4i16_to_4f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: uitofp_load_4i16_to_4f32: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 +; AVX-NEXT: retq + %ld = load <4 x i16>, <4 x i16> *%a + %cvt = uitofp <4 x i16> %ld to <4 x float> + ret <4 x float> %cvt +} + +define <4 x float> @uitofp_load_4i8_to_4f32(<4 x i8> *%a) { +; SSE2-LABEL: uitofp_load_4i8_to_4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_load_4i8_to_4f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: uitofp_load_4i8_to_4f32: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 +; AVX-NEXT: retq + %ld = load <4 x i8>, <4 x i8> *%a + %cvt = uitofp <4 x i8> %ld to <4 x float> + ret <4 x float> %cvt +} + +define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { +; SSE2-LABEL: uitofp_load_8i64_to_8f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm5 +; SSE2-NEXT: movdqa 16(%rdi), %xmm0 +; SSE2-NEXT: movdqa 32(%rdi), %xmm2 +; SSE2-NEXT: movdqa 48(%rdi), %xmm1 +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB80_1 +; SSE2-NEXT: # %bb.2: +; SSE2-NEXT: cvtsi2ssq %rax, %xmm3 +; SSE2-NEXT: jmp .LBB80_3 +; SSE2-NEXT: .LBB80_1: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: cvtsi2ssq %rax, %xmm3 +; SSE2-NEXT: addss %xmm3, %xmm3 +; SSE2-NEXT: .LBB80_3: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB80_4 +; SSE2-NEXT: # %bb.5: +; SSE2-NEXT: cvtsi2ssq %rax, %xmm4 +; SSE2-NEXT: jmp .LBB80_6 +; SSE2-NEXT: .LBB80_4: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: cvtsi2ssq %rax, %xmm4 +; SSE2-NEXT: addss %xmm4, %xmm4 +; SSE2-NEXT: .LBB80_6: +; SSE2-NEXT: movq %xmm5, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB80_7 +; SSE2-NEXT: # %bb.8: +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE2-NEXT: jmp .LBB80_9 +; SSE2-NEXT: .LBB80_7: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE2-NEXT: addss %xmm0, %xmm0 +; SSE2-NEXT: .LBB80_9: +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1] +; SSE2-NEXT: movq %xmm5, 
%rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB80_10 +; SSE2-NEXT: # %bb.11: +; SSE2-NEXT: cvtsi2ssq %rax, %xmm6 +; SSE2-NEXT: jmp .LBB80_12 +; SSE2-NEXT: .LBB80_10: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: cvtsi2ssq %rax, %xmm6 +; SSE2-NEXT: addss %xmm6, %xmm6 +; SSE2-NEXT: .LBB80_12: +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB80_13 +; SSE2-NEXT: # %bb.14: +; SSE2-NEXT: xorps %xmm5, %xmm5 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm5 +; SSE2-NEXT: jmp .LBB80_15 +; SSE2-NEXT: .LBB80_13: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: xorps %xmm5, %xmm5 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm5 +; SSE2-NEXT: addss %xmm5, %xmm5 +; SSE2-NEXT: .LBB80_15: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB80_16 +; SSE2-NEXT: # %bb.17: +; SSE2-NEXT: cvtsi2ssq %rax, %xmm7 +; SSE2-NEXT: jmp .LBB80_18 +; SSE2-NEXT: .LBB80_16: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: cvtsi2ssq %rax, %xmm7 +; SSE2-NEXT: addss %xmm7, %xmm7 +; SSE2-NEXT: .LBB80_18: +; SSE2-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB80_19 +; SSE2-NEXT: # %bb.20: +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE2-NEXT: jmp .LBB80_21 +; SSE2-NEXT: .LBB80_19: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE2-NEXT: addss %xmm1, %xmm1 +; SSE2-NEXT: .LBB80_21: +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; SSE2-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB80_22 +; SSE2-NEXT: # %bb.23: +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm2 +; SSE2-NEXT: jmp .LBB80_24 +; SSE2-NEXT: .LBB80_22: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm2 +; SSE2-NEXT: addss %xmm2, %xmm2 +; SSE2-NEXT: .LBB80_24: +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_load_8i64_to_8f32: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa (%rdi), %xmm0 +; SSE41-NEXT: movdqa 16(%rdi), %xmm4 +; SSE41-NEXT: movdqa 32(%rdi), %xmm1 +; SSE41-NEXT: movdqa 48(%rdi), %xmm2 +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB80_1 +; SSE41-NEXT: # %bb.2: +; SSE41-NEXT: cvtsi2ssq %rax, %xmm3 +; SSE41-NEXT: jmp .LBB80_3 +; SSE41-NEXT: .LBB80_1: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: cvtsi2ssq %rax, %xmm3 +; SSE41-NEXT: addss %xmm3, %xmm3 +; SSE41-NEXT: .LBB80_3: +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB80_4 +; SSE41-NEXT: # %bb.5: +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm0 +; 
SSE41-NEXT: jmp .LBB80_6 +; SSE41-NEXT: .LBB80_4: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE41-NEXT: addss %xmm0, %xmm0 +; SSE41-NEXT: .LBB80_6: +; SSE41-NEXT: movq %xmm4, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB80_7 +; SSE41-NEXT: # %bb.8: +; SSE41-NEXT: cvtsi2ssq %rax, %xmm5 +; SSE41-NEXT: jmp .LBB80_9 +; SSE41-NEXT: .LBB80_7: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: cvtsi2ssq %rax, %xmm5 +; SSE41-NEXT: addss %xmm5, %xmm5 +; SSE41-NEXT: .LBB80_9: +; SSE41-NEXT: pextrq $1, %xmm4, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB80_10 +; SSE41-NEXT: # %bb.11: +; SSE41-NEXT: xorps %xmm4, %xmm4 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm4 +; SSE41-NEXT: jmp .LBB80_12 +; SSE41-NEXT: .LBB80_10: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: xorps %xmm4, %xmm4 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm4 +; SSE41-NEXT: addss %xmm4, %xmm4 +; SSE41-NEXT: .LBB80_12: +; SSE41-NEXT: pextrq $1, %xmm1, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB80_13 +; SSE41-NEXT: # %bb.14: +; SSE41-NEXT: cvtsi2ssq %rax, %xmm6 +; SSE41-NEXT: jmp .LBB80_15 +; SSE41-NEXT: .LBB80_13: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: cvtsi2ssq %rax, %xmm6 +; SSE41-NEXT: addss %xmm6, %xmm6 +; SSE41-NEXT: .LBB80_15: +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3] +; SSE41-NEXT: movq %xmm1, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB80_16 +; SSE41-NEXT: # %bb.17: +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE41-NEXT: jmp .LBB80_18 +; SSE41-NEXT: .LBB80_16: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE41-NEXT: addss %xmm1, %xmm1 +; SSE41-NEXT: .LBB80_18: +; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[2,3] +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm5[0],xmm0[3] +; SSE41-NEXT: movq %xmm2, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB80_19 +; SSE41-NEXT: # %bb.20: +; SSE41-NEXT: xorps %xmm3, %xmm3 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm3 +; SSE41-NEXT: jmp .LBB80_21 +; SSE41-NEXT: .LBB80_19: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: xorps %xmm3, %xmm3 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm3 +; SSE41-NEXT: addss %xmm3, %xmm3 +; SSE41-NEXT: .LBB80_21: +; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0] +; SSE41-NEXT: pextrq $1, %xmm2, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB80_22 +; SSE41-NEXT: # %bb.23: +; SSE41-NEXT: xorps %xmm2, %xmm2 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm2 +; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] +; SSE41-NEXT: retq +; SSE41-NEXT: .LBB80_22: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: xorps %xmm2, %xmm2 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm2 +; SSE41-NEXT: addss %xmm2, %xmm2 +; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] +; SSE41-NEXT: retq +; +; VEX-LABEL: 
uitofp_load_8i64_to_8f32: +; VEX: # %bb.0: +; VEX-NEXT: vmovdqa (%rdi), %xmm1 +; VEX-NEXT: vmovdqa 16(%rdi), %xmm0 +; VEX-NEXT: vmovdqa 32(%rdi), %xmm4 +; VEX-NEXT: vmovdqa 48(%rdi), %xmm3 +; VEX-NEXT: vpextrq $1, %xmm4, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB80_1 +; VEX-NEXT: # %bb.2: +; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 +; VEX-NEXT: jmp .LBB80_3 +; VEX-NEXT: .LBB80_1: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 +; VEX-NEXT: vaddss %xmm2, %xmm2, %xmm2 +; VEX-NEXT: .LBB80_3: +; VEX-NEXT: vmovq %xmm4, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB80_4 +; VEX-NEXT: # %bb.5: +; VEX-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5 +; VEX-NEXT: jmp .LBB80_6 +; VEX-NEXT: .LBB80_4: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm4 +; VEX-NEXT: vaddss %xmm4, %xmm4, %xmm5 +; VEX-NEXT: .LBB80_6: +; VEX-NEXT: vmovq %xmm3, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB80_7 +; VEX-NEXT: # %bb.8: +; VEX-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm4 +; VEX-NEXT: jmp .LBB80_9 +; VEX-NEXT: .LBB80_7: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm4 +; VEX-NEXT: vaddss %xmm4, %xmm4, %xmm4 +; VEX-NEXT: .LBB80_9: +; VEX-NEXT: vpextrq $1, %xmm3, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB80_10 +; VEX-NEXT: # %bb.11: +; VEX-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3 +; VEX-NEXT: jmp .LBB80_12 +; VEX-NEXT: .LBB80_10: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3 +; VEX-NEXT: vaddss %xmm3, %xmm3, %xmm3 +; VEX-NEXT: .LBB80_12: +; VEX-NEXT: vpextrq $1, %xmm1, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB80_13 +; VEX-NEXT: # %bb.14: +; VEX-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm6 +; VEX-NEXT: jmp .LBB80_15 +; VEX-NEXT: .LBB80_13: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm6 +; VEX-NEXT: vaddss %xmm6, %xmm6, %xmm6 +; VEX-NEXT: .LBB80_15: +; VEX-NEXT: vinsertps {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[2,3] +; VEX-NEXT: vmovq %xmm1, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB80_16 +; VEX-NEXT: # %bb.17: +; VEX-NEXT: vcvtsi2ssq %rax, %xmm7, %xmm1 +; VEX-NEXT: jmp .LBB80_18 +; VEX-NEXT: .LBB80_16: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm7, %xmm1 +; VEX-NEXT: vaddss %xmm1, %xmm1, %xmm1 +; VEX-NEXT: .LBB80_18: +; VEX-NEXT: vinsertps {{.*#+}} xmm5 = xmm1[0],xmm6[0],xmm1[2,3] +; VEX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm4[0],xmm2[3] +; VEX-NEXT: vmovq %xmm0, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB80_19 +; VEX-NEXT: # %bb.20: +; VEX-NEXT: vcvtsi2ssq %rax, %xmm7, %xmm2 +; VEX-NEXT: jmp .LBB80_21 +; VEX-NEXT: .LBB80_19: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm7, %xmm2 +; VEX-NEXT: vaddss %xmm2, %xmm2, %xmm2 +; VEX-NEXT: .LBB80_21: +; VEX-NEXT: vinsertps {{.*#+}} xmm2 = xmm5[0,1],xmm2[0],xmm5[3] +; VEX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0] +; VEX-NEXT: vpextrq $1, %xmm0, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB80_22 +; VEX-NEXT: # %bb.23: 
+; VEX-NEXT: vcvtsi2ssq %rax, %xmm7, %xmm0 +; VEX-NEXT: jmp .LBB80_24 +; VEX-NEXT: .LBB80_22: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm7, %xmm0 +; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0 +; VEX-NEXT: .LBB80_24: +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] +; VEX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; VEX-NEXT: retq +; +; AVX512F-LABEL: uitofp_load_8i64_to_8f32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512F-NEXT: vpextrq $1, %xmm2, %rax +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm4 +; AVX512F-NEXT: vmovq %xmm2, %rax +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm2 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3] +; AVX512F-NEXT: vmovq %xmm3, %rax +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm4 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] +; AVX512F-NEXT: vpextrq $1, %xmm3, %rax +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm3 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm3 +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm0 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3] +; AVX512F-NEXT: vmovq %xmm1, %rax +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm3 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3] +; AVX512F-NEXT: vpextrq $1, %xmm1, %rax +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm1 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: uitofp_load_8i64_to_8f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512VL-NEXT: vpextrq $1, %xmm2, %rax +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm4 +; AVX512VL-NEXT: vmovq %xmm2, %rax +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm2 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3] +; AVX512VL-NEXT: vmovq %xmm3, %rax +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm4 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] +; AVX512VL-NEXT: vpextrq $1, %xmm3, %rax +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm3 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm3 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm0 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3] +; AVX512VL-NEXT: vmovq %xmm1, %rax +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm3 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3] +; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm1 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: uitofp_load_8i64_to_8f32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvtuqq2ps (%rdi), %ymm0 +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: uitofp_load_8i64_to_8f32: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtuqq2ps (%rdi), %ymm0 +; AVX512VLDQ-NEXT: retq + %ld = load <8 x 
i64>, <8 x i64> *%a + %cvt = uitofp <8 x i64> %ld to <8 x float> + ret <8 x float> %cvt +} + +define <8 x float> @uitofp_load_8i32_to_8f32(<8 x i32> *%a) { +; SSE2-LABEL: uitofp_load_8i32_to_8f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa 16(%rdi), %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1258291200,1258291200,1258291200,1258291200] +; SSE2-NEXT: por %xmm4, %xmm3 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1392508928,1392508928,1392508928,1392508928] +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: movaps {{.*#+}} xmm6 = [-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11] +; SSE2-NEXT: addps %xmm6, %xmm0 +; SSE2-NEXT: addps %xmm3, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: addps %xmm6, %xmm1 +; SSE2-NEXT: addps %xmm2, %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_load_8i32_to_8f32: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa (%rdi), %xmm0 +; SSE41-NEXT: movdqa 16(%rdi), %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1258291200,1258291200,1258291200,1258291200] +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1392508928,1392508928,1392508928,1392508928] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7] +; SSE41-NEXT: movaps {{.*#+}} xmm5 = [-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11] +; SSE41-NEXT: addps %xmm5, %xmm0 +; SSE41-NEXT: addps %xmm3, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7] +; SSE41-NEXT: addps %xmm5, %xmm1 +; SSE41-NEXT: addps %xmm2, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: uitofp_load_8i32_to_8f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps (%rdi), %ymm0 +; AVX1-NEXT: vmovdqa (%rdi), %xmm1 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1 +; AVX1-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: uitofp_load_8i32_to_8f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200] +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] +; AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = 
[-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11] +; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: uitofp_load_8i32_to_8f32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps (%rdi), %ymm0 +; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0 +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: uitofp_load_8i32_to_8f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvtudq2ps (%rdi), %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: uitofp_load_8i32_to_8f32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vmovaps (%rdi), %ymm0 +; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: uitofp_load_8i32_to_8f32: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtudq2ps (%rdi), %ymm0 +; AVX512VLDQ-NEXT: retq + %ld = load <8 x i32>, <8 x i32> *%a + %cvt = uitofp <8 x i32> %ld to <8 x float> + ret <8 x float> %cvt +} + +define <8 x float> @uitofp_load_8i16_to_8f32(<8 x i16> *%a) { +; SSE2-LABEL: uitofp_load_8i16_to_8f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: cvtdq2ps %xmm1, %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_load_8i16_to_8f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE41-NEXT: cvtdq2ps %xmm1, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: uitofp_load_8i16_to_8f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: uitofp_load_8i16_to_8f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: uitofp_load_8i16_to_8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX512-NEXT: retq + %ld = load <8 x i16>, <8 x i16> *%a + %cvt = uitofp <8 x i16> %ld to <8 x float> + ret <8 x float> %cvt +} + +define <8 x float> @uitofp_load_8i8_to_8f32(<8 x i8> *%a) { +; SSE2-LABEL: uitofp_load_8i8_to_8f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = 
xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: cvtdq2ps %xmm1, %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_load_8i8_to_8f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE41-NEXT: cvtdq2ps %xmm1, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: uitofp_load_8i8_to_8f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: uitofp_load_8i8_to_8f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: uitofp_load_8i8_to_8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX512-NEXT: retq + %ld = load <8 x i8>, <8 x i8> *%a + %cvt = uitofp <8 x i8> %ld to <8 x float> + ret <8 x float> %cvt +} + +; +; Aggregates +; + +%Arguments = type <{ <8 x i8>, <8 x i16>, <8 x float>* }> +define void @aggregate_sitofp_8i16_to_8f32(%Arguments* nocapture readonly %a0) { +; SSE2-LABEL: aggregate_sitofp_8i16_to_8f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movq 24(%rdi), %rax +; SSE2-NEXT: movdqu 8(%rdi), %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: cvtdq2ps %xmm1, %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE2-NEXT: movaps %xmm0, 16(%rax) +; SSE2-NEXT: movaps %xmm1, (%rax) +; SSE2-NEXT: retq +; +; SSE41-LABEL: aggregate_sitofp_8i16_to_8f32: +; SSE41: # %bb.0: +; SSE41-NEXT: movq 24(%rdi), %rax +; SSE41-NEXT: movdqu 8(%rdi), %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovsxwd %xmm1, %xmm1 +; SSE41-NEXT: cvtdq2ps %xmm1, %xmm1 +; SSE41-NEXT: pmovsxwd %xmm0, %xmm0 +; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE41-NEXT: movaps %xmm0, (%rax) +; SSE41-NEXT: movaps %xmm1, 16(%rax) +; SSE41-NEXT: retq +; +; AVX1-LABEL: aggregate_sitofp_8i16_to_8f32: +; AVX1: # %bb.0: +; AVX1-NEXT: movq 24(%rdi), %rax +; AVX1-NEXT: vpmovsxwd 16(%rdi), %xmm0 +; AVX1-NEXT: vpmovsxwd 8(%rdi), %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vmovaps %ymm0, (%rax) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: aggregate_sitofp_8i16_to_8f32: +; AVX2: # %bb.0: +; AVX2-NEXT: movq 24(%rdi), %rax +; AVX2-NEXT: vpmovsxwd 8(%rdi), %ymm0 +; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX2-NEXT: vmovaps %ymm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: aggregate_sitofp_8i16_to_8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: 
movq 24(%rdi), %rax +; AVX512-NEXT: vpmovsxwd 8(%rdi), %ymm0 +; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX512-NEXT: vmovaps %ymm0, (%rax) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = load %Arguments, %Arguments* %a0, align 1 + %2 = extractvalue %Arguments %1, 1 + %3 = extractvalue %Arguments %1, 2 + %4 = sitofp <8 x i16> %2 to <8 x float> + store <8 x float> %4, <8 x float>* %3, align 32 + ret void +} + +define <2 x double> @sitofp_i32_to_2f64(<2 x double> %a0, i32 %a1) nounwind { +; SSE-LABEL: sitofp_i32_to_2f64: +; SSE: # %bb.0: +; SSE-NEXT: cvtsi2sdl %edi, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sitofp_i32_to_2f64: +; AVX: # %bb.0: +; AVX-NEXT: vcvtsi2sdl %edi, %xmm0, %xmm0 +; AVX-NEXT: retq + %cvt = sitofp i32 %a1 to double + %res = insertelement <2 x double> %a0, double %cvt, i32 0 + ret <2 x double> %res +} + +define <4 x float> @sitofp_i32_to_4f32(<4 x float> %a0, i32 %a1) nounwind { +; SSE-LABEL: sitofp_i32_to_4f32: +; SSE: # %bb.0: +; SSE-NEXT: cvtsi2ssl %edi, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sitofp_i32_to_4f32: +; AVX: # %bb.0: +; AVX-NEXT: vcvtsi2ssl %edi, %xmm0, %xmm0 +; AVX-NEXT: retq + %cvt = sitofp i32 %a1 to float + %res = insertelement <4 x float> %a0, float %cvt, i32 0 + ret <4 x float> %res +} + +define <2 x double> @sitofp_i64_to_2f64(<2 x double> %a0, i64 %a1) nounwind { +; SSE-LABEL: sitofp_i64_to_2f64: +; SSE: # %bb.0: +; SSE-NEXT: cvtsi2sdq %rdi, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sitofp_i64_to_2f64: +; AVX: # %bb.0: +; AVX-NEXT: vcvtsi2sdq %rdi, %xmm0, %xmm0 +; AVX-NEXT: retq + %cvt = sitofp i64 %a1 to double + %res = insertelement <2 x double> %a0, double %cvt, i32 0 + ret <2 x double> %res +} + +define <4 x float> @sitofp_i64_to_4f32(<4 x float> %a0, i64 %a1) nounwind { +; SSE-LABEL: sitofp_i64_to_4f32: +; SSE: # %bb.0: +; SSE-NEXT: cvtsi2ssq %rdi, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sitofp_i64_to_4f32: +; AVX: # %bb.0: +; AVX-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0 +; AVX-NEXT: retq + %cvt = sitofp i64 %a1 to float + %res = insertelement <4 x float> %a0, float %cvt, i32 0 + ret <4 x float> %res +} diff --git a/llvm/test/CodeGen/X86/vector-sext-widen.ll b/llvm/test/CodeGen/X86/vector-sext-widen.ll new file mode 100644 index 000000000000..3986600d9ba2 --- /dev/null +++ b/llvm/test/CodeGen/X86/vector-sext-widen.ll @@ -0,0 +1,5095 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s 
--check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW +; +; Just one 32-bit run to make sure we do reasonable things there. +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X32-SSE41 + +define <8 x i16> @sext_16i8_to_8i16(<16 x i8> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: sext_16i8_to_8i16: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: psraw $8, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_16i8_to_8i16: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: psraw $8, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_16i8_to_8i16: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxbw %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: sext_16i8_to_8i16: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpmovsxbw %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X32-SSE41-LABEL: sext_16i8_to_8i16: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm0 +; X32-SSE41-NEXT: retl +entry: + %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> + %C = sext <8 x i8> %B to <8 x i16> + ret <8 x i16> %C +} + +define <16 x i16> @sext_16i8_to_16i16(<16 x i8> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: sext_16i8_to_16i16: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: psraw $8, %xmm0 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: psraw $8, %xmm1 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_16i8_to_16i16: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: psraw $8, %xmm0 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: psraw $8, %xmm1 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_16i8_to_16i16: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxbw %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovsxbw %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: sext_16i8_to_16i16: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovsxbw %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: sext_16i8_to_16i16: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: sext_16i8_to_16i16: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512-NEXT: retq +; +; X32-SSE41-LABEL: sext_16i8_to_16i16: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm2 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm1 +; X32-SSE41-NEXT: movdqa %xmm2, %xmm0 +; X32-SSE41-NEXT: retl +entry: + %B = sext <16 x i8> %A to <16 x i16> + ret <16 x i16> %B +} + +define <32 x i16> @sext_32i8_to_32i16(<32 x i8> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: sext_32i8_to_32i16: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: psraw $8, %xmm0 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = 
xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: psraw $8, %xmm1 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE2-NEXT: psraw $8, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: psraw $8, %xmm3 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_32i8_to_32i16: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa %xmm1, %xmm3 +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: psraw $8, %xmm0 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: psraw $8, %xmm1 +; SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSSE3-NEXT: psraw $8, %xmm2 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: psraw $8, %xmm3 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_32i8_to_32i16: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxbw %xmm0, %xmm5 +; SSE41-NEXT: pmovsxbw %xmm1, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovsxbw %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE41-NEXT: pmovsxbw %xmm0, %xmm3 +; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: sext_32i8_to_32i16: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovsxbw %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxbw %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpmovsxbw %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX1-NEXT: vmovaps %ymm2, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: sext_32i8_to_32i16: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovsxbw %xmm0, %ymm2 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpmovsxbw %xmm0, %ymm1 +; AVX2-NEXT: vmovdqa %ymm2, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: sext_32i8_to_32i16: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm2 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm2, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: sext_32i8_to_32i16: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0 +; AVX512BW-NEXT: retq +; +; X32-SSE41-LABEL: sext_32i8_to_32i16: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm5 +; X32-SSE41-NEXT: pmovsxbw %xmm1, %xmm2 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm4 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm3 +; X32-SSE41-NEXT: movdqa %xmm5, %xmm0 +; X32-SSE41-NEXT: movdqa %xmm4, %xmm1 +; X32-SSE41-NEXT: retl +entry: + %B = sext <32 x i8> %A to <32 x i16> + ret <32 x i16> %B +} + +define <4 x i32> @sext_16i8_to_4i32(<16 x i8> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: sext_16i8_to_4i32: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = 
xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_16i8_to_4i32: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: psrad $24, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_16i8_to_4i32: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxbd %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: sext_16i8_to_4i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpmovsxbd %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X32-SSE41-LABEL: sext_16i8_to_4i32: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: pmovsxbd %xmm0, %xmm0 +; X32-SSE41-NEXT: retl +entry: + %B = shufflevector <16 x i8> %A, <16 x i8> undef, <4 x i32> + %C = sext <4 x i8> %B to <4 x i32> + ret <4 x i32> %C +} + +define <8 x i32> @sext_16i8_to_8i32(<16 x i8> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: sext_16i8_to_8i32: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: psrad $24, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: psrad $24, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_16i8_to_8i32: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSSE3-NEXT: psrad $24, %xmm2 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: psrad $24, %xmm1 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_16i8_to_8i32: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxbd %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE41-NEXT: pmovsxbd %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: sext_16i8_to_8i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovsxbd %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: sext_16i8_to_8i32: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: sext_16i8_to_8i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovsxbd %xmm0, %ymm0 +; AVX512-NEXT: retq +; +; X32-SSE41-LABEL: sext_16i8_to_8i32: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: pmovsxbd %xmm0, %xmm2 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X32-SSE41-NEXT: pmovsxbd %xmm0, %xmm1 +; X32-SSE41-NEXT: movdqa %xmm2, %xmm0 +; X32-SSE41-NEXT: retl +entry: + %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> + %C = sext <8 x i8> %B to <8 x i32> + ret <8 x i32> %C +} + +define <16 x i32> @sext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: sext_16i8_to_16i32: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE2-NEXT: psrad $24, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = 
xmm1[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad $24, %xmm1 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: psrad $24, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT: psrad $24, %xmm3 +; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_16i8_to_16i32: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSSE3-NEXT: psrad $24, %xmm4 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: psrad $24, %xmm1 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSSE3-NEXT: psrad $24, %xmm2 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSSE3-NEXT: psrad $24, %xmm3 +; SSSE3-NEXT: movdqa %xmm4, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_16i8_to_16i32: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxbd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE41-NEXT: pmovsxbd %xmm1, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovsxbd %xmm2, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE41-NEXT: pmovsxbd %xmm0, %xmm3 +; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: sext_16i8_to_16i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovsxbd %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1] +; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX1-NEXT: vmovaps %ymm2, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: sext_16i8_to_16i32: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovsxbd %xmm0, %ymm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX2-NEXT: vpmovsxbd %xmm0, %ymm1 +; AVX2-NEXT: vmovdqa %ymm2, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: sext_16i8_to_16i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512-NEXT: retq +; +; X32-SSE41-LABEL: sext_16i8_to_16i32: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: pmovsxbd %xmm0, %xmm4 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X32-SSE41-NEXT: pmovsxbd %xmm1, %xmm1 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; X32-SSE41-NEXT: pmovsxbd %xmm2, %xmm2 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; X32-SSE41-NEXT: pmovsxbd %xmm0, %xmm3 +; X32-SSE41-NEXT: movdqa %xmm4, %xmm0 +; X32-SSE41-NEXT: retl +entry: + %B = sext <16 x i8> %A to <16 x i32> + ret <16 x i32> %B +} + +define <2 x i64> @sext_16i8_to_2i64(<16 x i8> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: sext_16i8_to_2i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: 
movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: psrad $24, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_16i8_to_2i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: psrad $24, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_16i8_to_2i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxbq %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: sext_16i8_to_2i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpmovsxbq %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X32-SSE41-LABEL: sext_16i8_to_2i64: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: pmovsxbq %xmm0, %xmm0 +; X32-SSE41-NEXT: retl +entry: + %B = shufflevector <16 x i8> %A, <16 x i8> undef, <2 x i32> + %C = sext <2 x i8> %B to <2 x i64> + ret <2 x i64> %C +} + +define <4 x i64> @sext_16i8_to_4i64(<16 x i8> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: sext_16i8_to_4i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: psrad $24, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: psrad $24, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_16i8_to_4i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: psrad $24, %xmm2 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: psrad $24, %xmm1 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_16i8_to_4i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxbq %xmm0, %xmm2 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: pmovsxbq %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: sext_16i8_to_4i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovsxbq %xmm0, %xmm1 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxbq %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: sext_16i8_to_4i64: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovsxbq %xmm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: sext_16i8_to_4i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovsxbq %xmm0, %ymm0 +; AVX512-NEXT: retq +; +; X32-SSE41-LABEL: sext_16i8_to_4i64: +; X32-SSE41: # %bb.0: # %entry 
+; X32-SSE41-NEXT: pmovsxbq %xmm0, %xmm2 +; X32-SSE41-NEXT: psrld $16, %xmm0 +; X32-SSE41-NEXT: pmovsxbq %xmm0, %xmm1 +; X32-SSE41-NEXT: movdqa %xmm2, %xmm0 +; X32-SSE41-NEXT: retl +entry: + %B = shufflevector <16 x i8> %A, <16 x i8> undef, <4 x i32> + %C = sext <4 x i8> %B to <4 x i64> + ret <4 x i64> %C +} + +define <8 x i64> @sext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: sext_16i8_to_8i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: psrad $24, %xmm4 +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: psrad $24, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: psrad $24, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: psrad $24, %xmm3 +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_16i8_to_8i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSSE3-NEXT: movdqa %xmm4, %xmm1 +; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: psrad $24, %xmm4 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: psrad $24, %xmm1 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: psrad $24, %xmm2 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSSE3-NEXT: movdqa %xmm3, %xmm0 +; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: psrad $24, %xmm3 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSSE3-NEXT: movdqa %xmm4, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_16i8_to_8i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxbq %xmm0, %xmm4 +; 
SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: pmovsxbq %xmm1, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; SSE41-NEXT: pmovsxbq %xmm2, %xmm2 +; SSE41-NEXT: psrlq $48, %xmm0 +; SSE41-NEXT: pmovsxbq %xmm0, %xmm3 +; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: sext_16i8_to_8i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovsxbq %xmm0, %xmm1 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm2 +; AVX1-NEXT: vpmovsxbq %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovsxbq %xmm0, %xmm1 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxbq %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX1-NEXT: vmovaps %ymm2, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: sext_16i8_to_8i64: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovsxbq %xmm0, %ymm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX2-NEXT: vpmovsxbq %xmm0, %ymm1 +; AVX2-NEXT: vmovdqa %ymm2, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: sext_16i8_to_8i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovsxbq %xmm0, %zmm0 +; AVX512-NEXT: retq +; +; X32-SSE41-LABEL: sext_16i8_to_8i64: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: pmovsxbq %xmm0, %xmm4 +; X32-SSE41-NEXT: movdqa %xmm0, %xmm1 +; X32-SSE41-NEXT: psrld $16, %xmm1 +; X32-SSE41-NEXT: pmovsxbq %xmm1, %xmm1 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; X32-SSE41-NEXT: pmovsxbq %xmm2, %xmm2 +; X32-SSE41-NEXT: psrlq $48, %xmm0 +; X32-SSE41-NEXT: pmovsxbq %xmm0, %xmm3 +; X32-SSE41-NEXT: movdqa %xmm4, %xmm0 +; X32-SSE41-NEXT: retl +entry: + %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> + %C = sext <8 x i8> %B to <8 x i64> + ret <8 x i64> %C +} + +define <4 x i32> @sext_8i16_to_4i32(<8 x i16> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: sext_8i16_to_4i32: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_8i16_to_4i32: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: psrad $16, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_8i16_to_4i32: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxwd %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: sext_8i16_to_4i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpmovsxwd %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X32-SSE41-LABEL: sext_8i16_to_4i32: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm0 +; X32-SSE41-NEXT: retl +entry: + %B = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> + %C = sext <4 x i16> %B to <4 x i32> + ret <4 x i32> %C +} + +define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: sext_8i16_to_8i32: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_8i16_to_8i32: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSSE3-NEXT: psrad $16, %xmm2 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: psrad $16, %xmm1 +; SSSE3-NEXT: 
movdqa %xmm2, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_8i16_to_8i32: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxwd %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovsxwd %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: sext_8i16_to_8i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: sext_8i16_to_8i32: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: sext_8i16_to_8i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX512-NEXT: retq +; +; X32-SSE41-LABEL: sext_8i16_to_8i32: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm2 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm1 +; X32-SSE41-NEXT: movdqa %xmm2, %xmm0 +; X32-SSE41-NEXT: retl +entry: + %B = sext <8 x i16> %A to <8 x i32> + ret <8 x i32> %B +} + +define <16 x i32> @sext_16i16_to_16i32(<16 x i16> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: sext_16i16_to_16i32: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE2-NEXT: psrad $16, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; SSE2-NEXT: psrad $16, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSE2-NEXT: psrad $16, %xmm3 +; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm5, %xmm1 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_16i16_to_16i32: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSSE3-NEXT: psrad $16, %xmm4 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; SSSE3-NEXT: psrad $16, %xmm5 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSSE3-NEXT: psrad $16, %xmm2 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSSE3-NEXT: psrad $16, %xmm3 +; SSSE3-NEXT: movdqa %xmm4, %xmm0 +; SSSE3-NEXT: movdqa %xmm5, %xmm1 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_16i16_to_16i32: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxwd %xmm0, %xmm5 +; SSE41-NEXT: pmovsxwd %xmm1, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovsxwd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE41-NEXT: pmovsxwd %xmm0, %xmm3 +; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: sext_16i16_to_16i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxwd %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX1-NEXT: vmovaps %ymm2, %ymm0 +; AVX1-NEXT: retq +; +; 
AVX2-LABEL: sext_16i16_to_16i32: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovsxwd %xmm0, %ymm2 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpmovsxwd %xmm0, %ymm1 +; AVX2-NEXT: vmovdqa %ymm2, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: sext_16i16_to_16i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovsxwd %ymm0, %zmm0 +; AVX512-NEXT: retq +; +; X32-SSE41-LABEL: sext_16i16_to_16i32: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm5 +; X32-SSE41-NEXT: pmovsxwd %xmm1, %xmm2 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm4 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm3 +; X32-SSE41-NEXT: movdqa %xmm5, %xmm0 +; X32-SSE41-NEXT: movdqa %xmm4, %xmm1 +; X32-SSE41-NEXT: retl +entry: + %B = sext <16 x i16> %A to <16 x i32> + ret <16 x i32> %B +} + +define <2 x i64> @sext_8i16_to_2i64(<8 x i16> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: sext_8i16_to_2i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_8i16_to_2i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: psrad $16, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_8i16_to_2i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxwq %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: sext_8i16_to_2i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpmovsxwq %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X32-SSE41-LABEL: sext_8i16_to_2i64: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: pmovsxwq %xmm0, %xmm0 +; X32-SSE41-NEXT: retl +entry: + %B = shufflevector <8 x i16> %A, <8 x i16> undef, <2 x i32> + %C = sext <2 x i16> %B to <2 x i64> + ret <2 x i64> %C +} + +define <4 x i64> @sext_8i16_to_4i64(<8 x i16> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: sext_8i16_to_4i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_8i16_to_4i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: psrad $16, %xmm2 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: psrad $16, %xmm1 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_8i16_to_4i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxwq %xmm0, %xmm2 +; 
SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE41-NEXT: pmovsxwq %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: sext_8i16_to_4i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovsxwq %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovsxwq %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: sext_8i16_to_4i64: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: sext_8i16_to_4i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovsxwq %xmm0, %ymm0 +; AVX512-NEXT: retq +; +; X32-SSE41-LABEL: sext_8i16_to_4i64: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: pmovsxwq %xmm0, %xmm2 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X32-SSE41-NEXT: pmovsxwq %xmm0, %xmm1 +; X32-SSE41-NEXT: movdqa %xmm2, %xmm0 +; X32-SSE41-NEXT: retl +entry: + %B = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> + %C = sext <4 x i16> %B to <4 x i64> + ret <4 x i64> %C +} + +define <8 x i64> @sext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: sext_8i16_to_8i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: psrad $16, %xmm4 +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: psrad $31, %xmm3 +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: psrad $16, %xmm3 +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_8i16_to_8i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSSE3-NEXT: movdqa %xmm4, %xmm1 +; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: psrad $16, %xmm4 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: psrad $16, %xmm2 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] +; SSSE3-NEXT: movdqa %xmm1, %xmm3 +; SSSE3-NEXT: psrad $31, %xmm3 +; SSSE3-NEXT: psrad $16, %xmm1 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSSE3-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,2,2,3,4,5,6,7] +; SSSE3-NEXT: movdqa %xmm3, %xmm0 +; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: psrad $16, %xmm3 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSSE3-NEXT: movdqa %xmm4, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_8i16_to_8i64: +; SSE41: # %bb.0: # 
%entry +; SSE41-NEXT: pmovsxwq %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE41-NEXT: pmovsxwq %xmm1, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovsxwq %xmm2, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE41-NEXT: pmovsxwq %xmm0, %xmm3 +; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: sext_8i16_to_8i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovsxwq %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovsxwq %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxwq %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1] +; AVX1-NEXT: vpmovsxwq %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX1-NEXT: vmovaps %ymm2, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: sext_8i16_to_8i64: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovsxwq %xmm0, %ymm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX2-NEXT: vpmovsxwq %xmm0, %ymm1 +; AVX2-NEXT: vmovdqa %ymm2, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: sext_8i16_to_8i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovsxwq %xmm0, %zmm0 +; AVX512-NEXT: retq +; +; X32-SSE41-LABEL: sext_8i16_to_8i64: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: pmovsxwq %xmm0, %xmm4 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X32-SSE41-NEXT: pmovsxwq %xmm1, %xmm1 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; X32-SSE41-NEXT: pmovsxwq %xmm2, %xmm2 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; X32-SSE41-NEXT: pmovsxwq %xmm0, %xmm3 +; X32-SSE41-NEXT: movdqa %xmm4, %xmm0 +; X32-SSE41-NEXT: retl +entry: + %B = sext <8 x i16> %A to <8 x i64> + ret <8 x i64> %B +} + +define <2 x i64> @sext_4i32_to_2i64(<4 x i32> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: sext_4i32_to_2i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_4i32_to_2i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_4i32_to_2i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxdq %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: sext_4i32_to_2i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X32-SSE41-LABEL: sext_4i32_to_2i64: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm0 +; X32-SSE41-NEXT: retl +entry: + %B = shufflevector <4 x i32> %A, <4 x i32> undef, <2 x i32> + %C = sext <2 x i32> %B to <2 x i64> + ret <2 x i64> %C +} + +define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: sext_4i32_to_4i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_4i32_to_4i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = 
xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_4i32_to_4i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxdq %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovsxdq %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: sext_4i32_to_4i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: sext_4i32_to_4i64: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: sext_4i32_to_4i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovsxdq %xmm0, %ymm0 +; AVX512-NEXT: retq +; +; X32-SSE41-LABEL: sext_4i32_to_4i64: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm2 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm1 +; X32-SSE41-NEXT: movdqa %xmm2, %xmm0 +; X32-SSE41-NEXT: retl +entry: + %B = sext <4 x i32> %A to <4 x i64> + ret <4 x i64> %B +} + +define <8 x i64> @sext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: sext_8i32_to_8i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: psrad $31, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: psrad $31, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: psrad $31, %xmm3 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: psrad $31, %xmm4 +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_8i32_to_8i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: psrad $31, %xmm3 +; SSSE3-NEXT: movdqa %xmm1, %xmm4 +; SSSE3-NEXT: psrad $31, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSSE3-NEXT: movdqa %xmm1, %xmm3 +; SSSE3-NEXT: psrad $31, %xmm3 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: psrad $31, %xmm4 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_8i32_to_8i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxdq %xmm0, %xmm5 +; SSE41-NEXT: pmovsxdq %xmm1, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovsxdq %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE41-NEXT: pmovsxdq %xmm0, %xmm3 +; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: sext_8i32_to_8i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxdq %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, 
%xmm2, %ymm1, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX1-NEXT: vmovaps %ymm2, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: sext_8i32_to_8i64: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovsxdq %xmm0, %ymm2 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpmovsxdq %xmm0, %ymm1 +; AVX2-NEXT: vmovdqa %ymm2, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: sext_8i32_to_8i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovsxdq %ymm0, %zmm0 +; AVX512-NEXT: retq +; +; X32-SSE41-LABEL: sext_8i32_to_8i64: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm5 +; X32-SSE41-NEXT: pmovsxdq %xmm1, %xmm2 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm4 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm3 +; X32-SSE41-NEXT: movdqa %xmm5, %xmm0 +; X32-SSE41-NEXT: movdqa %xmm4, %xmm1 +; X32-SSE41-NEXT: retl +entry: + %B = sext <8 x i32> %A to <8 x i64> + ret <8 x i64> %B +} + +define <2 x i64> @load_sext_2i1_to_2i64(<2 x i1> *%ptr) { +; SSE-LABEL: load_sext_2i1_to_2i64: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movzbl (%rdi), %eax +; SSE-NEXT: movq %rax, %rcx +; SSE-NEXT: shlq $62, %rcx +; SSE-NEXT: sarq $63, %rcx +; SSE-NEXT: movq %rcx, %xmm1 +; SSE-NEXT: shlq $63, %rax +; SSE-NEXT: sarq $63, %rax +; SSE-NEXT: movq %rax, %xmm0 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: retq +; +; AVX1-LABEL: load_sext_2i1_to_2i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: movzbl (%rdi), %eax +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shlq $62, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: vmovq %rcx, %xmm0 +; AVX1-NEXT: shlq $63, %rax +; AVX1-NEXT: sarq $63, %rax +; AVX1-NEXT: vmovq %rax, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_sext_2i1_to_2i64: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: movzbl (%rdi), %eax +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shlq $62, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: vmovq %rcx, %xmm0 +; AVX2-NEXT: shlq $63, %rax +; AVX2-NEXT: sarq $63, %rax +; AVX2-NEXT: vmovq %rax, %xmm1 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX2-NEXT: retq +; +; AVX512F-LABEL: load_sext_2i1_to_2i64: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: movzbl (%rdi), %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: load_sext_2i1_to_2i64: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: movzbl (%rdi), %eax +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; X32-SSE41-LABEL: load_sext_2i1_to_2i64: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: movzbl (%eax), %eax +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shll $31, %ecx +; X32-SSE41-NEXT: sarl $31, %ecx +; X32-SSE41-NEXT: movd %ecx, %xmm0 +; X32-SSE41-NEXT: pinsrd $1, %ecx, %xmm0 +; X32-SSE41-NEXT: shll $30, %eax +; X32-SSE41-NEXT: sarl $31, %eax +; X32-SSE41-NEXT: pinsrd $2, %eax, %xmm0 +; X32-SSE41-NEXT: pinsrd $3, %eax, %xmm0 +; X32-SSE41-NEXT: retl 
+entry: + %X = load <2 x i1>, <2 x i1>* %ptr + %Y = sext <2 x i1> %X to <2 x i64> + ret <2 x i64> %Y +} + +define <2 x i64> @load_sext_2i8_to_2i64(<2 x i8> *%ptr) { +; SSE2-LABEL: load_sext_2i8_to_2i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movzwl (%rdi), %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: psrad $24, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_sext_2i8_to_2i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movzwl (%rdi), %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: psrad $24, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_sext_2i8_to_2i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxbq (%rdi), %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: load_sext_2i8_to_2i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpmovsxbq (%rdi), %xmm0 +; AVX-NEXT: retq +; +; X32-SSE41-LABEL: load_sext_2i8_to_2i64: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: pmovsxbq (%eax), %xmm0 +; X32-SSE41-NEXT: retl +entry: + %X = load <2 x i8>, <2 x i8>* %ptr + %Y = sext <2 x i8> %X to <2 x i64> + ret <2 x i64> %Y +} + +define <4 x i32> @load_sext_4i1_to_4i32(<4 x i1> *%ptr) { +; SSE2-LABEL: load_sext_4i1_to_4i32: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movzbl (%rdi), %eax +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shlq $60, %rcx +; SSE2-NEXT: sarq $63, %rcx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shlq $61, %rcx +; SSE2-NEXT: sarq $63, %rcx +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shlq $62, %rcx +; SSE2-NEXT: sarq $63, %rcx +; SSE2-NEXT: movd %ecx, %xmm2 +; SSE2-NEXT: shlq $63, %rax +; SSE2-NEXT: sarq $63, %rax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_sext_4i1_to_4i32: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movzbl (%rdi), %eax +; SSSE3-NEXT: movq %rax, %rcx +; SSSE3-NEXT: shlq $60, %rcx +; SSSE3-NEXT: sarq $63, %rcx +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: movq %rax, %rcx +; SSSE3-NEXT: shlq $61, %rcx +; SSSE3-NEXT: sarq $63, %rcx +; SSSE3-NEXT: movd %ecx, %xmm1 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: movq %rax, %rcx +; SSSE3-NEXT: shlq $62, %rcx +; SSSE3-NEXT: sarq $63, %rcx +; SSSE3-NEXT: movd %ecx, %xmm2 +; SSSE3-NEXT: shlq $63, %rax +; SSSE3-NEXT: sarq $63, %rax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_sext_4i1_to_4i32: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: movzbl (%rdi), %eax +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shlq $62, %rcx +; SSE41-NEXT: sarq $63, %rcx +; SSE41-NEXT: movq %rax, %rdx +; SSE41-NEXT: shlq $63, %rdx +; SSE41-NEXT: sarq $63, %rdx +; SSE41-NEXT: 
movd %edx, %xmm0 +; SSE41-NEXT: pinsrd $1, %ecx, %xmm0 +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shlq $61, %rcx +; SSE41-NEXT: sarq $63, %rcx +; SSE41-NEXT: pinsrd $2, %ecx, %xmm0 +; SSE41-NEXT: shlq $60, %rax +; SSE41-NEXT: sarq $63, %rax +; SSE41-NEXT: pinsrd $3, %eax, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: load_sext_4i1_to_4i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: movzbl (%rdi), %eax +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shlq $62, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: movq %rax, %rdx +; AVX1-NEXT: shlq $63, %rdx +; AVX1-NEXT: sarq $63, %rdx +; AVX1-NEXT: vmovd %edx, %xmm0 +; AVX1-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shlq $61, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: shlq $60, %rax +; AVX1-NEXT: sarq $63, %rax +; AVX1-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_sext_4i1_to_4i32: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: movzbl (%rdi), %eax +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shlq $62, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: movq %rax, %rdx +; AVX2-NEXT: shlq $63, %rdx +; AVX2-NEXT: sarq $63, %rdx +; AVX2-NEXT: vmovd %edx, %xmm0 +; AVX2-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shlq $61, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: shlq $60, %rax +; AVX2-NEXT: sarq $63, %rax +; AVX2-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: load_sext_4i1_to_4i32: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: movzbl (%rdi), %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: load_sext_4i1_to_4i32: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: movzbl (%rdi), %eax +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; X32-SSE41-LABEL: load_sext_4i1_to_4i32: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: movl (%eax), %eax +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shll $30, %ecx +; X32-SSE41-NEXT: sarl $31, %ecx +; X32-SSE41-NEXT: movl %eax, %edx +; X32-SSE41-NEXT: shll $31, %edx +; X32-SSE41-NEXT: sarl $31, %edx +; X32-SSE41-NEXT: movd %edx, %xmm0 +; X32-SSE41-NEXT: pinsrd $1, %ecx, %xmm0 +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shll $29, %ecx +; X32-SSE41-NEXT: sarl $31, %ecx +; X32-SSE41-NEXT: pinsrd $2, %ecx, %xmm0 +; X32-SSE41-NEXT: shll $28, %eax +; X32-SSE41-NEXT: sarl $31, %eax +; X32-SSE41-NEXT: pinsrd $3, %eax, %xmm0 +; X32-SSE41-NEXT: retl +entry: + %X = load <4 x i1>, <4 x i1>* %ptr + %Y = sext <4 x i1> %X to <4 x i32> + ret <4 x i32> %Y +} + +define <4 x i32> @load_sext_4i8_to_4i32(<4 x i8> *%ptr) { +; SSE2-LABEL: load_sext_4i8_to_4i32: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_sext_4i8_to_4i32: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: psrad $24, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_sext_4i8_to_4i32: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxbd (%rdi), %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: load_sext_4i8_to_4i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpmovsxbd (%rdi), %xmm0 +; AVX-NEXT: retq +; +; X32-SSE41-LABEL: load_sext_4i8_to_4i32: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: pmovsxbd (%eax), %xmm0 +; X32-SSE41-NEXT: retl +entry: + %X = load <4 x i8>, <4 x i8>* %ptr + %Y = sext <4 x i8> %X to <4 x i32> + ret <4 x i32> %Y +} + +define <4 x i64> @load_sext_4i1_to_4i64(<4 x i1> *%ptr) { +; SSE2-LABEL: load_sext_4i1_to_4i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movl (%rdi), %eax +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: shrl $3, %ecx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: shrl $2, %ecx +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: shrl %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3] +; SSE2-NEXT: psllq $63, %xmm0 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,1,3,3] +; SSE2-NEXT: psllq $63, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_sext_4i1_to_4i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movl (%rdi), %eax +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: shrl $3, %ecx +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: shrl $2, %ecx +; SSSE3-NEXT: movd %ecx, %xmm1 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: shrl %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSSE3-NEXT: pand {{.*}}(%rip), %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3] +; SSSE3-NEXT: psllq $63, %xmm0 +; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,1,3,3] +; SSSE3-NEXT: psllq $63, %xmm1 +; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_sext_4i1_to_4i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: movl (%rdi), %eax +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: shrl %ecx +; SSE41-NEXT: movd %eax, %xmm1 +; SSE41-NEXT: pinsrd $1, %ecx, %xmm1 +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: shrl $2, %ecx +; SSE41-NEXT: pinsrd $2, %ecx, %xmm1 +; SSE41-NEXT: shrl $3, %eax +; SSE41-NEXT: pinsrd $3, %eax, %xmm1 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; SSE41-NEXT: psllq $63, %xmm0 +; SSE41-NEXT: psrad $31, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; SSE41-NEXT: psllq $63, %xmm1 +; SSE41-NEXT: psrad $31, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE41-NEXT: retq +; +; AVX1-LABEL: load_sext_4i1_to_4i64: +; AVX1: # %bb.0: # %entry 
+; AVX1-NEXT: movzbl (%rdi), %eax +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shlq $62, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: movq %rax, %rdx +; AVX1-NEXT: shlq $63, %rdx +; AVX1-NEXT: sarq $63, %rdx +; AVX1-NEXT: vmovd %edx, %xmm0 +; AVX1-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shlq $61, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: shlq $60, %rax +; AVX1-NEXT: sarq $63, %rax +; AVX1-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_sext_4i1_to_4i64: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: movzbl (%rdi), %eax +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shlq $60, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: vmovq %rcx, %xmm0 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shlq $61, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: vmovq %rcx, %xmm1 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shlq $62, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: vmovq %rcx, %xmm1 +; AVX2-NEXT: shlq $63, %rax +; AVX2-NEXT: sarq $63, %rax +; AVX2-NEXT: vmovq %rax, %xmm2 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: load_sext_4i1_to_4i64: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: movzbl (%rdi), %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: load_sext_4i1_to_4i64: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: movzbl (%rdi), %eax +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512BW-NEXT: retq +; +; X32-SSE41-LABEL: load_sext_4i1_to_4i64: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: movzbl (%eax), %eax +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shrl %ecx +; X32-SSE41-NEXT: movd %eax, %xmm1 +; X32-SSE41-NEXT: pinsrd $1, %ecx, %xmm1 +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shrl $2, %ecx +; X32-SSE41-NEXT: pinsrd $2, %ecx, %xmm1 +; X32-SSE41-NEXT: shrl $3, %eax +; X32-SSE41-NEXT: pinsrd $3, %eax, %xmm1 +; X32-SSE41-NEXT: pand {{\.LCPI.*}}, %xmm1 +; X32-SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; X32-SSE41-NEXT: psllq $63, %xmm0 +; X32-SSE41-NEXT: psrad $31, %xmm0 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; X32-SSE41-NEXT: psllq $63, %xmm1 +; X32-SSE41-NEXT: psrad $31, %xmm1 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X32-SSE41-NEXT: retl +entry: + %X = load <4 x i1>, <4 x i1>* %ptr + %Y = sext <4 x i1> %X to <4 x i64> + ret <4 x i64> %Y +} + +define <4 x i64> @load_sext_4i8_to_4i64(<4 x i8> *%ptr) { +; SSE2-LABEL: load_sext_4i8_to_4i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movsbq 1(%rdi), %rax +; SSE2-NEXT: movq %rax, %xmm1 +; SSE2-NEXT: movsbq (%rdi), %rax +; SSE2-NEXT: movq %rax, %xmm0 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: movsbq 3(%rdi), %rax +; SSE2-NEXT: movq %rax, %xmm2 +; SSE2-NEXT: movsbq 2(%rdi), %rax +; SSE2-NEXT: movq %rax, %xmm1 +; SSE2-NEXT: 
punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_sext_4i8_to_4i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movsbq 1(%rdi), %rax +; SSSE3-NEXT: movq %rax, %xmm1 +; SSSE3-NEXT: movsbq (%rdi), %rax +; SSSE3-NEXT: movq %rax, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: movsbq 3(%rdi), %rax +; SSSE3-NEXT: movq %rax, %xmm2 +; SSSE3-NEXT: movsbq 2(%rdi), %rax +; SSSE3-NEXT: movq %rax, %xmm1 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_sext_4i8_to_4i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxbq (%rdi), %xmm0 +; SSE41-NEXT: pmovsxbq 2(%rdi), %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: load_sext_4i8_to_4i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovsxbd (%rdi), %xmm0 +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_sext_4i8_to_4i64: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovsxbq (%rdi), %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_sext_4i8_to_4i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovsxbq (%rdi), %ymm0 +; AVX512-NEXT: retq +; +; X32-SSE41-LABEL: load_sext_4i8_to_4i64: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: pmovsxbq (%eax), %xmm0 +; X32-SSE41-NEXT: pmovsxbq 2(%eax), %xmm1 +; X32-SSE41-NEXT: retl +entry: + %X = load <4 x i8>, <4 x i8>* %ptr + %Y = sext <4 x i8> %X to <4 x i64> + ret <4 x i64> %Y +} + +define <2 x i64> @load_sext_4i8_to_4i64_extract(<4 x i8> *%ptr) { +; SSE2-LABEL: load_sext_4i8_to_4i64_extract: +; SSE2: # %bb.0: +; SSE2-NEXT: movsbq 3(%rdi), %rax +; SSE2-NEXT: movq %rax, %xmm1 +; SSE2-NEXT: movsbq 2(%rdi), %rax +; SSE2-NEXT: movq %rax, %xmm0 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_sext_4i8_to_4i64_extract: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movsbq 3(%rdi), %rax +; SSSE3-NEXT: movq %rax, %xmm1 +; SSSE3-NEXT: movsbq 2(%rdi), %rax +; SSSE3-NEXT: movq %rax, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_sext_4i8_to_4i64_extract: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxbq 2(%rdi), %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: load_sext_4i8_to_4i64_extract: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovsxbd (%rdi), %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_sext_4i8_to_4i64_extract: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovsxbq (%rdi), %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_sext_4i8_to_4i64_extract: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbq (%rdi), %ymm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq +; +; X32-SSE41-LABEL: load_sext_4i8_to_4i64_extract: +; X32-SSE41: # %bb.0: +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: pmovsxbq 2(%eax), %xmm0 +; X32-SSE41-NEXT: retl + %ld = load <4 x i8>, <4 x i8>* %ptr + %sext = sext <4 x i8> %ld to <4 x i64> + %extract = shufflevector <4 x i64> %sext, <4 x i64> undef, <2 x i32> <i32 2, i32 3> + ret <2 x i64> %extract +} + +define <8 x i16> @load_sext_8i1_to_8i16(<8 x i1> *%ptr) { +; SSE2-LABEL: load_sext_8i1_to_8i16: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movsbq (%rdi), %rax +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq $7, %rcx
+; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shlq $57, %rcx +; SSE2-NEXT: sarq $63, %rcx +; SSE2-NEXT: movd %ecx, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shlq $58, %rcx +; SSE2-NEXT: sarq $63, %rcx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shlq $59, %rcx +; SSE2-NEXT: sarq $63, %rcx +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shlq $60, %rcx +; SSE2-NEXT: sarq $63, %rcx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shlq $61, %rcx +; SSE2-NEXT: sarq $63, %rcx +; SSE2-NEXT: movd %ecx, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shlq $62, %rcx +; SSE2-NEXT: sarq $63, %rcx +; SSE2-NEXT: movd %ecx, %xmm3 +; SSE2-NEXT: shlq $63, %rax +; SSE2-NEXT: sarq $63, %rax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_sext_8i1_to_8i16: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movsbq (%rdi), %rax +; SSSE3-NEXT: movq %rax, %rcx +; SSSE3-NEXT: shrq $7, %rcx +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: movq %rax, %rcx +; SSSE3-NEXT: shlq $57, %rcx +; SSSE3-NEXT: sarq $63, %rcx +; SSSE3-NEXT: movd %ecx, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSSE3-NEXT: movq %rax, %rcx +; SSSE3-NEXT: shlq $58, %rcx +; SSSE3-NEXT: sarq $63, %rcx +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: movq %rax, %rcx +; SSSE3-NEXT: shlq $59, %rcx +; SSSE3-NEXT: sarq $63, %rcx +; SSSE3-NEXT: movd %ecx, %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSSE3-NEXT: movq %rax, %rcx +; SSSE3-NEXT: shlq $60, %rcx +; SSSE3-NEXT: sarq $63, %rcx +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: movq %rax, %rcx +; SSSE3-NEXT: shlq $61, %rcx +; SSSE3-NEXT: sarq $63, %rcx +; SSSE3-NEXT: movd %ecx, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSSE3-NEXT: movq %rax, %rcx +; SSSE3-NEXT: shlq $62, %rcx +; SSSE3-NEXT: sarq $63, %rcx +; SSSE3-NEXT: movd %ecx, %xmm3 +; SSSE3-NEXT: shlq $63, %rax +; SSSE3-NEXT: sarq $63, %rax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_sext_8i1_to_8i16: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: movsbq (%rdi), %rax +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shlq $62, %rcx +; SSE41-NEXT: sarq $63, %rcx +; SSE41-NEXT: movq %rax, %rdx +; SSE41-NEXT: shlq $63, %rdx +; SSE41-NEXT: sarq $63, %rdx +; SSE41-NEXT: movd %edx, %xmm0 +; SSE41-NEXT: pinsrw $1, %ecx, %xmm0 +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shlq $61, 
%rcx +; SSE41-NEXT: sarq $63, %rcx +; SSE41-NEXT: pinsrw $2, %ecx, %xmm0 +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shlq $60, %rcx +; SSE41-NEXT: sarq $63, %rcx +; SSE41-NEXT: pinsrw $3, %ecx, %xmm0 +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shlq $59, %rcx +; SSE41-NEXT: sarq $63, %rcx +; SSE41-NEXT: pinsrw $4, %ecx, %xmm0 +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shlq $58, %rcx +; SSE41-NEXT: sarq $63, %rcx +; SSE41-NEXT: pinsrw $5, %ecx, %xmm0 +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shlq $57, %rcx +; SSE41-NEXT: sarq $63, %rcx +; SSE41-NEXT: pinsrw $6, %ecx, %xmm0 +; SSE41-NEXT: shrq $7, %rax +; SSE41-NEXT: pinsrw $7, %eax, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: load_sext_8i1_to_8i16: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: movsbq (%rdi), %rax +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shlq $62, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: movq %rax, %rdx +; AVX1-NEXT: shlq $63, %rdx +; AVX1-NEXT: sarq $63, %rdx +; AVX1-NEXT: vmovd %edx, %xmm0 +; AVX1-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shlq $61, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shlq $60, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shlq $59, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shlq $58, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shlq $57, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: shrq $7, %rax +; AVX1-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_sext_8i1_to_8i16: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: movsbq (%rdi), %rax +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shlq $62, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: movq %rax, %rdx +; AVX2-NEXT: shlq $63, %rdx +; AVX2-NEXT: sarq $63, %rdx +; AVX2-NEXT: vmovd %edx, %xmm0 +; AVX2-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shlq $61, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shlq $60, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shlq $59, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shlq $58, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shlq $57, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: shrq $7, %rax +; AVX2-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: load_sext_8i1_to_8i16: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: movzbl (%rdi), %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: load_sext_8i1_to_8i16: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: movzbl (%rdi), %eax +; AVX512BW-NEXT: kmovd %eax, %k0 +; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: 
retq +; +; X32-SSE41-LABEL: load_sext_8i1_to_8i16: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: movsbl (%eax), %eax +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shll $30, %ecx +; X32-SSE41-NEXT: sarl $31, %ecx +; X32-SSE41-NEXT: movl %eax, %edx +; X32-SSE41-NEXT: shll $31, %edx +; X32-SSE41-NEXT: sarl $31, %edx +; X32-SSE41-NEXT: movd %edx, %xmm0 +; X32-SSE41-NEXT: pinsrw $1, %ecx, %xmm0 +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shll $29, %ecx +; X32-SSE41-NEXT: sarl $31, %ecx +; X32-SSE41-NEXT: pinsrw $2, %ecx, %xmm0 +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shll $28, %ecx +; X32-SSE41-NEXT: sarl $31, %ecx +; X32-SSE41-NEXT: pinsrw $3, %ecx, %xmm0 +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shll $27, %ecx +; X32-SSE41-NEXT: sarl $31, %ecx +; X32-SSE41-NEXT: pinsrw $4, %ecx, %xmm0 +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shll $26, %ecx +; X32-SSE41-NEXT: sarl $31, %ecx +; X32-SSE41-NEXT: pinsrw $5, %ecx, %xmm0 +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shll $25, %ecx +; X32-SSE41-NEXT: sarl $31, %ecx +; X32-SSE41-NEXT: pinsrw $6, %ecx, %xmm0 +; X32-SSE41-NEXT: shrl $7, %eax +; X32-SSE41-NEXT: pinsrw $7, %eax, %xmm0 +; X32-SSE41-NEXT: retl +entry: + %X = load <8 x i1>, <8 x i1>* %ptr + %Y = sext <8 x i1> %X to <8 x i16> + ret <8 x i16> %Y +} + +define <8 x i16> @load_sext_8i8_to_8i16(<8 x i8> *%ptr) { +; SSE2-LABEL: load_sext_8i8_to_8i16: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: psraw $8, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_sext_8i8_to_8i16: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: psraw $8, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_sext_8i8_to_8i16: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxbw (%rdi), %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: load_sext_8i8_to_8i16: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpmovsxbw (%rdi), %xmm0 +; AVX-NEXT: retq +; +; X32-SSE41-LABEL: load_sext_8i8_to_8i16: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: pmovsxbw (%eax), %xmm0 +; X32-SSE41-NEXT: retl +entry: + %X = load <8 x i8>, <8 x i8>* %ptr + %Y = sext <8 x i8> %X to <8 x i16> + ret <8 x i16> %Y +} + +define <8 x i64> @load_sext_8i8_to_8i64(<8 x i8> *%ptr) { +; SSE2-LABEL: load_sext_8i8_to_8i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movsbq 1(%rdi), %rax +; SSE2-NEXT: movq %rax, %xmm1 +; SSE2-NEXT: movsbq (%rdi), %rax +; SSE2-NEXT: movq %rax, %xmm0 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: movsbq 3(%rdi), %rax +; SSE2-NEXT: movq %rax, %xmm2 +; SSE2-NEXT: movsbq 2(%rdi), %rax +; SSE2-NEXT: movq %rax, %xmm1 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-NEXT: movsbq 5(%rdi), %rax +; SSE2-NEXT: movq %rax, %xmm3 +; SSE2-NEXT: movsbq 4(%rdi), %rax +; SSE2-NEXT: movq %rax, %xmm2 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE2-NEXT: movsbq 7(%rdi), %rax +; SSE2-NEXT: movq %rax, %xmm4 +; SSE2-NEXT: movsbq 6(%rdi), %rax +; SSE2-NEXT: movq %rax, %xmm3 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_sext_8i8_to_8i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movsbq 1(%rdi), %rax +; SSSE3-NEXT: movq %rax, %xmm1 +; SSSE3-NEXT: movsbq (%rdi), %rax +; 
SSSE3-NEXT: movq %rax, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: movsbq 3(%rdi), %rax +; SSSE3-NEXT: movq %rax, %xmm2 +; SSSE3-NEXT: movsbq 2(%rdi), %rax +; SSSE3-NEXT: movq %rax, %xmm1 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSSE3-NEXT: movsbq 5(%rdi), %rax +; SSSE3-NEXT: movq %rax, %xmm3 +; SSSE3-NEXT: movsbq 4(%rdi), %rax +; SSSE3-NEXT: movq %rax, %xmm2 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSSE3-NEXT: movsbq 7(%rdi), %rax +; SSSE3-NEXT: movq %rax, %xmm4 +; SSSE3-NEXT: movsbq 6(%rdi), %rax +; SSSE3-NEXT: movq %rax, %xmm3 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_sext_8i8_to_8i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxbq (%rdi), %xmm0 +; SSE41-NEXT: pmovsxbq 2(%rdi), %xmm1 +; SSE41-NEXT: pmovsxbq 4(%rdi), %xmm2 +; SSE41-NEXT: pmovsxbq 6(%rdi), %xmm3 +; SSE41-NEXT: retq +; +; AVX1-LABEL: load_sext_8i8_to_8i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovsxbd (%rdi), %xmm0 +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpmovsxbd 4(%rdi), %xmm1 +; AVX1-NEXT: vpmovsxdq %xmm1, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_sext_8i8_to_8i64: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovsxbq (%rdi), %ymm0 +; AVX2-NEXT: vpmovsxbq 4(%rdi), %ymm1 +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_sext_8i8_to_8i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovsxbq (%rdi), %zmm0 +; AVX512-NEXT: retq +; +; X32-SSE41-LABEL: load_sext_8i8_to_8i64: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: pmovsxbq (%eax), %xmm0 +; X32-SSE41-NEXT: pmovsxbq 2(%eax), %xmm1 +; X32-SSE41-NEXT: pmovsxbq 4(%eax), %xmm2 +; X32-SSE41-NEXT: pmovsxbq 6(%eax), %xmm3 +; X32-SSE41-NEXT: retl +entry: + %X = load <8 x i8>, <8 x i8>* %ptr + %Y = sext <8 x i8> %X to <8 x i64> + ret <8 x i64> %Y +} + +define <8 x i32> @load_sext_8i1_to_8i32(<8 x i1> *%ptr) { +; SSE2-LABEL: load_sext_8i1_to_8i32: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movzbl (%rdi), %eax +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: shrl $7, %ecx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: shrl $6, %ecx +; SSE2-NEXT: andl $1, %ecx +; SSE2-NEXT: movd %ecx, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: shrl $5, %ecx +; SSE2-NEXT: andl $1, %ecx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: shrl $4, %ecx +; SSE2-NEXT: andl $1, %ecx +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: shrl $3, %ecx +; SSE2-NEXT: andl $1, %ecx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: shrl $2, %ecx +; SSE2-NEXT: andl $1, %ecx +; SSE2-NEXT: movd %ecx, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: andl $1, %ecx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: shrl %eax +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: movd %eax, 
%xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pslld $31, %xmm0 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: pslld $31, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_sext_8i1_to_8i32: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movzbl (%rdi), %eax +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: shrl $7, %ecx +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: shrl $6, %ecx +; SSSE3-NEXT: andl $1, %ecx +; SSSE3-NEXT: movd %ecx, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: shrl $5, %ecx +; SSSE3-NEXT: andl $1, %ecx +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: shrl $4, %ecx +; SSSE3-NEXT: andl $1, %ecx +; SSSE3-NEXT: movd %ecx, %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: shrl $3, %ecx +; SSSE3-NEXT: andl $1, %ecx +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: shrl $2, %ecx +; SSSE3-NEXT: andl $1, %ecx +; SSSE3-NEXT: movd %ecx, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: andl $1, %ecx +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: shrl %eax +; SSSE3-NEXT: andl $1, %eax +; SSSE3-NEXT: movd %eax, %xmm3 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: pslld $31, %xmm0 +; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSSE3-NEXT: pslld $31, %xmm1 +; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_sext_8i1_to_8i32: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: movzbl (%rdi), %eax +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: shrl %ecx +; SSE41-NEXT: andl $1, %ecx +; SSE41-NEXT: movl %eax, %edx +; SSE41-NEXT: andl $1, %edx +; SSE41-NEXT: movd %edx, %xmm1 +; SSE41-NEXT: pinsrw $1, %ecx, %xmm1 +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: shrl $2, %ecx +; SSE41-NEXT: andl $1, %ecx +; SSE41-NEXT: pinsrw $2, %ecx, %xmm1 +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: shrl $3, %ecx +; SSE41-NEXT: andl $1, %ecx +; SSE41-NEXT: pinsrw $3, %ecx, %xmm1 +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: shrl $4, %ecx +; SSE41-NEXT: andl $1, %ecx +; SSE41-NEXT: pinsrw $4, %ecx, %xmm1 +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: shrl $5, %ecx +; SSE41-NEXT: andl $1, %ecx +; SSE41-NEXT: pinsrw $5, %ecx, %xmm1 +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: shrl $6, %ecx +; SSE41-NEXT: andl $1, %ecx +; SSE41-NEXT: pinsrw $6, %ecx, %xmm1 +; SSE41-NEXT: shrl $7, %eax +; SSE41-NEXT: pinsrw $7, %eax, %xmm1 +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; SSE41-NEXT: pslld $31, %xmm0 +; SSE41-NEXT: psrad $31, %xmm0 +; SSE41-NEXT: 
punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE41-NEXT: pslld $31, %xmm1 +; SSE41-NEXT: psrad $31, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: load_sext_8i1_to_8i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: movsbq (%rdi), %rax +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shlq $58, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: movq %rax, %rdx +; AVX1-NEXT: shlq $59, %rdx +; AVX1-NEXT: sarq $63, %rdx +; AVX1-NEXT: vmovd %edx, %xmm0 +; AVX1-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shlq $57, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq $7, %rcx +; AVX1-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shlq $62, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: movq %rax, %rdx +; AVX1-NEXT: shlq $63, %rdx +; AVX1-NEXT: sarq $63, %rdx +; AVX1-NEXT: vmovd %edx, %xmm1 +; AVX1-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shlq $61, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1 +; AVX1-NEXT: shlq $60, %rax +; AVX1-NEXT: sarq $63, %rax +; AVX1-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_sext_8i1_to_8i32: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: movsbq (%rdi), %rax +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shlq $58, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: movq %rax, %rdx +; AVX2-NEXT: shlq $59, %rdx +; AVX2-NEXT: sarq $63, %rdx +; AVX2-NEXT: vmovd %edx, %xmm0 +; AVX2-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shlq $57, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shrq $7, %rcx +; AVX2-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shlq $62, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: movq %rax, %rdx +; AVX2-NEXT: shlq $63, %rdx +; AVX2-NEXT: sarq $63, %rdx +; AVX2-NEXT: vmovd %edx, %xmm1 +; AVX2-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shlq $61, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1 +; AVX2-NEXT: shlq $60, %rax +; AVX2-NEXT: sarq $63, %rax +; AVX2-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: load_sext_8i1_to_8i32: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: movzbl (%rdi), %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: load_sext_8i1_to_8i32: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: movzbl (%rdi), %eax +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512BW-NEXT: retq +; +; X32-SSE41-LABEL: load_sext_8i1_to_8i32: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: movzbl (%eax), %eax +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shrl %ecx +; X32-SSE41-NEXT: andl $1, %ecx +; X32-SSE41-NEXT: movl %eax, %edx +; X32-SSE41-NEXT: andl $1, %edx +; X32-SSE41-NEXT: movd %edx, %xmm1 +; X32-SSE41-NEXT: pinsrw $1, %ecx, %xmm1 +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shrl $2, %ecx +; X32-SSE41-NEXT: 
andl $1, %ecx +; X32-SSE41-NEXT: pinsrw $2, %ecx, %xmm1 +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shrl $3, %ecx +; X32-SSE41-NEXT: andl $1, %ecx +; X32-SSE41-NEXT: pinsrw $3, %ecx, %xmm1 +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shrl $4, %ecx +; X32-SSE41-NEXT: andl $1, %ecx +; X32-SSE41-NEXT: pinsrw $4, %ecx, %xmm1 +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shrl $5, %ecx +; X32-SSE41-NEXT: andl $1, %ecx +; X32-SSE41-NEXT: pinsrw $5, %ecx, %xmm1 +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shrl $6, %ecx +; X32-SSE41-NEXT: andl $1, %ecx +; X32-SSE41-NEXT: pinsrw $6, %ecx, %xmm1 +; X32-SSE41-NEXT: shrl $7, %eax +; X32-SSE41-NEXT: pinsrw $7, %eax, %xmm1 +; X32-SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; X32-SSE41-NEXT: pslld $31, %xmm0 +; X32-SSE41-NEXT: psrad $31, %xmm0 +; X32-SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; X32-SSE41-NEXT: pslld $31, %xmm1 +; X32-SSE41-NEXT: psrad $31, %xmm1 +; X32-SSE41-NEXT: retl +entry: + %X = load <8 x i1>, <8 x i1>* %ptr + %Y = sext <8 x i1> %X to <8 x i32> + ret <8 x i32> %Y +} + +define <8 x i32> @load_sext_8i8_to_8i32(<8 x i8> *%ptr) { +; SSE2-LABEL: load_sext_8i8_to_8i32: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm0 +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm1 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_sext_8i8_to_8i32: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: psrad $24, %xmm0 +; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: psrad $24, %xmm1 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_sext_8i8_to_8i32: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxbd (%rdi), %xmm0 +; SSE41-NEXT: pmovsxbd 4(%rdi), %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: load_sext_8i8_to_8i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovsxbw (%rdi), %xmm0 +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_sext_8i8_to_8i32: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovsxbd (%rdi), %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_sext_8i8_to_8i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovsxbd (%rdi), %ymm0 +; AVX512-NEXT: retq +; +; X32-SSE41-LABEL: load_sext_8i8_to_8i32: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: pmovsxbd (%eax), %xmm0 +; X32-SSE41-NEXT: pmovsxbd 4(%eax), %xmm1 +; X32-SSE41-NEXT: retl +entry: + %X = load <8 x i8>, <8 x i8>* %ptr + %Y = sext <8 x i8> %X to <8 x i32> + ret <8 x i32> %Y +} + +define <16 x i8> @load_sext_16i1_to_16i8(<16 x i1> *%ptr) nounwind readnone { +; SSE2-LABEL: load_sext_16i1_to_16i8: +; SSE2: # %bb.0: # %entry +; 
SSE2-NEXT: pushq %rbp +; SSE2-NEXT: pushq %r15 +; SSE2-NEXT: pushq %r14 +; SSE2-NEXT: pushq %r13 +; SSE2-NEXT: pushq %r12 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: movswq (%rdi), %rax +; SSE2-NEXT: movq %rax, %r8 +; SSE2-NEXT: movq %rax, %r9 +; SSE2-NEXT: movq %rax, %r10 +; SSE2-NEXT: movq %rax, %r11 +; SSE2-NEXT: movq %rax, %r14 +; SSE2-NEXT: movq %rax, %r15 +; SSE2-NEXT: movq %rax, %r12 +; SSE2-NEXT: movq %rax, %r13 +; SSE2-NEXT: movq %rax, %rbx +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: movq %rax, %rdx +; SSE2-NEXT: movq %rax, %rsi +; SSE2-NEXT: movq %rax, %rdi +; SSE2-NEXT: movq %rax, %rbp +; SSE2-NEXT: shrq $15, %rbp +; SSE2-NEXT: movd %ebp, %xmm0 +; SSE2-NEXT: movq %rax, %rbp +; SSE2-NEXT: movsbq %al, %rax +; SSE2-NEXT: shlq $49, %r8 +; SSE2-NEXT: sarq $63, %r8 +; SSE2-NEXT: movd %r8d, %xmm1 +; SSE2-NEXT: shlq $50, %r9 +; SSE2-NEXT: sarq $63, %r9 +; SSE2-NEXT: movd %r9d, %xmm2 +; SSE2-NEXT: shlq $51, %r10 +; SSE2-NEXT: sarq $63, %r10 +; SSE2-NEXT: movd %r10d, %xmm3 +; SSE2-NEXT: shlq $52, %r11 +; SSE2-NEXT: sarq $63, %r11 +; SSE2-NEXT: movd %r11d, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: shlq $53, %r14 +; SSE2-NEXT: sarq $63, %r14 +; SSE2-NEXT: movd %r14d, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-NEXT: shlq $54, %r15 +; SSE2-NEXT: sarq $63, %r15 +; SSE2-NEXT: movd %r15d, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE2-NEXT: shlq $55, %r12 +; SSE2-NEXT: sarq $63, %r12 +; SSE2-NEXT: movd %r12d, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE2-NEXT: shlq $60, %r13 +; SSE2-NEXT: sarq $63, %r13 +; SSE2-NEXT: movd %r13d, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: shlq $61, %rbx +; SSE2-NEXT: sarq $63, %rbx +; SSE2-NEXT: movd %ebx, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: shlq $62, %rcx +; SSE2-NEXT: sarq $63, %rcx +; SSE2-NEXT: movd %ecx, %xmm5 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE2-NEXT: shlq $63, %rdx +; SSE2-NEXT: sarq $63, %rdx +; SSE2-NEXT: movd %edx, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE2-NEXT: shlq $58, %rsi +; SSE2-NEXT: sarq $63, %rsi +; SSE2-NEXT: movd %esi, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; SSE2-NEXT: shlq $59, %rdi +; SSE2-NEXT: sarq $63, %rdi +; SSE2-NEXT: movd %edi, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; SSE2-NEXT: shlq $57, %rbp +; SSE2-NEXT: sarq $63, %rbp +; SSE2-NEXT: movd %ebp, %xmm2 +; SSE2-NEXT: shrq $7, 
%rax +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: popq %r12 +; SSE2-NEXT: popq %r13 +; SSE2-NEXT: popq %r14 +; SSE2-NEXT: popq %r15 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_sext_16i1_to_16i8: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: pushq %rbp +; SSSE3-NEXT: pushq %r15 +; SSSE3-NEXT: pushq %r14 +; SSSE3-NEXT: pushq %r13 +; SSSE3-NEXT: pushq %r12 +; SSSE3-NEXT: pushq %rbx +; SSSE3-NEXT: movswq (%rdi), %rax +; SSSE3-NEXT: movq %rax, %r8 +; SSSE3-NEXT: movq %rax, %r9 +; SSSE3-NEXT: movq %rax, %r10 +; SSSE3-NEXT: movq %rax, %r11 +; SSSE3-NEXT: movq %rax, %r14 +; SSSE3-NEXT: movq %rax, %r15 +; SSSE3-NEXT: movq %rax, %r12 +; SSSE3-NEXT: movq %rax, %r13 +; SSSE3-NEXT: movq %rax, %rbx +; SSSE3-NEXT: movq %rax, %rcx +; SSSE3-NEXT: movq %rax, %rdx +; SSSE3-NEXT: movq %rax, %rsi +; SSSE3-NEXT: movq %rax, %rdi +; SSSE3-NEXT: movq %rax, %rbp +; SSSE3-NEXT: shrq $15, %rbp +; SSSE3-NEXT: movd %ebp, %xmm0 +; SSSE3-NEXT: movq %rax, %rbp +; SSSE3-NEXT: movsbq %al, %rax +; SSSE3-NEXT: shlq $49, %r8 +; SSSE3-NEXT: sarq $63, %r8 +; SSSE3-NEXT: movd %r8d, %xmm1 +; SSSE3-NEXT: shlq $50, %r9 +; SSSE3-NEXT: sarq $63, %r9 +; SSSE3-NEXT: movd %r9d, %xmm2 +; SSSE3-NEXT: shlq $51, %r10 +; SSSE3-NEXT: sarq $63, %r10 +; SSSE3-NEXT: movd %r10d, %xmm3 +; SSSE3-NEXT: shlq $52, %r11 +; SSSE3-NEXT: sarq $63, %r11 +; SSSE3-NEXT: movd %r11d, %xmm4 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: shlq $53, %r14 +; SSSE3-NEXT: sarq $63, %r14 +; SSSE3-NEXT: movd %r14d, %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSSE3-NEXT: shlq $54, %r15 +; SSSE3-NEXT: sarq $63, %r15 +; SSSE3-NEXT: movd %r15d, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSSE3-NEXT: shlq $55, %r12 +; SSSE3-NEXT: sarq $63, %r12 +; SSSE3-NEXT: movd %r12d, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSSE3-NEXT: shlq $60, %r13 +; SSSE3-NEXT: sarq $63, %r13 +; SSSE3-NEXT: movd %r13d, %xmm4 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSSE3-NEXT: shlq $61, %rbx +; SSSE3-NEXT: sarq $63, %rbx +; SSSE3-NEXT: movd %ebx, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSSE3-NEXT: shlq $62, %rcx +; SSSE3-NEXT: sarq $63, %rcx +; SSSE3-NEXT: movd %ecx, %xmm5 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSSE3-NEXT: shlq $63, %rdx +; SSSE3-NEXT: sarq $63, %rdx +; SSSE3-NEXT: movd %edx, %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSSE3-NEXT: shlq $58, %rsi +; SSSE3-NEXT: sarq $63, %rsi +; SSSE3-NEXT: movd %esi, %xmm3 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; SSSE3-NEXT: shlq $59, %rdi +; SSSE3-NEXT: sarq $63, %rdi +; SSSE3-NEXT: movd %edi, %xmm4 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; SSSE3-NEXT: shlq $57, %rbp +; SSSE3-NEXT: sarq $63, %rbp +; SSSE3-NEXT: movd %ebp, %xmm2 +; SSSE3-NEXT: shrq $7, %rax +; SSSE3-NEXT: movd %eax, %xmm3 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: popq %rbx +; SSSE3-NEXT: popq %r12 +; SSSE3-NEXT: popq %r13 +; SSSE3-NEXT: popq %r14 +; SSSE3-NEXT: popq %r15 +; SSSE3-NEXT: popq %rbp +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_sext_16i1_to_16i8: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: movswq (%rdi), %rax +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shlq $62, %rcx +; SSE41-NEXT: sarq $63, %rcx +; SSE41-NEXT: movq %rax, %rdx +; SSE41-NEXT: shlq $63, %rdx +; SSE41-NEXT: sarq $63, %rdx +; SSE41-NEXT: movd %edx, %xmm0 +; SSE41-NEXT: pinsrb $1, %ecx, %xmm0 +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shlq $61, %rcx +; SSE41-NEXT: sarq $63, %rcx +; SSE41-NEXT: pinsrb $2, %ecx, %xmm0 +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shlq $60, %rcx +; SSE41-NEXT: sarq $63, %rcx +; SSE41-NEXT: pinsrb $3, %ecx, %xmm0 +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shlq $59, %rcx +; SSE41-NEXT: sarq $63, %rcx +; SSE41-NEXT: pinsrb $4, %ecx, %xmm0 +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shlq $58, %rcx +; SSE41-NEXT: sarq $63, %rcx +; SSE41-NEXT: pinsrb $5, %ecx, %xmm0 +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shlq $57, %rcx +; SSE41-NEXT: sarq $63, %rcx +; SSE41-NEXT: pinsrb $6, %ecx, %xmm0 +; SSE41-NEXT: movsbq %al, %rcx +; SSE41-NEXT: shrq $7, %rcx +; SSE41-NEXT: pinsrb $7, %ecx, %xmm0 +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shlq $55, %rcx +; SSE41-NEXT: sarq $63, %rcx +; SSE41-NEXT: pinsrb $8, %ecx, %xmm0 +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shlq $54, %rcx +; SSE41-NEXT: sarq $63, %rcx +; SSE41-NEXT: pinsrb $9, %ecx, %xmm0 +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shlq $53, %rcx +; SSE41-NEXT: sarq $63, %rcx +; SSE41-NEXT: pinsrb $10, %ecx, %xmm0 +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shlq $52, %rcx +; SSE41-NEXT: sarq $63, %rcx +; SSE41-NEXT: pinsrb $11, %ecx, %xmm0 +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shlq $51, %rcx +; SSE41-NEXT: sarq $63, %rcx +; SSE41-NEXT: pinsrb $12, %ecx, %xmm0 +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shlq $50, %rcx +; SSE41-NEXT: sarq $63, %rcx +; SSE41-NEXT: pinsrb $13, %ecx, %xmm0 +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shlq $49, %rcx +; SSE41-NEXT: sarq $63, %rcx +; SSE41-NEXT: pinsrb $14, %ecx, %xmm0 +; SSE41-NEXT: shrq $15, %rax +; SSE41-NEXT: pinsrb $15, 
%eax, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: load_sext_16i1_to_16i8: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: movswq (%rdi), %rax +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shlq $62, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: movq %rax, %rdx +; AVX1-NEXT: shlq $63, %rdx +; AVX1-NEXT: sarq $63, %rdx +; AVX1-NEXT: vmovd %edx, %xmm0 +; AVX1-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shlq $61, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shlq $60, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shlq $59, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shlq $58, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shlq $57, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movsbq %al, %rcx +; AVX1-NEXT: shrq $7, %rcx +; AVX1-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shlq $55, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shlq $54, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shlq $53, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shlq $52, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shlq $51, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shlq $50, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shlq $49, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: shrq $15, %rax +; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_sext_16i1_to_16i8: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: movswq (%rdi), %rax +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shlq $62, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: movq %rax, %rdx +; AVX2-NEXT: shlq $63, %rdx +; AVX2-NEXT: sarq $63, %rdx +; AVX2-NEXT: vmovd %edx, %xmm0 +; AVX2-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shlq $61, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shlq $60, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shlq $59, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shlq $58, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shlq $57, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movsbq %al, %rcx +; AVX2-NEXT: shrq $7, %rcx +; AVX2-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shlq $55, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shlq $54, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: vpinsrb 
$9, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shlq $53, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shlq $52, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shlq $51, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shlq $50, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shlq $49, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: shrq $15, %rax +; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: load_sext_16i1_to_16i8: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: kmovw (%rdi), %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: load_sext_16i1_to_16i8: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: kmovw (%rdi), %k0 +; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; X32-SSE41-LABEL: load_sext_16i1_to_16i8: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: movswl (%eax), %eax +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shll $30, %ecx +; X32-SSE41-NEXT: sarl $31, %ecx +; X32-SSE41-NEXT: movl %eax, %edx +; X32-SSE41-NEXT: shll $31, %edx +; X32-SSE41-NEXT: sarl $31, %edx +; X32-SSE41-NEXT: movd %edx, %xmm0 +; X32-SSE41-NEXT: pinsrb $1, %ecx, %xmm0 +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shll $29, %ecx +; X32-SSE41-NEXT: sarl $31, %ecx +; X32-SSE41-NEXT: pinsrb $2, %ecx, %xmm0 +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shll $28, %ecx +; X32-SSE41-NEXT: sarl $31, %ecx +; X32-SSE41-NEXT: pinsrb $3, %ecx, %xmm0 +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shll $27, %ecx +; X32-SSE41-NEXT: sarl $31, %ecx +; X32-SSE41-NEXT: pinsrb $4, %ecx, %xmm0 +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shll $26, %ecx +; X32-SSE41-NEXT: sarl $31, %ecx +; X32-SSE41-NEXT: pinsrb $5, %ecx, %xmm0 +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shll $25, %ecx +; X32-SSE41-NEXT: sarl $31, %ecx +; X32-SSE41-NEXT: pinsrb $6, %ecx, %xmm0 +; X32-SSE41-NEXT: movsbl %al, %ecx +; X32-SSE41-NEXT: shrl $7, %ecx +; X32-SSE41-NEXT: pinsrb $7, %ecx, %xmm0 +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shll $23, %ecx +; X32-SSE41-NEXT: sarl $31, %ecx +; X32-SSE41-NEXT: pinsrb $8, %ecx, %xmm0 +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shll $22, %ecx +; X32-SSE41-NEXT: sarl $31, %ecx +; X32-SSE41-NEXT: pinsrb $9, %ecx, %xmm0 +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shll $21, %ecx +; X32-SSE41-NEXT: sarl $31, %ecx +; X32-SSE41-NEXT: pinsrb $10, %ecx, %xmm0 +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shll $20, %ecx +; X32-SSE41-NEXT: sarl $31, %ecx +; X32-SSE41-NEXT: pinsrb $11, %ecx, %xmm0 +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shll $19, %ecx +; X32-SSE41-NEXT: sarl $31, %ecx +; X32-SSE41-NEXT: pinsrb $12, %ecx, %xmm0 +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shll $18, %ecx +; X32-SSE41-NEXT: sarl $31, %ecx +; X32-SSE41-NEXT: pinsrb $13, %ecx, %xmm0 +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shll $17, %ecx +; 
X32-SSE41-NEXT: sarl $31, %ecx +; X32-SSE41-NEXT: pinsrb $14, %ecx, %xmm0 +; X32-SSE41-NEXT: shrl $15, %eax +; X32-SSE41-NEXT: pinsrb $15, %eax, %xmm0 +; X32-SSE41-NEXT: retl +entry: + %X = load <16 x i1>, <16 x i1>* %ptr + %Y = sext <16 x i1> %X to <16 x i8> + ret <16 x i8> %Y +} + +define <16 x i16> @load_sext_16i1_to_16i16(<16 x i1> *%ptr) { +; SSE2-LABEL: load_sext_16i1_to_16i16: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movzwl (%rdi), %eax +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: shrl $15, %ecx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: shrl $14, %ecx +; SSE2-NEXT: andl $1, %ecx +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: shrl $13, %ecx +; SSE2-NEXT: andl $1, %ecx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: shrl $12, %ecx +; SSE2-NEXT: andl $1, %ecx +; SSE2-NEXT: movd %ecx, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: shrl $11, %ecx +; SSE2-NEXT: andl $1, %ecx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: shrl $10, %ecx +; SSE2-NEXT: andl $1, %ecx +; SSE2-NEXT: movd %ecx, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: shrl $9, %ecx +; SSE2-NEXT: andl $1, %ecx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: shrl $8, %ecx +; SSE2-NEXT: andl $1, %ecx +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: shrl $7, %ecx +; SSE2-NEXT: andl $1, %ecx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: shrl $6, %ecx +; SSE2-NEXT: andl $1, %ecx +; SSE2-NEXT: movd %ecx, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: shrl $5, %ecx +; SSE2-NEXT: andl $1, %ecx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: shrl $4, %ecx +; SSE2-NEXT: andl $1, %ecx +; SSE2-NEXT: movd %ecx, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: shrl $3, %ecx +; SSE2-NEXT: andl $1, %ecx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: shrl $2, %ecx +; SSE2-NEXT: andl $1, %ecx +; SSE2-NEXT: movd %ecx, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: movl %eax, %ecx +; SSE2-NEXT: andl $1, %ecx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: shrl %eax +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: movd %eax, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: psllw $15, %xmm0 +; SSE2-NEXT: psraw $15, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: psllw $15, %xmm1 +; SSE2-NEXT: psraw $15, %xmm1 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_sext_16i1_to_16i16: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movzwl (%rdi), %eax +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: shrl $15, %ecx +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: shrl $14, %ecx +; SSSE3-NEXT: andl $1, %ecx +; SSSE3-NEXT: movd %ecx, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: shrl $13, %ecx +; SSSE3-NEXT: andl $1, %ecx +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: shrl $12, %ecx +; SSSE3-NEXT: andl $1, %ecx +; SSSE3-NEXT: movd %ecx, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: shrl $11, %ecx +; SSSE3-NEXT: andl $1, %ecx +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: shrl $10, %ecx +; SSSE3-NEXT: andl $1, %ecx +; SSSE3-NEXT: movd %ecx, %xmm3 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: shrl $9, %ecx +; SSSE3-NEXT: andl $1, %ecx +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: shrl $8, %ecx +; SSSE3-NEXT: andl $1, %ecx +; SSSE3-NEXT: movd %ecx, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: shrl $7, %ecx +; SSSE3-NEXT: andl $1, %ecx +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: shrl $6, %ecx +; SSSE3-NEXT: andl $1, %ecx +; SSSE3-NEXT: movd %ecx, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: shrl $5, %ecx +; SSSE3-NEXT: andl $1, %ecx +; 
SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: shrl $4, %ecx +; SSSE3-NEXT: andl $1, %ecx +; SSSE3-NEXT: movd %ecx, %xmm3 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: shrl $3, %ecx +; SSSE3-NEXT: andl $1, %ecx +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: shrl $2, %ecx +; SSSE3-NEXT: andl $1, %ecx +; SSSE3-NEXT: movd %ecx, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSSE3-NEXT: movl %eax, %ecx +; SSSE3-NEXT: andl $1, %ecx +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: shrl %eax +; SSSE3-NEXT: andl $1, %eax +; SSSE3-NEXT: movd %eax, %xmm4 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: psllw $15, %xmm0 +; SSSE3-NEXT: psraw $15, %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: psllw $15, %xmm1 +; SSSE3-NEXT: psraw $15, %xmm1 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_sext_16i1_to_16i16: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: movzwl (%rdi), %eax +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: shrl %ecx +; SSE41-NEXT: andl $1, %ecx +; SSE41-NEXT: movl %eax, %edx +; SSE41-NEXT: andl $1, %edx +; SSE41-NEXT: movd %edx, %xmm1 +; SSE41-NEXT: pinsrb $1, %ecx, %xmm1 +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: shrl $2, %ecx +; SSE41-NEXT: andl $1, %ecx +; SSE41-NEXT: pinsrb $2, %ecx, %xmm1 +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: shrl $3, %ecx +; SSE41-NEXT: andl $1, %ecx +; SSE41-NEXT: pinsrb $3, %ecx, %xmm1 +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: shrl $4, %ecx +; SSE41-NEXT: andl $1, %ecx +; SSE41-NEXT: pinsrb $4, %ecx, %xmm1 +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: shrl $5, %ecx +; SSE41-NEXT: andl $1, %ecx +; SSE41-NEXT: pinsrb $5, %ecx, %xmm1 +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: shrl $6, %ecx +; SSE41-NEXT: andl $1, %ecx +; SSE41-NEXT: pinsrb $6, %ecx, %xmm1 +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: shrl $7, %ecx +; SSE41-NEXT: andl $1, %ecx +; SSE41-NEXT: pinsrb $7, %ecx, %xmm1 +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: shrl $8, %ecx +; SSE41-NEXT: andl $1, %ecx +; SSE41-NEXT: pinsrb $8, %ecx, %xmm1 +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: shrl $9, %ecx +; SSE41-NEXT: andl $1, %ecx +; SSE41-NEXT: pinsrb $9, %ecx, %xmm1 +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: shrl $10, %ecx +; SSE41-NEXT: andl $1, %ecx +; SSE41-NEXT: pinsrb $10, %ecx, %xmm1 +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: shrl $11, %ecx +; SSE41-NEXT: andl $1, %ecx +; SSE41-NEXT: pinsrb $11, %ecx, %xmm1 +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: shrl $12, %ecx +; SSE41-NEXT: andl $1, %ecx +; SSE41-NEXT: pinsrb $12, %ecx, %xmm1 +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: shrl $13, %ecx +; 
SSE41-NEXT: andl $1, %ecx +; SSE41-NEXT: pinsrb $13, %ecx, %xmm1 +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: shrl $14, %ecx +; SSE41-NEXT: andl $1, %ecx +; SSE41-NEXT: pinsrb $14, %ecx, %xmm1 +; SSE41-NEXT: shrl $15, %eax +; SSE41-NEXT: pinsrb $15, %eax, %xmm1 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; SSE41-NEXT: psllw $15, %xmm0 +; SSE41-NEXT: psraw $15, %xmm0 +; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE41-NEXT: psllw $15, %xmm1 +; SSE41-NEXT: psraw $15, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: load_sext_16i1_to_16i16: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: .cfi_def_cfa_offset 16 +; AVX1-NEXT: pushq %r15 +; AVX1-NEXT: .cfi_def_cfa_offset 24 +; AVX1-NEXT: pushq %r14 +; AVX1-NEXT: .cfi_def_cfa_offset 32 +; AVX1-NEXT: pushq %r13 +; AVX1-NEXT: .cfi_def_cfa_offset 40 +; AVX1-NEXT: pushq %r12 +; AVX1-NEXT: .cfi_def_cfa_offset 48 +; AVX1-NEXT: pushq %rbx +; AVX1-NEXT: .cfi_def_cfa_offset 56 +; AVX1-NEXT: .cfi_offset %rbx, -56 +; AVX1-NEXT: .cfi_offset %r12, -48 +; AVX1-NEXT: .cfi_offset %r13, -40 +; AVX1-NEXT: .cfi_offset %r14, -32 +; AVX1-NEXT: .cfi_offset %r15, -24 +; AVX1-NEXT: .cfi_offset %rbp, -16 +; AVX1-NEXT: movswq (%rdi), %rax +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shlq $55, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: vmovd %ecx, %xmm0 +; AVX1-NEXT: movq %rax, %r8 +; AVX1-NEXT: movq %rax, %r10 +; AVX1-NEXT: movq %rax, %r11 +; AVX1-NEXT: movq %rax, %r14 +; AVX1-NEXT: movq %rax, %r15 +; AVX1-NEXT: movq %rax, %r9 +; AVX1-NEXT: movq %rax, %r12 +; AVX1-NEXT: movq %rax, %r13 +; AVX1-NEXT: movq %rax, %rbx +; AVX1-NEXT: movq %rax, %rdi +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: movq %rax, %rdx +; AVX1-NEXT: movq %rax, %rsi +; AVX1-NEXT: movsbq %al, %rbp +; AVX1-NEXT: shlq $54, %rax +; AVX1-NEXT: sarq $63, %rax +; AVX1-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 +; AVX1-NEXT: shlq $53, %r8 +; AVX1-NEXT: sarq $63, %r8 +; AVX1-NEXT: vpinsrw $2, %r8d, %xmm0, %xmm0 +; AVX1-NEXT: shlq $52, %r10 +; AVX1-NEXT: sarq $63, %r10 +; AVX1-NEXT: vpinsrw $3, %r10d, %xmm0, %xmm0 +; AVX1-NEXT: shlq $51, %r11 +; AVX1-NEXT: sarq $63, %r11 +; AVX1-NEXT: vpinsrw $4, %r11d, %xmm0, %xmm0 +; AVX1-NEXT: shlq $50, %r14 +; AVX1-NEXT: sarq $63, %r14 +; AVX1-NEXT: vpinsrw $5, %r14d, %xmm0, %xmm0 +; AVX1-NEXT: shlq $49, %r15 +; AVX1-NEXT: sarq $63, %r15 +; AVX1-NEXT: vpinsrw $6, %r15d, %xmm0, %xmm0 +; AVX1-NEXT: shrq $15, %r9 +; AVX1-NEXT: vpinsrw $7, %r9d, %xmm0, %xmm0 +; AVX1-NEXT: shlq $63, %r13 +; AVX1-NEXT: sarq $63, %r13 +; AVX1-NEXT: vmovd %r13d, %xmm1 +; AVX1-NEXT: shlq $62, %r12 +; AVX1-NEXT: sarq $63, %r12 +; AVX1-NEXT: vpinsrw $1, %r12d, %xmm1, %xmm1 +; AVX1-NEXT: shlq $61, %rbx +; AVX1-NEXT: sarq $63, %rbx +; AVX1-NEXT: vpinsrw $2, %ebx, %xmm1, %xmm1 +; AVX1-NEXT: shlq $60, %rdi +; AVX1-NEXT: sarq $63, %rdi +; AVX1-NEXT: vpinsrw $3, %edi, %xmm1, %xmm1 +; AVX1-NEXT: shlq $59, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: vpinsrw $4, %ecx, %xmm1, %xmm1 +; AVX1-NEXT: shlq $58, %rdx +; AVX1-NEXT: sarq $63, %rdx +; AVX1-NEXT: vpinsrw $5, %edx, %xmm1, %xmm1 +; AVX1-NEXT: shlq $57, %rsi +; AVX1-NEXT: sarq $63, %rsi +; AVX1-NEXT: vpinsrw $6, %esi, %xmm1, %xmm1 +; AVX1-NEXT: shrq $7, %rbp +; AVX1-NEXT: vpinsrw $7, %ebp, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: popq %rbx +; AVX1-NEXT: .cfi_def_cfa_offset 48 +; AVX1-NEXT: popq %r12 +; AVX1-NEXT: .cfi_def_cfa_offset 40 +; AVX1-NEXT: popq %r13 +; 
AVX1-NEXT: .cfi_def_cfa_offset 32 +; AVX1-NEXT: popq %r14 +; AVX1-NEXT: .cfi_def_cfa_offset 24 +; AVX1-NEXT: popq %r15 +; AVX1-NEXT: .cfi_def_cfa_offset 16 +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_sext_16i1_to_16i16: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: .cfi_def_cfa_offset 16 +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: .cfi_def_cfa_offset 24 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: .cfi_def_cfa_offset 32 +; AVX2-NEXT: pushq %r13 +; AVX2-NEXT: .cfi_def_cfa_offset 40 +; AVX2-NEXT: pushq %r12 +; AVX2-NEXT: .cfi_def_cfa_offset 48 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: .cfi_def_cfa_offset 56 +; AVX2-NEXT: .cfi_offset %rbx, -56 +; AVX2-NEXT: .cfi_offset %r12, -48 +; AVX2-NEXT: .cfi_offset %r13, -40 +; AVX2-NEXT: .cfi_offset %r14, -32 +; AVX2-NEXT: .cfi_offset %r15, -24 +; AVX2-NEXT: .cfi_offset %rbp, -16 +; AVX2-NEXT: movswq (%rdi), %rax +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shlq $55, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: vmovd %ecx, %xmm0 +; AVX2-NEXT: movq %rax, %r8 +; AVX2-NEXT: movq %rax, %r10 +; AVX2-NEXT: movq %rax, %r11 +; AVX2-NEXT: movq %rax, %r14 +; AVX2-NEXT: movq %rax, %r15 +; AVX2-NEXT: movq %rax, %r9 +; AVX2-NEXT: movq %rax, %r12 +; AVX2-NEXT: movq %rax, %r13 +; AVX2-NEXT: movq %rax, %rbx +; AVX2-NEXT: movq %rax, %rdi +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: movq %rax, %rdx +; AVX2-NEXT: movq %rax, %rsi +; AVX2-NEXT: movsbq %al, %rbp +; AVX2-NEXT: shlq $54, %rax +; AVX2-NEXT: sarq $63, %rax +; AVX2-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 +; AVX2-NEXT: shlq $53, %r8 +; AVX2-NEXT: sarq $63, %r8 +; AVX2-NEXT: vpinsrw $2, %r8d, %xmm0, %xmm0 +; AVX2-NEXT: shlq $52, %r10 +; AVX2-NEXT: sarq $63, %r10 +; AVX2-NEXT: vpinsrw $3, %r10d, %xmm0, %xmm0 +; AVX2-NEXT: shlq $51, %r11 +; AVX2-NEXT: sarq $63, %r11 +; AVX2-NEXT: vpinsrw $4, %r11d, %xmm0, %xmm0 +; AVX2-NEXT: shlq $50, %r14 +; AVX2-NEXT: sarq $63, %r14 +; AVX2-NEXT: vpinsrw $5, %r14d, %xmm0, %xmm0 +; AVX2-NEXT: shlq $49, %r15 +; AVX2-NEXT: sarq $63, %r15 +; AVX2-NEXT: vpinsrw $6, %r15d, %xmm0, %xmm0 +; AVX2-NEXT: shrq $15, %r9 +; AVX2-NEXT: vpinsrw $7, %r9d, %xmm0, %xmm0 +; AVX2-NEXT: shlq $63, %r13 +; AVX2-NEXT: sarq $63, %r13 +; AVX2-NEXT: vmovd %r13d, %xmm1 +; AVX2-NEXT: shlq $62, %r12 +; AVX2-NEXT: sarq $63, %r12 +; AVX2-NEXT: vpinsrw $1, %r12d, %xmm1, %xmm1 +; AVX2-NEXT: shlq $61, %rbx +; AVX2-NEXT: sarq $63, %rbx +; AVX2-NEXT: vpinsrw $2, %ebx, %xmm1, %xmm1 +; AVX2-NEXT: shlq $60, %rdi +; AVX2-NEXT: sarq $63, %rdi +; AVX2-NEXT: vpinsrw $3, %edi, %xmm1, %xmm1 +; AVX2-NEXT: shlq $59, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: vpinsrw $4, %ecx, %xmm1, %xmm1 +; AVX2-NEXT: shlq $58, %rdx +; AVX2-NEXT: sarq $63, %rdx +; AVX2-NEXT: vpinsrw $5, %edx, %xmm1, %xmm1 +; AVX2-NEXT: shlq $57, %rsi +; AVX2-NEXT: sarq $63, %rsi +; AVX2-NEXT: vpinsrw $6, %esi, %xmm1, %xmm1 +; AVX2-NEXT: shrq $7, %rbp +; AVX2-NEXT: vpinsrw $7, %ebp, %xmm1, %xmm1 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: .cfi_def_cfa_offset 48 +; AVX2-NEXT: popq %r12 +; AVX2-NEXT: .cfi_def_cfa_offset 40 +; AVX2-NEXT: popq %r13 +; AVX2-NEXT: .cfi_def_cfa_offset 32 +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: .cfi_def_cfa_offset 24 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: .cfi_def_cfa_offset 16 +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: .cfi_def_cfa_offset 8 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: load_sext_16i1_to_16i16: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: kmovw (%rdi), %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; 
AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: load_sext_16i1_to_16i16: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: kmovw (%rdi), %k0 +; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512BW-NEXT: retq +; +; X32-SSE41-LABEL: load_sext_16i1_to_16i16: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: movzwl (%eax), %eax +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shrl %ecx +; X32-SSE41-NEXT: andl $1, %ecx +; X32-SSE41-NEXT: movl %eax, %edx +; X32-SSE41-NEXT: andl $1, %edx +; X32-SSE41-NEXT: movd %edx, %xmm1 +; X32-SSE41-NEXT: pinsrb $1, %ecx, %xmm1 +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shrl $2, %ecx +; X32-SSE41-NEXT: andl $1, %ecx +; X32-SSE41-NEXT: pinsrb $2, %ecx, %xmm1 +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shrl $3, %ecx +; X32-SSE41-NEXT: andl $1, %ecx +; X32-SSE41-NEXT: pinsrb $3, %ecx, %xmm1 +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shrl $4, %ecx +; X32-SSE41-NEXT: andl $1, %ecx +; X32-SSE41-NEXT: pinsrb $4, %ecx, %xmm1 +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shrl $5, %ecx +; X32-SSE41-NEXT: andl $1, %ecx +; X32-SSE41-NEXT: pinsrb $5, %ecx, %xmm1 +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shrl $6, %ecx +; X32-SSE41-NEXT: andl $1, %ecx +; X32-SSE41-NEXT: pinsrb $6, %ecx, %xmm1 +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shrl $7, %ecx +; X32-SSE41-NEXT: andl $1, %ecx +; X32-SSE41-NEXT: pinsrb $7, %ecx, %xmm1 +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shrl $8, %ecx +; X32-SSE41-NEXT: andl $1, %ecx +; X32-SSE41-NEXT: pinsrb $8, %ecx, %xmm1 +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shrl $9, %ecx +; X32-SSE41-NEXT: andl $1, %ecx +; X32-SSE41-NEXT: pinsrb $9, %ecx, %xmm1 +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shrl $10, %ecx +; X32-SSE41-NEXT: andl $1, %ecx +; X32-SSE41-NEXT: pinsrb $10, %ecx, %xmm1 +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shrl $11, %ecx +; X32-SSE41-NEXT: andl $1, %ecx +; X32-SSE41-NEXT: pinsrb $11, %ecx, %xmm1 +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shrl $12, %ecx +; X32-SSE41-NEXT: andl $1, %ecx +; X32-SSE41-NEXT: pinsrb $12, %ecx, %xmm1 +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shrl $13, %ecx +; X32-SSE41-NEXT: andl $1, %ecx +; X32-SSE41-NEXT: pinsrb $13, %ecx, %xmm1 +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shrl $14, %ecx +; X32-SSE41-NEXT: andl $1, %ecx +; X32-SSE41-NEXT: pinsrb $14, %ecx, %xmm1 +; X32-SSE41-NEXT: shrl $15, %eax +; X32-SSE41-NEXT: pinsrb $15, %eax, %xmm1 +; X32-SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; X32-SSE41-NEXT: psllw $15, %xmm0 +; X32-SSE41-NEXT: psraw $15, %xmm0 +; X32-SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; X32-SSE41-NEXT: psllw $15, %xmm1 +; X32-SSE41-NEXT: psraw $15, %xmm1 +; X32-SSE41-NEXT: retl +entry: + %X = load <16 x i1>, <16 x i1>* %ptr + %Y = sext <16 x i1> %X to <16 x i16> + ret <16 x i16> %Y +} + +define <32 x i8> @load_sext_32i1_to_32i8(<32 x i1> *%ptr) nounwind readnone { +; SSE2-LABEL: load_sext_32i1_to_32i8: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: pushq %r15 +; SSE2-NEXT: pushq %r14 +; SSE2-NEXT: pushq %r13 +; SSE2-NEXT: pushq %r12 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: movswq (%rdi), %rax +; SSE2-NEXT: movq %rax, %r10 +; SSE2-NEXT: movq %rax, 
%r8 +; SSE2-NEXT: movq %rax, %r9 +; SSE2-NEXT: movq %rax, %r11 +; SSE2-NEXT: movq %rax, %r14 +; SSE2-NEXT: movq %rax, %r15 +; SSE2-NEXT: movq %rax, %r12 +; SSE2-NEXT: movq %rax, %r13 +; SSE2-NEXT: movq %rax, %rdx +; SSE2-NEXT: movq %rax, %rsi +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: movq %rax, %rbp +; SSE2-NEXT: movq %rax, %rbx +; SSE2-NEXT: shrq $15, %rbx +; SSE2-NEXT: movd %ebx, %xmm0 +; SSE2-NEXT: movq %rax, %rbx +; SSE2-NEXT: shlq $49, %r10 +; SSE2-NEXT: sarq $63, %r10 +; SSE2-NEXT: movd %r10d, %xmm15 +; SSE2-NEXT: movq %rax, %r10 +; SSE2-NEXT: movsbq %al, %rax +; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3],xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] +; SSE2-NEXT: shlq $50, %r8 +; SSE2-NEXT: sarq $63, %r8 +; SSE2-NEXT: movd %r8d, %xmm8 +; SSE2-NEXT: shlq $51, %r9 +; SSE2-NEXT: sarq $63, %r9 +; SSE2-NEXT: movd %r9d, %xmm3 +; SSE2-NEXT: shlq $52, %r11 +; SSE2-NEXT: sarq $63, %r11 +; SSE2-NEXT: movd %r11d, %xmm9 +; SSE2-NEXT: shlq $53, %r14 +; SSE2-NEXT: sarq $63, %r14 +; SSE2-NEXT: movd %r14d, %xmm6 +; SSE2-NEXT: shlq $54, %r15 +; SSE2-NEXT: sarq $63, %r15 +; SSE2-NEXT: movd %r15d, %xmm10 +; SSE2-NEXT: shlq $55, %r12 +; SSE2-NEXT: sarq $63, %r12 +; SSE2-NEXT: movd %r12d, %xmm2 +; SSE2-NEXT: shlq $60, %r13 +; SSE2-NEXT: sarq $63, %r13 +; SSE2-NEXT: movd %r13d, %xmm11 +; SSE2-NEXT: shlq $61, %rdx +; SSE2-NEXT: sarq $63, %rdx +; SSE2-NEXT: movd %edx, %xmm5 +; SSE2-NEXT: shlq $62, %rsi +; SSE2-NEXT: sarq $63, %rsi +; SSE2-NEXT: movd %esi, %xmm12 +; SSE2-NEXT: shlq $63, %rcx +; SSE2-NEXT: sarq $63, %rcx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: shlq $58, %rbp +; SSE2-NEXT: sarq $63, %rbp +; SSE2-NEXT: movd %ebp, %xmm13 +; SSE2-NEXT: shlq $59, %rbx +; SSE2-NEXT: sarq $63, %rbx +; SSE2-NEXT: movd %ebx, %xmm7 +; SSE2-NEXT: shlq $57, %r10 +; SSE2-NEXT: sarq $63, %r10 +; SSE2-NEXT: movd %r10d, %xmm4 +; SSE2-NEXT: shrq $7, %rax +; SSE2-NEXT: movd %eax, %xmm14 +; SSE2-NEXT: movswq 2(%rdi), %rsi +; SSE2-NEXT: movq %rsi, %r8 +; SSE2-NEXT: movq %rsi, %r9 +; SSE2-NEXT: movq %rsi, %r10 +; SSE2-NEXT: movq %rsi, %r11 +; SSE2-NEXT: movq %rsi, %r14 +; SSE2-NEXT: movq %rsi, %r15 +; SSE2-NEXT: movq %rsi, %r12 +; SSE2-NEXT: movq %rsi, %r13 +; SSE2-NEXT: movq %rsi, %rbx +; SSE2-NEXT: movq %rsi, %rax +; SSE2-NEXT: movq %rsi, %rcx +; SSE2-NEXT: movq %rsi, %rdx +; SSE2-NEXT: movq %rsi, %rdi +; SSE2-NEXT: movq %rsi, %rbp +; SSE2-NEXT: shrq $15, %rbp +; SSE2-NEXT: movd %ebp, %xmm1 +; SSE2-NEXT: movq %rsi, %rbp +; SSE2-NEXT: movsbq %sil, %rsi +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3],xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7] +; SSE2-NEXT: punpcklbw {{.*#+}} 
xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7] +; SSE2-NEXT: shlq $49, %r8 +; SSE2-NEXT: sarq $63, %r8 +; SSE2-NEXT: movd %r8d, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE2-NEXT: shlq $50, %r9 +; SSE2-NEXT: sarq $63, %r9 +; SSE2-NEXT: movd %r9d, %xmm4 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE2-NEXT: shlq $51, %r10 +; SSE2-NEXT: sarq $63, %r10 +; SSE2-NEXT: movd %r10d, %xmm5 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE2-NEXT: shlq $52, %r11 +; SSE2-NEXT: sarq $63, %r11 +; SSE2-NEXT: movd %r11d, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSE2-NEXT: shlq $53, %r14 +; SSE2-NEXT: sarq $63, %r14 +; SSE2-NEXT: movd %r14d, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; SSE2-NEXT: shlq $54, %r15 +; SSE2-NEXT: sarq $63, %r15 +; SSE2-NEXT: movd %r15d, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; SSE2-NEXT: shlq $55, %r12 +; SSE2-NEXT: sarq $63, %r12 +; SSE2-NEXT: movd %r12d, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: shlq $60, %r13 +; SSE2-NEXT: sarq $63, %r13 +; SSE2-NEXT: movd %r13d, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE2-NEXT: shlq $61, %rbx +; SSE2-NEXT: sarq $63, %rbx +; SSE2-NEXT: movd %ebx, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE2-NEXT: shlq $62, %rax +; SSE2-NEXT: sarq $63, %rax +; SSE2-NEXT: movd %eax, %xmm6 +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; SSE2-NEXT: shlq $63, %rcx +; SSE2-NEXT: sarq $63, %rcx +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE2-NEXT: shlq $58, %rdx +; SSE2-NEXT: sarq $63, %rdx +; SSE2-NEXT: movd %edx, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE2-NEXT: shlq $59, %rdi +; SSE2-NEXT: sarq $63, %rdi +; SSE2-NEXT: movd %edi, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; 
SSE2-NEXT: shlq $57, %rbp +; SSE2-NEXT: sarq $63, %rbp +; SSE2-NEXT: movd %ebp, %xmm2 +; SSE2-NEXT: shrq $7, %rsi +; SSE2-NEXT: movd %esi, %xmm5 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: popq %r12 +; SSE2-NEXT: popq %r13 +; SSE2-NEXT: popq %r14 +; SSE2-NEXT: popq %r15 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_sext_32i1_to_32i8: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: pushq %rbp +; SSSE3-NEXT: pushq %r15 +; SSSE3-NEXT: pushq %r14 +; SSSE3-NEXT: pushq %r13 +; SSSE3-NEXT: pushq %r12 +; SSSE3-NEXT: pushq %rbx +; SSSE3-NEXT: movswq (%rdi), %rax +; SSSE3-NEXT: movq %rax, %r10 +; SSSE3-NEXT: movq %rax, %r8 +; SSSE3-NEXT: movq %rax, %r9 +; SSSE3-NEXT: movq %rax, %r11 +; SSSE3-NEXT: movq %rax, %r14 +; SSSE3-NEXT: movq %rax, %r15 +; SSSE3-NEXT: movq %rax, %r12 +; SSSE3-NEXT: movq %rax, %r13 +; SSSE3-NEXT: movq %rax, %rdx +; SSSE3-NEXT: movq %rax, %rsi +; SSSE3-NEXT: movq %rax, %rcx +; SSSE3-NEXT: movq %rax, %rbp +; SSSE3-NEXT: movq %rax, %rbx +; SSSE3-NEXT: shrq $15, %rbx +; SSSE3-NEXT: movd %ebx, %xmm0 +; SSSE3-NEXT: movq %rax, %rbx +; SSSE3-NEXT: shlq $49, %r10 +; SSSE3-NEXT: sarq $63, %r10 +; SSSE3-NEXT: movd %r10d, %xmm15 +; SSSE3-NEXT: movq %rax, %r10 +; SSSE3-NEXT: movsbq %al, %rax +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3],xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] +; SSSE3-NEXT: shlq $50, %r8 +; SSSE3-NEXT: sarq $63, %r8 +; SSSE3-NEXT: movd %r8d, %xmm8 +; SSSE3-NEXT: shlq $51, %r9 +; SSSE3-NEXT: sarq $63, %r9 +; SSSE3-NEXT: movd %r9d, %xmm3 +; SSSE3-NEXT: shlq $52, %r11 +; SSSE3-NEXT: sarq $63, %r11 +; SSSE3-NEXT: movd %r11d, %xmm9 +; SSSE3-NEXT: shlq $53, %r14 +; SSSE3-NEXT: sarq $63, %r14 +; SSSE3-NEXT: movd %r14d, %xmm6 +; SSSE3-NEXT: shlq $54, %r15 +; SSSE3-NEXT: sarq $63, %r15 +; SSSE3-NEXT: movd %r15d, %xmm10 +; SSSE3-NEXT: shlq $55, %r12 +; SSSE3-NEXT: sarq $63, %r12 +; SSSE3-NEXT: movd %r12d, %xmm2 +; SSSE3-NEXT: shlq $60, %r13 +; SSSE3-NEXT: sarq $63, %r13 +; SSSE3-NEXT: movd %r13d, %xmm11 +; SSSE3-NEXT: shlq $61, %rdx +; SSSE3-NEXT: sarq $63, %rdx +; SSSE3-NEXT: movd %edx, %xmm5 +; SSSE3-NEXT: shlq $62, %rsi +; SSSE3-NEXT: sarq $63, %rsi +; SSSE3-NEXT: movd %esi, %xmm12 +; SSSE3-NEXT: shlq $63, %rcx +; SSSE3-NEXT: sarq $63, %rcx +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: shlq $58, %rbp +; SSSE3-NEXT: sarq $63, %rbp +; SSSE3-NEXT: movd %ebp, %xmm13 +; SSSE3-NEXT: shlq $59, %rbx +; SSSE3-NEXT: sarq $63, %rbx +; SSSE3-NEXT: movd %ebx, %xmm7 +; SSSE3-NEXT: shlq $57, %r10 +; SSSE3-NEXT: sarq $63, %r10 +; SSSE3-NEXT: movd %r10d, %xmm4 +; SSSE3-NEXT: shrq $7, %rax +; SSSE3-NEXT: movd %eax, %xmm14 +; SSSE3-NEXT: movswq 2(%rdi), %rsi +; SSSE3-NEXT: movq %rsi, %r8 +; SSSE3-NEXT: movq %rsi, %r9 +; SSSE3-NEXT: movq %rsi, %r10 +; SSSE3-NEXT: movq %rsi, %r11 +; SSSE3-NEXT: movq %rsi, %r14 +; SSSE3-NEXT: movq %rsi, %r15 +; SSSE3-NEXT: movq %rsi, %r12 +; SSSE3-NEXT: movq %rsi, %r13 +; SSSE3-NEXT: movq %rsi, %rbx +; SSSE3-NEXT: movq %rsi, %rax +; SSSE3-NEXT: movq %rsi, %rcx +; SSSE3-NEXT: movq %rsi, %rdx +; SSSE3-NEXT: movq %rsi, %rdi +; SSSE3-NEXT: movq %rsi, %rbp +; SSSE3-NEXT: shrq 
$15, %rbp +; SSSE3-NEXT: movd %ebp, %xmm1 +; SSSE3-NEXT: movq %rsi, %rbp +; SSSE3-NEXT: movsbq %sil, %rsi +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3],xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7] +; SSSE3-NEXT: shlq $49, %r8 +; SSSE3-NEXT: sarq $63, %r8 +; SSSE3-NEXT: movd %r8d, %xmm3 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSSE3-NEXT: shlq $50, %r9 +; SSSE3-NEXT: sarq $63, %r9 +; SSSE3-NEXT: movd %r9d, %xmm4 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSSE3-NEXT: shlq $51, %r10 +; SSSE3-NEXT: sarq $63, %r10 +; SSSE3-NEXT: movd %r10d, %xmm5 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSSE3-NEXT: shlq $52, %r11 +; SSSE3-NEXT: sarq $63, %r11 +; SSSE3-NEXT: movd %r11d, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSSE3-NEXT: shlq $53, %r14 +; SSSE3-NEXT: sarq $63, %r14 +; SSSE3-NEXT: movd %r14d, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; SSSE3-NEXT: shlq $54, %r15 +; SSSE3-NEXT: sarq $63, %r15 +; SSSE3-NEXT: movd %r15d, %xmm4 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; SSSE3-NEXT: shlq $55, %r12 +; SSSE3-NEXT: sarq $63, %r12 +; SSSE3-NEXT: movd %r12d, %xmm3 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSSE3-NEXT: shlq $60, %r13 +; SSSE3-NEXT: sarq $63, %r13 +; SSSE3-NEXT: movd %r13d, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSSE3-NEXT: shlq $61, %rbx +; SSSE3-NEXT: sarq $63, %rbx +; SSSE3-NEXT: 
movd %ebx, %xmm4 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSSE3-NEXT: shlq $62, %rax +; SSSE3-NEXT: sarq $63, %rax +; SSSE3-NEXT: movd %eax, %xmm6 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; SSSE3-NEXT: shlq $63, %rcx +; SSSE3-NEXT: sarq $63, %rcx +; SSSE3-NEXT: movd %ecx, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSSE3-NEXT: shlq $58, %rdx +; SSSE3-NEXT: sarq $63, %rdx +; SSSE3-NEXT: movd %edx, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSSE3-NEXT: shlq $59, %rdi +; SSSE3-NEXT: sarq $63, %rdi +; SSSE3-NEXT: movd %edi, %xmm4 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSSE3-NEXT: shlq $57, %rbp +; SSSE3-NEXT: sarq $63, %rbp +; SSSE3-NEXT: movd %ebp, %xmm2 +; SSSE3-NEXT: shrq $7, %rsi +; SSSE3-NEXT: movd %esi, %xmm5 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSSE3-NEXT: popq %rbx +; SSSE3-NEXT: popq %r12 +; SSSE3-NEXT: popq %r13 +; SSSE3-NEXT: popq %r14 +; SSSE3-NEXT: popq %r15 +; SSSE3-NEXT: popq %rbp +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_sext_32i1_to_32i8: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: movswq (%rdi), %rax +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shlq $62, %rcx +; SSE41-NEXT: sarq $63, %rcx +; SSE41-NEXT: movq %rax, %rdx +; SSE41-NEXT: shlq $63, %rdx +; SSE41-NEXT: sarq $63, %rdx +; SSE41-NEXT: movd %edx, %xmm0 +; SSE41-NEXT: pinsrb $1, %ecx, %xmm0 +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shlq $61, %rcx +; SSE41-NEXT: sarq $63, %rcx +; SSE41-NEXT: pinsrb $2, %ecx, %xmm0 +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shlq $60, %rcx +; SSE41-NEXT: sarq $63, %rcx +; SSE41-NEXT: pinsrb $3, %ecx, %xmm0 +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shlq $59, %rcx +; SSE41-NEXT: sarq $63, %rcx +; SSE41-NEXT: pinsrb $4, %ecx, %xmm0 +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shlq $58, %rcx +; SSE41-NEXT: sarq $63, %rcx +; SSE41-NEXT: pinsrb $5, %ecx, %xmm0 +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shlq $57, %rcx +; SSE41-NEXT: sarq $63, %rcx +; SSE41-NEXT: pinsrb $6, %ecx, %xmm0 +; SSE41-NEXT: movsbq %al, %rcx +; SSE41-NEXT: shrq $7, %rcx +; SSE41-NEXT: pinsrb $7, %ecx, %xmm0 +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shlq $55, %rcx +; SSE41-NEXT: sarq $63, %rcx +; SSE41-NEXT: pinsrb $8, %ecx, %xmm0 +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shlq $54, %rcx +; SSE41-NEXT: sarq $63, %rcx +; SSE41-NEXT: pinsrb $9, %ecx, %xmm0 +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shlq $53, %rcx +; SSE41-NEXT: sarq $63, %rcx +; SSE41-NEXT: pinsrb $10, %ecx, %xmm0 +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shlq $52, %rcx +; SSE41-NEXT: sarq $63, %rcx +; SSE41-NEXT: pinsrb $11, %ecx, %xmm0 +; SSE41-NEXT: movq %rax, %rcx 
+; SSE41-NEXT: shlq $51, %rcx +; SSE41-NEXT: sarq $63, %rcx +; SSE41-NEXT: pinsrb $12, %ecx, %xmm0 +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shlq $50, %rcx +; SSE41-NEXT: sarq $63, %rcx +; SSE41-NEXT: pinsrb $13, %ecx, %xmm0 +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shlq $49, %rcx +; SSE41-NEXT: sarq $63, %rcx +; SSE41-NEXT: pinsrb $14, %ecx, %xmm0 +; SSE41-NEXT: shrq $15, %rax +; SSE41-NEXT: pinsrb $15, %eax, %xmm0 +; SSE41-NEXT: movswq 2(%rdi), %rax +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shlq $62, %rcx +; SSE41-NEXT: sarq $63, %rcx +; SSE41-NEXT: movq %rax, %rdx +; SSE41-NEXT: shlq $63, %rdx +; SSE41-NEXT: sarq $63, %rdx +; SSE41-NEXT: movd %edx, %xmm1 +; SSE41-NEXT: pinsrb $1, %ecx, %xmm1 +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shlq $61, %rcx +; SSE41-NEXT: sarq $63, %rcx +; SSE41-NEXT: pinsrb $2, %ecx, %xmm1 +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shlq $60, %rcx +; SSE41-NEXT: sarq $63, %rcx +; SSE41-NEXT: pinsrb $3, %ecx, %xmm1 +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shlq $59, %rcx +; SSE41-NEXT: sarq $63, %rcx +; SSE41-NEXT: pinsrb $4, %ecx, %xmm1 +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shlq $58, %rcx +; SSE41-NEXT: sarq $63, %rcx +; SSE41-NEXT: pinsrb $5, %ecx, %xmm1 +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shlq $57, %rcx +; SSE41-NEXT: sarq $63, %rcx +; SSE41-NEXT: pinsrb $6, %ecx, %xmm1 +; SSE41-NEXT: movsbq %al, %rcx +; SSE41-NEXT: shrq $7, %rcx +; SSE41-NEXT: pinsrb $7, %ecx, %xmm1 +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shlq $55, %rcx +; SSE41-NEXT: sarq $63, %rcx +; SSE41-NEXT: pinsrb $8, %ecx, %xmm1 +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shlq $54, %rcx +; SSE41-NEXT: sarq $63, %rcx +; SSE41-NEXT: pinsrb $9, %ecx, %xmm1 +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shlq $53, %rcx +; SSE41-NEXT: sarq $63, %rcx +; SSE41-NEXT: pinsrb $10, %ecx, %xmm1 +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shlq $52, %rcx +; SSE41-NEXT: sarq $63, %rcx +; SSE41-NEXT: pinsrb $11, %ecx, %xmm1 +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shlq $51, %rcx +; SSE41-NEXT: sarq $63, %rcx +; SSE41-NEXT: pinsrb $12, %ecx, %xmm1 +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shlq $50, %rcx +; SSE41-NEXT: sarq $63, %rcx +; SSE41-NEXT: pinsrb $13, %ecx, %xmm1 +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shlq $49, %rcx +; SSE41-NEXT: sarq $63, %rcx +; SSE41-NEXT: pinsrb $14, %ecx, %xmm1 +; SSE41-NEXT: shrq $15, %rax +; SSE41-NEXT: pinsrb $15, %eax, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: load_sext_32i1_to_32i8: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: pushq %r15 +; AVX1-NEXT: pushq %r14 +; AVX1-NEXT: pushq %r13 +; AVX1-NEXT: pushq %r12 +; AVX1-NEXT: pushq %rbx +; AVX1-NEXT: movslq (%rdi), %rax +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shlq $47, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: vmovd %ecx, %xmm0 +; AVX1-NEXT: movq %rax, %r8 +; AVX1-NEXT: movq %rax, %rdx +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: movq %rax, %rdi +; AVX1-NEXT: movq %rax, %r13 +; AVX1-NEXT: movq %rax, %rsi +; AVX1-NEXT: movq %rax, %r10 +; AVX1-NEXT: movq %rax, %r11 +; AVX1-NEXT: movq %rax, %r9 +; AVX1-NEXT: movq %rax, %rbx +; AVX1-NEXT: movq %rax, %r14 +; AVX1-NEXT: movq %rax, %r15 +; AVX1-NEXT: movq %rax, %r12 +; AVX1-NEXT: movq %rax, %rbp +; AVX1-NEXT: shlq $46, %rbp +; AVX1-NEXT: sarq $63, %rbp +; AVX1-NEXT: vpinsrb $1, %ebp, %xmm0, %xmm0 +; AVX1-NEXT: movq %rax, %rbp +; AVX1-NEXT: shlq $45, %r8 +; AVX1-NEXT: sarq $63, %r8 +; AVX1-NEXT: vpinsrb $2, %r8d, %xmm0, %xmm0 +; AVX1-NEXT: movq %rax, %r8 +; AVX1-NEXT: shlq $44, %rdx +; AVX1-NEXT: 
sarq $63, %rdx +; AVX1-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0 +; AVX1-NEXT: movq %rax, %rdx +; AVX1-NEXT: shlq $43, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shlq $42, %rdi +; AVX1-NEXT: sarq $63, %rdi +; AVX1-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0 +; AVX1-NEXT: movq %rax, %rdi +; AVX1-NEXT: shlq $41, %r13 +; AVX1-NEXT: sarq $63, %r13 +; AVX1-NEXT: vpinsrb $6, %r13d, %xmm0, %xmm0 +; AVX1-NEXT: movq %rax, %r13 +; AVX1-NEXT: shlq $40, %rsi +; AVX1-NEXT: sarq $63, %rsi +; AVX1-NEXT: vpinsrb $7, %esi, %xmm0, %xmm0 +; AVX1-NEXT: movq %rax, %rsi +; AVX1-NEXT: shlq $39, %r10 +; AVX1-NEXT: sarq $63, %r10 +; AVX1-NEXT: vpinsrb $8, %r10d, %xmm0, %xmm0 +; AVX1-NEXT: movq %rax, %r10 +; AVX1-NEXT: shlq $38, %r11 +; AVX1-NEXT: sarq $63, %r11 +; AVX1-NEXT: vpinsrb $9, %r11d, %xmm0, %xmm0 +; AVX1-NEXT: movsbq %al, %r11 +; AVX1-NEXT: shlq $37, %r9 +; AVX1-NEXT: sarq $63, %r9 +; AVX1-NEXT: vpinsrb $10, %r9d, %xmm0, %xmm0 +; AVX1-NEXT: movq %rax, %r9 +; AVX1-NEXT: shlq $36, %rbx +; AVX1-NEXT: sarq $63, %rbx +; AVX1-NEXT: vpinsrb $11, %ebx, %xmm0, %xmm0 +; AVX1-NEXT: movq %rax, %rbx +; AVX1-NEXT: shlq $35, %r14 +; AVX1-NEXT: sarq $63, %r14 +; AVX1-NEXT: vpinsrb $12, %r14d, %xmm0, %xmm0 +; AVX1-NEXT: movq %rax, %r14 +; AVX1-NEXT: shlq $34, %r15 +; AVX1-NEXT: sarq $63, %r15 +; AVX1-NEXT: vpinsrb $13, %r15d, %xmm0, %xmm0 +; AVX1-NEXT: movq %rax, %r15 +; AVX1-NEXT: shlq $33, %r12 +; AVX1-NEXT: sarq $63, %r12 +; AVX1-NEXT: vpinsrb $14, %r12d, %xmm0, %xmm0 +; AVX1-NEXT: movq %rax, %r12 +; AVX1-NEXT: shrq $31, %rbp +; AVX1-NEXT: vpinsrb $15, %ebp, %xmm0, %xmm0 +; AVX1-NEXT: movq %rax, %rbp +; AVX1-NEXT: shlq $63, %rdx +; AVX1-NEXT: sarq $63, %rdx +; AVX1-NEXT: vmovd %edx, %xmm1 +; AVX1-NEXT: movq %rax, %rdx +; AVX1-NEXT: movswq %ax, %rax +; AVX1-NEXT: shlq $62, %r8 +; AVX1-NEXT: sarq $63, %r8 +; AVX1-NEXT: vpinsrb $1, %r8d, %xmm1, %xmm1 +; AVX1-NEXT: shlq $61, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; AVX1-NEXT: shlq $60, %rdi +; AVX1-NEXT: sarq $63, %rdi +; AVX1-NEXT: vpinsrb $3, %edi, %xmm1, %xmm1 +; AVX1-NEXT: shlq $59, %r13 +; AVX1-NEXT: sarq $63, %r13 +; AVX1-NEXT: vpinsrb $4, %r13d, %xmm1, %xmm1 +; AVX1-NEXT: shlq $58, %rsi +; AVX1-NEXT: sarq $63, %rsi +; AVX1-NEXT: vpinsrb $5, %esi, %xmm1, %xmm1 +; AVX1-NEXT: shlq $57, %r10 +; AVX1-NEXT: sarq $63, %r10 +; AVX1-NEXT: vpinsrb $6, %r10d, %xmm1, %xmm1 +; AVX1-NEXT: shrq $7, %r11 +; AVX1-NEXT: vpinsrb $7, %r11d, %xmm1, %xmm1 +; AVX1-NEXT: shlq $55, %r9 +; AVX1-NEXT: sarq $63, %r9 +; AVX1-NEXT: vpinsrb $8, %r9d, %xmm1, %xmm1 +; AVX1-NEXT: shlq $54, %rbx +; AVX1-NEXT: sarq $63, %rbx +; AVX1-NEXT: vpinsrb $9, %ebx, %xmm1, %xmm1 +; AVX1-NEXT: shlq $53, %r14 +; AVX1-NEXT: sarq $63, %r14 +; AVX1-NEXT: vpinsrb $10, %r14d, %xmm1, %xmm1 +; AVX1-NEXT: shlq $52, %r15 +; AVX1-NEXT: sarq $63, %r15 +; AVX1-NEXT: vpinsrb $11, %r15d, %xmm1, %xmm1 +; AVX1-NEXT: shlq $51, %r12 +; AVX1-NEXT: sarq $63, %r12 +; AVX1-NEXT: vpinsrb $12, %r12d, %xmm1, %xmm1 +; AVX1-NEXT: shlq $50, %rbp +; AVX1-NEXT: sarq $63, %rbp +; AVX1-NEXT: vpinsrb $13, %ebp, %xmm1, %xmm1 +; AVX1-NEXT: shlq $49, %rdx +; AVX1-NEXT: sarq $63, %rdx +; AVX1-NEXT: vpinsrb $14, %edx, %xmm1, %xmm1 +; AVX1-NEXT: shrq $15, %rax +; AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: popq %rbx +; AVX1-NEXT: popq %r12 +; AVX1-NEXT: popq %r13 +; AVX1-NEXT: popq %r14 +; AVX1-NEXT: popq %r15 +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX2-LABEL: 
load_sext_32i1_to_32i8: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %r13 +; AVX2-NEXT: pushq %r12 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: movslq (%rdi), %rax +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shlq $47, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: vmovd %ecx, %xmm0 +; AVX2-NEXT: movq %rax, %r8 +; AVX2-NEXT: movq %rax, %rdx +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: movq %rax, %rdi +; AVX2-NEXT: movq %rax, %r13 +; AVX2-NEXT: movq %rax, %rsi +; AVX2-NEXT: movq %rax, %r10 +; AVX2-NEXT: movq %rax, %r11 +; AVX2-NEXT: movq %rax, %r9 +; AVX2-NEXT: movq %rax, %rbx +; AVX2-NEXT: movq %rax, %r14 +; AVX2-NEXT: movq %rax, %r15 +; AVX2-NEXT: movq %rax, %r12 +; AVX2-NEXT: movq %rax, %rbp +; AVX2-NEXT: shlq $46, %rbp +; AVX2-NEXT: sarq $63, %rbp +; AVX2-NEXT: vpinsrb $1, %ebp, %xmm0, %xmm0 +; AVX2-NEXT: movq %rax, %rbp +; AVX2-NEXT: shlq $45, %r8 +; AVX2-NEXT: sarq $63, %r8 +; AVX2-NEXT: vpinsrb $2, %r8d, %xmm0, %xmm0 +; AVX2-NEXT: movq %rax, %r8 +; AVX2-NEXT: shlq $44, %rdx +; AVX2-NEXT: sarq $63, %rdx +; AVX2-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0 +; AVX2-NEXT: movq %rax, %rdx +; AVX2-NEXT: shlq $43, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shlq $42, %rdi +; AVX2-NEXT: sarq $63, %rdi +; AVX2-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0 +; AVX2-NEXT: movq %rax, %rdi +; AVX2-NEXT: shlq $41, %r13 +; AVX2-NEXT: sarq $63, %r13 +; AVX2-NEXT: vpinsrb $6, %r13d, %xmm0, %xmm0 +; AVX2-NEXT: movq %rax, %r13 +; AVX2-NEXT: shlq $40, %rsi +; AVX2-NEXT: sarq $63, %rsi +; AVX2-NEXT: vpinsrb $7, %esi, %xmm0, %xmm0 +; AVX2-NEXT: movq %rax, %rsi +; AVX2-NEXT: shlq $39, %r10 +; AVX2-NEXT: sarq $63, %r10 +; AVX2-NEXT: vpinsrb $8, %r10d, %xmm0, %xmm0 +; AVX2-NEXT: movq %rax, %r10 +; AVX2-NEXT: shlq $38, %r11 +; AVX2-NEXT: sarq $63, %r11 +; AVX2-NEXT: vpinsrb $9, %r11d, %xmm0, %xmm0 +; AVX2-NEXT: movsbq %al, %r11 +; AVX2-NEXT: shlq $37, %r9 +; AVX2-NEXT: sarq $63, %r9 +; AVX2-NEXT: vpinsrb $10, %r9d, %xmm0, %xmm0 +; AVX2-NEXT: movq %rax, %r9 +; AVX2-NEXT: shlq $36, %rbx +; AVX2-NEXT: sarq $63, %rbx +; AVX2-NEXT: vpinsrb $11, %ebx, %xmm0, %xmm0 +; AVX2-NEXT: movq %rax, %rbx +; AVX2-NEXT: shlq $35, %r14 +; AVX2-NEXT: sarq $63, %r14 +; AVX2-NEXT: vpinsrb $12, %r14d, %xmm0, %xmm0 +; AVX2-NEXT: movq %rax, %r14 +; AVX2-NEXT: shlq $34, %r15 +; AVX2-NEXT: sarq $63, %r15 +; AVX2-NEXT: vpinsrb $13, %r15d, %xmm0, %xmm0 +; AVX2-NEXT: movq %rax, %r15 +; AVX2-NEXT: shlq $33, %r12 +; AVX2-NEXT: sarq $63, %r12 +; AVX2-NEXT: vpinsrb $14, %r12d, %xmm0, %xmm0 +; AVX2-NEXT: movq %rax, %r12 +; AVX2-NEXT: shrq $31, %rbp +; AVX2-NEXT: vpinsrb $15, %ebp, %xmm0, %xmm0 +; AVX2-NEXT: movq %rax, %rbp +; AVX2-NEXT: shlq $63, %rdx +; AVX2-NEXT: sarq $63, %rdx +; AVX2-NEXT: vmovd %edx, %xmm1 +; AVX2-NEXT: movq %rax, %rdx +; AVX2-NEXT: movswq %ax, %rax +; AVX2-NEXT: shlq $62, %r8 +; AVX2-NEXT: sarq $63, %r8 +; AVX2-NEXT: vpinsrb $1, %r8d, %xmm1, %xmm1 +; AVX2-NEXT: shlq $61, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; AVX2-NEXT: shlq $60, %rdi +; AVX2-NEXT: sarq $63, %rdi +; AVX2-NEXT: vpinsrb $3, %edi, %xmm1, %xmm1 +; AVX2-NEXT: shlq $59, %r13 +; AVX2-NEXT: sarq $63, %r13 +; AVX2-NEXT: vpinsrb $4, %r13d, %xmm1, %xmm1 +; AVX2-NEXT: shlq $58, %rsi +; AVX2-NEXT: sarq $63, %rsi +; AVX2-NEXT: vpinsrb $5, %esi, %xmm1, %xmm1 +; AVX2-NEXT: shlq $57, %r10 +; AVX2-NEXT: sarq $63, %r10 +; AVX2-NEXT: vpinsrb $6, %r10d, %xmm1, %xmm1 +; AVX2-NEXT: shrq $7, %r11 +; 
AVX2-NEXT: vpinsrb $7, %r11d, %xmm1, %xmm1 +; AVX2-NEXT: shlq $55, %r9 +; AVX2-NEXT: sarq $63, %r9 +; AVX2-NEXT: vpinsrb $8, %r9d, %xmm1, %xmm1 +; AVX2-NEXT: shlq $54, %rbx +; AVX2-NEXT: sarq $63, %rbx +; AVX2-NEXT: vpinsrb $9, %ebx, %xmm1, %xmm1 +; AVX2-NEXT: shlq $53, %r14 +; AVX2-NEXT: sarq $63, %r14 +; AVX2-NEXT: vpinsrb $10, %r14d, %xmm1, %xmm1 +; AVX2-NEXT: shlq $52, %r15 +; AVX2-NEXT: sarq $63, %r15 +; AVX2-NEXT: vpinsrb $11, %r15d, %xmm1, %xmm1 +; AVX2-NEXT: shlq $51, %r12 +; AVX2-NEXT: sarq $63, %r12 +; AVX2-NEXT: vpinsrb $12, %r12d, %xmm1, %xmm1 +; AVX2-NEXT: shlq $50, %rbp +; AVX2-NEXT: sarq $63, %rbp +; AVX2-NEXT: vpinsrb $13, %ebp, %xmm1, %xmm1 +; AVX2-NEXT: shlq $49, %rdx +; AVX2-NEXT: sarq $63, %rdx +; AVX2-NEXT: vpinsrb $14, %edx, %xmm1, %xmm1 +; AVX2-NEXT: shrq $15, %rax +; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r12 +; AVX2-NEXT: popq %r13 +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512F-LABEL: load_sext_32i1_to_32i8: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: kmovw (%rdi), %k1 +; AVX512F-NEXT: kmovw 2(%rdi), %k2 +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} +; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: load_sext_32i1_to_32i8: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: kmovd (%rdi), %k0 +; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512BW-NEXT: retq +; +; X32-SSE41-LABEL: load_sext_32i1_to_32i8: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: pushl %esi +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: movswl (%eax), %ecx +; X32-SSE41-NEXT: movl %ecx, %edx +; X32-SSE41-NEXT: shll $30, %edx +; X32-SSE41-NEXT: sarl $31, %edx +; X32-SSE41-NEXT: movl %ecx, %esi +; X32-SSE41-NEXT: shll $31, %esi +; X32-SSE41-NEXT: sarl $31, %esi +; X32-SSE41-NEXT: movd %esi, %xmm0 +; X32-SSE41-NEXT: pinsrb $1, %edx, %xmm0 +; X32-SSE41-NEXT: movl %ecx, %edx +; X32-SSE41-NEXT: shll $29, %edx +; X32-SSE41-NEXT: sarl $31, %edx +; X32-SSE41-NEXT: pinsrb $2, %edx, %xmm0 +; X32-SSE41-NEXT: movl %ecx, %edx +; X32-SSE41-NEXT: shll $28, %edx +; X32-SSE41-NEXT: sarl $31, %edx +; X32-SSE41-NEXT: pinsrb $3, %edx, %xmm0 +; X32-SSE41-NEXT: movl %ecx, %edx +; X32-SSE41-NEXT: shll $27, %edx +; X32-SSE41-NEXT: sarl $31, %edx +; X32-SSE41-NEXT: pinsrb $4, %edx, %xmm0 +; X32-SSE41-NEXT: movl %ecx, %edx +; X32-SSE41-NEXT: shll $26, %edx +; X32-SSE41-NEXT: sarl $31, %edx +; X32-SSE41-NEXT: pinsrb $5, %edx, %xmm0 +; X32-SSE41-NEXT: movl %ecx, %edx +; X32-SSE41-NEXT: shll $25, %edx +; X32-SSE41-NEXT: sarl $31, %edx +; X32-SSE41-NEXT: pinsrb $6, %edx, %xmm0 +; X32-SSE41-NEXT: movsbl %cl, %edx +; X32-SSE41-NEXT: shrl $7, %edx +; X32-SSE41-NEXT: pinsrb $7, %edx, %xmm0 +; X32-SSE41-NEXT: movl %ecx, %edx +; X32-SSE41-NEXT: shll $23, %edx +; X32-SSE41-NEXT: sarl $31, %edx +; X32-SSE41-NEXT: pinsrb $8, %edx, %xmm0 +; X32-SSE41-NEXT: movl %ecx, %edx +; X32-SSE41-NEXT: shll $22, %edx +; X32-SSE41-NEXT: sarl $31, %edx +; X32-SSE41-NEXT: pinsrb $9, %edx, %xmm0 +; X32-SSE41-NEXT: movl %ecx, %edx +; X32-SSE41-NEXT: shll $21, %edx +; X32-SSE41-NEXT: sarl $31, %edx +; X32-SSE41-NEXT: pinsrb $10, %edx, %xmm0 +; X32-SSE41-NEXT: movl %ecx, %edx +; X32-SSE41-NEXT: shll $20, %edx +; 
X32-SSE41-NEXT: sarl $31, %edx +; X32-SSE41-NEXT: pinsrb $11, %edx, %xmm0 +; X32-SSE41-NEXT: movl %ecx, %edx +; X32-SSE41-NEXT: shll $19, %edx +; X32-SSE41-NEXT: sarl $31, %edx +; X32-SSE41-NEXT: pinsrb $12, %edx, %xmm0 +; X32-SSE41-NEXT: movl %ecx, %edx +; X32-SSE41-NEXT: shll $18, %edx +; X32-SSE41-NEXT: sarl $31, %edx +; X32-SSE41-NEXT: pinsrb $13, %edx, %xmm0 +; X32-SSE41-NEXT: movl %ecx, %edx +; X32-SSE41-NEXT: shll $17, %edx +; X32-SSE41-NEXT: sarl $31, %edx +; X32-SSE41-NEXT: pinsrb $14, %edx, %xmm0 +; X32-SSE41-NEXT: shrl $15, %ecx +; X32-SSE41-NEXT: pinsrb $15, %ecx, %xmm0 +; X32-SSE41-NEXT: movswl 2(%eax), %eax +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shll $30, %ecx +; X32-SSE41-NEXT: sarl $31, %ecx +; X32-SSE41-NEXT: movl %eax, %edx +; X32-SSE41-NEXT: shll $31, %edx +; X32-SSE41-NEXT: sarl $31, %edx +; X32-SSE41-NEXT: movd %edx, %xmm1 +; X32-SSE41-NEXT: pinsrb $1, %ecx, %xmm1 +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shll $29, %ecx +; X32-SSE41-NEXT: sarl $31, %ecx +; X32-SSE41-NEXT: pinsrb $2, %ecx, %xmm1 +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shll $28, %ecx +; X32-SSE41-NEXT: sarl $31, %ecx +; X32-SSE41-NEXT: pinsrb $3, %ecx, %xmm1 +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shll $27, %ecx +; X32-SSE41-NEXT: sarl $31, %ecx +; X32-SSE41-NEXT: pinsrb $4, %ecx, %xmm1 +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shll $26, %ecx +; X32-SSE41-NEXT: sarl $31, %ecx +; X32-SSE41-NEXT: pinsrb $5, %ecx, %xmm1 +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shll $25, %ecx +; X32-SSE41-NEXT: sarl $31, %ecx +; X32-SSE41-NEXT: pinsrb $6, %ecx, %xmm1 +; X32-SSE41-NEXT: movsbl %al, %ecx +; X32-SSE41-NEXT: shrl $7, %ecx +; X32-SSE41-NEXT: pinsrb $7, %ecx, %xmm1 +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shll $23, %ecx +; X32-SSE41-NEXT: sarl $31, %ecx +; X32-SSE41-NEXT: pinsrb $8, %ecx, %xmm1 +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shll $22, %ecx +; X32-SSE41-NEXT: sarl $31, %ecx +; X32-SSE41-NEXT: pinsrb $9, %ecx, %xmm1 +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shll $21, %ecx +; X32-SSE41-NEXT: sarl $31, %ecx +; X32-SSE41-NEXT: pinsrb $10, %ecx, %xmm1 +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shll $20, %ecx +; X32-SSE41-NEXT: sarl $31, %ecx +; X32-SSE41-NEXT: pinsrb $11, %ecx, %xmm1 +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shll $19, %ecx +; X32-SSE41-NEXT: sarl $31, %ecx +; X32-SSE41-NEXT: pinsrb $12, %ecx, %xmm1 +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shll $18, %ecx +; X32-SSE41-NEXT: sarl $31, %ecx +; X32-SSE41-NEXT: pinsrb $13, %ecx, %xmm1 +; X32-SSE41-NEXT: movl %eax, %ecx +; X32-SSE41-NEXT: shll $17, %ecx +; X32-SSE41-NEXT: sarl $31, %ecx +; X32-SSE41-NEXT: pinsrb $14, %ecx, %xmm1 +; X32-SSE41-NEXT: shrl $15, %eax +; X32-SSE41-NEXT: pinsrb $15, %eax, %xmm1 +; X32-SSE41-NEXT: popl %esi +; X32-SSE41-NEXT: retl +entry: + %X = load <32 x i1>, <32 x i1>* %ptr + %Y = sext <32 x i1> %X to <32 x i8> + ret <32 x i8> %Y +} + +define <16 x i16> @load_sext_16i8_to_16i16(<16 x i8> *%ptr) { +; SSE2-LABEL: load_sext_16i8_to_16i16: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: psraw $8, %xmm0 +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: psraw $8, %xmm1 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_sext_16i8_to_16i16: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: 
movq {{.*#+}} xmm0 = mem[0],zero +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: psraw $8, %xmm0 +; SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: psraw $8, %xmm1 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_sext_16i8_to_16i16: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxbw (%rdi), %xmm0 +; SSE41-NEXT: pmovsxbw 8(%rdi), %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: load_sext_16i8_to_16i16: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovsxbw 8(%rdi), %xmm0 +; AVX1-NEXT: vpmovsxbw (%rdi), %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_sext_16i8_to_16i16: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovsxbw (%rdi), %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_sext_16i8_to_16i16: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovsxbw (%rdi), %ymm0 +; AVX512-NEXT: retq +; +; X32-SSE41-LABEL: load_sext_16i8_to_16i16: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: pmovsxbw (%eax), %xmm0 +; X32-SSE41-NEXT: pmovsxbw 8(%eax), %xmm1 +; X32-SSE41-NEXT: retl +entry: + %X = load <16 x i8>, <16 x i8>* %ptr + %Y = sext <16 x i8> %X to <16 x i16> + ret <16 x i16> %Y +} + +define <2 x i64> @load_sext_2i16_to_2i64(<2 x i16> *%ptr) { +; SSE2-LABEL: load_sext_2i16_to_2i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_sext_2i16_to_2i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: psrad $16, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_sext_2i16_to_2i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxwq (%rdi), %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: load_sext_2i16_to_2i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpmovsxwq (%rdi), %xmm0 +; AVX-NEXT: retq +; +; X32-SSE41-LABEL: load_sext_2i16_to_2i64: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: pmovsxwq (%eax), %xmm0 +; X32-SSE41-NEXT: retl +entry: + %X = load <2 x i16>, <2 x i16>* %ptr + %Y = sext <2 x i16> %X to <2 x i64> + ret <2 x i64> %Y +} + +define <4 x i32> @load_sext_4i16_to_4i32(<4 x i16> *%ptr) { +; SSE2-LABEL: load_sext_4i16_to_4i32: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_sext_4i16_to_4i32: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: psrad $16, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_sext_4i16_to_4i32: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxwd (%rdi), %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: load_sext_4i16_to_4i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpmovsxwd (%rdi), %xmm0 +; AVX-NEXT: retq +; +; X32-SSE41-LABEL: load_sext_4i16_to_4i32: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: movl 
{{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: pmovsxwd (%eax), %xmm0 +; X32-SSE41-NEXT: retl +entry: + %X = load <4 x i16>, <4 x i16>* %ptr + %Y = sext <4 x i16> %X to <4 x i32> + ret <4 x i32> %Y +} + +define <4 x i64> @load_sext_4i16_to_4i64(<4 x i16> *%ptr) { +; SSE2-LABEL: load_sext_4i16_to_4i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movswq 2(%rdi), %rax +; SSE2-NEXT: movq %rax, %xmm1 +; SSE2-NEXT: movswq (%rdi), %rax +; SSE2-NEXT: movq %rax, %xmm0 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: movswq 6(%rdi), %rax +; SSE2-NEXT: movq %rax, %xmm2 +; SSE2-NEXT: movswq 4(%rdi), %rax +; SSE2-NEXT: movq %rax, %xmm1 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_sext_4i16_to_4i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movswq 2(%rdi), %rax +; SSSE3-NEXT: movq %rax, %xmm1 +; SSSE3-NEXT: movswq (%rdi), %rax +; SSSE3-NEXT: movq %rax, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: movswq 6(%rdi), %rax +; SSSE3-NEXT: movq %rax, %xmm2 +; SSSE3-NEXT: movswq 4(%rdi), %rax +; SSSE3-NEXT: movq %rax, %xmm1 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_sext_4i16_to_4i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxwq (%rdi), %xmm0 +; SSE41-NEXT: pmovsxwq 4(%rdi), %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: load_sext_4i16_to_4i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovsxwd (%rdi), %xmm0 +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_sext_4i16_to_4i64: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovsxwq (%rdi), %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_sext_4i16_to_4i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovsxwq (%rdi), %ymm0 +; AVX512-NEXT: retq +; +; X32-SSE41-LABEL: load_sext_4i16_to_4i64: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: pmovsxwq (%eax), %xmm0 +; X32-SSE41-NEXT: pmovsxwq 4(%eax), %xmm1 +; X32-SSE41-NEXT: retl +entry: + %X = load <4 x i16>, <4 x i16>* %ptr + %Y = sext <4 x i16> %X to <4 x i64> + ret <4 x i64> %Y +} + +define <8 x i32> @load_sext_8i16_to_8i32(<8 x i16> *%ptr) { +; SSE2-LABEL: load_sext_8i16_to_8i32: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_sext_8i16_to_8i32: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: psrad $16, %xmm0 +; SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: psrad $16, %xmm1 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_sext_8i16_to_8i32: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxwd (%rdi), %xmm0 +; SSE41-NEXT: pmovsxwd 8(%rdi), %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: load_sext_8i16_to_8i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovsxwd 8(%rdi), %xmm0 +; AVX1-NEXT: vpmovsxwd (%rdi), %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_sext_8i16_to_8i32: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovsxwd 
(%rdi), %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_sext_8i16_to_8i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovsxwd (%rdi), %ymm0 +; AVX512-NEXT: retq +; +; X32-SSE41-LABEL: load_sext_8i16_to_8i32: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: pmovsxwd (%eax), %xmm0 +; X32-SSE41-NEXT: pmovsxwd 8(%eax), %xmm1 +; X32-SSE41-NEXT: retl +entry: + %X = load <8 x i16>, <8 x i16>* %ptr + %Y = sext <8 x i16> %X to <8 x i32> + ret <8 x i32> %Y +} + +define <2 x i64> @load_sext_2i32_to_2i64(<2 x i32> *%ptr) { +; SSE2-LABEL: load_sext_2i32_to_2i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_sext_2i32_to_2i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_sext_2i32_to_2i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxdq (%rdi), %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: load_sext_2i32_to_2i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpmovsxdq (%rdi), %xmm0 +; AVX-NEXT: retq +; +; X32-SSE41-LABEL: load_sext_2i32_to_2i64: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: pmovsxdq (%eax), %xmm0 +; X32-SSE41-NEXT: retl +entry: + %X = load <2 x i32>, <2 x i32>* %ptr + %Y = sext <2 x i32> %X to <2 x i64> + ret <2 x i64> %Y +} + +define <4 x i64> @load_sext_4i32_to_4i64(<4 x i32> *%ptr) { +; SSE2-LABEL: load_sext_4i32_to_4i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_sext_4i32_to_4i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa (%rdi), %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_sext_4i32_to_4i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxdq (%rdi), %xmm0 +; SSE41-NEXT: pmovsxdq 8(%rdi), %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: load_sext_4i32_to_4i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovsxdq 8(%rdi), %xmm0 +; AVX1-NEXT: vpmovsxdq (%rdi), %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_sext_4i32_to_4i64: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovsxdq (%rdi), %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_sext_4i32_to_4i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovsxdq (%rdi), %ymm0 +; AVX512-NEXT: retq +; +; X32-SSE41-LABEL: load_sext_4i32_to_4i64: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: pmovsxdq (%eax), %xmm0 +; X32-SSE41-NEXT: pmovsxdq 8(%eax), %xmm1 +; X32-SSE41-NEXT: retl +entry: + %X = load <4 x i32>, <4 x 
i32>* %ptr + %Y = sext <4 x i32> %X to <4 x i64> + ret <4 x i64> %Y +} + +define i32 @sext_2i8_to_i32(<16 x i8> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: sext_2i8_to_i32: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: psraw $8, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_2i8_to_i32: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: psraw $8, %xmm0 +; SSSE3-NEXT: movd %xmm0, %eax +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_2i8_to_i32: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovsxbw %xmm0, %xmm0 +; SSE41-NEXT: movd %xmm0, %eax +; SSE41-NEXT: retq +; +; AVX-LABEL: sext_2i8_to_i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpmovsxbw %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: retq +; +; X32-SSE41-LABEL: sext_2i8_to_i32: +; X32-SSE41: # %bb.0: # %entry +; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm0 +; X32-SSE41-NEXT: movd %xmm0, %eax +; X32-SSE41-NEXT: retl +entry: + %Shuf = shufflevector <16 x i8> %A, <16 x i8> undef, <2 x i32> + %Ex = sext <2 x i8> %Shuf to <2 x i16> + %Bc = bitcast <2 x i16> %Ex to i32 + ret i32 %Bc +} + +define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) { +; SSE2-LABEL: sext_4i1_to_4i64: +; SSE2: # %bb.0: +; SSE2-NEXT: pslld $31, %xmm0 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_4i1_to_4i64: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pslld $31, %xmm0 +; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_4i1_to_4i64: +; SSE41: # %bb.0: +; SSE41-NEXT: pslld $31, %xmm0 +; SSE41-NEXT: psrad $31, %xmm0 +; SSE41-NEXT: pmovsxdq %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovsxdq %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: sext_4i1_to_4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: sext_4i1_to_4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: sext_4i1_to_4i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX512-NEXT: vpsrad $31, %xmm0, %xmm0 +; AVX512-NEXT: vpmovsxdq %xmm0, %ymm0 +; AVX512-NEXT: retq +; +; X32-SSE41-LABEL: sext_4i1_to_4i64: +; X32-SSE41: # %bb.0: +; X32-SSE41-NEXT: pslld $31, %xmm0 +; X32-SSE41-NEXT: psrad $31, %xmm0 +; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm2 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm1 +; X32-SSE41-NEXT: movdqa %xmm2, %xmm0 +; 
X32-SSE41-NEXT: retl + %extmask = sext <4 x i1> %mask to <4 x i64> + ret <4 x i64> %extmask +} + +define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) { +; SSE2-LABEL: sext_4i8_to_4i64: +; SSE2: # %bb.0: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: psrad $24, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: psrad $24, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_4i8_to_4i64: +; SSSE3: # %bb.0: +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: psrad $24, %xmm2 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: psrad $24, %xmm1 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_4i8_to_4i64: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxbq %xmm0, %xmm2 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: pmovsxbq %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: sext_4i8_to_4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovsxbq %xmm0, %xmm1 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxbq %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: sext_4i8_to_4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovsxbq %xmm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: sext_4i8_to_4i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbq %xmm0, %ymm0 +; AVX512-NEXT: retq +; +; X32-SSE41-LABEL: sext_4i8_to_4i64: +; X32-SSE41: # %bb.0: +; X32-SSE41-NEXT: pmovsxbq %xmm0, %xmm2 +; X32-SSE41-NEXT: psrld $16, %xmm0 +; X32-SSE41-NEXT: pmovsxbq %xmm0, %xmm1 +; X32-SSE41-NEXT: movdqa %xmm2, %xmm0 +; X32-SSE41-NEXT: retl + %extmask = sext <4 x i8> %mask to <4 x i64> + ret <4 x i64> %extmask +} + +define <32 x i8> @sext_32xi1_to_32xi8(<32 x i16> %c1, <32 x i16> %c2)nounwind { +; SSE-LABEL: sext_32xi1_to_32xi8: +; SSE: # %bb.0: +; SSE-NEXT: pcmpeqw %xmm5, %xmm1 +; SSE-NEXT: pcmpeqw %xmm4, %xmm0 +; SSE-NEXT: packsswb %xmm1, %xmm0 +; SSE-NEXT: pcmpeqw %xmm7, %xmm3 +; SSE-NEXT: pcmpeqw %xmm6, %xmm2 +; SSE-NEXT: packsswb %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX1-LABEL: sext_32xi1_to_32xi8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vpcmpeqw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpcmpeqw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpacksswb %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpcmpeqw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpcmpeqw %xmm2, 
%xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: sext_32xi1_to_32xi8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: retq +; +; AVX512F-LABEL: sext_32xi1_to_32xi8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: sext_32xi1_to_32xi8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512BW-NEXT: retq +; +; X32-SSE41-LABEL: sext_32xi1_to_32xi8: +; X32-SSE41: # %bb.0: +; X32-SSE41-NEXT: pushl %ebp +; X32-SSE41-NEXT: movl %esp, %ebp +; X32-SSE41-NEXT: andl $-16, %esp +; X32-SSE41-NEXT: subl $16, %esp +; X32-SSE41-NEXT: movdqa 8(%ebp), %xmm3 +; X32-SSE41-NEXT: pcmpeqw 40(%ebp), %xmm1 +; X32-SSE41-NEXT: pcmpeqw 24(%ebp), %xmm0 +; X32-SSE41-NEXT: packsswb %xmm1, %xmm0 +; X32-SSE41-NEXT: pcmpeqw 72(%ebp), %xmm3 +; X32-SSE41-NEXT: pcmpeqw 56(%ebp), %xmm2 +; X32-SSE41-NEXT: packsswb %xmm3, %xmm2 +; X32-SSE41-NEXT: movdqa %xmm2, %xmm1 +; X32-SSE41-NEXT: movl %ebp, %esp +; X32-SSE41-NEXT: popl %ebp +; X32-SSE41-NEXT: retl + %a = icmp eq <32 x i16> %c1, %c2 + %b = sext <32 x i1> %a to <32 x i8> + ret <32 x i8> %b +} + +define <2 x i32> @sext_2i8_to_2i32(<2 x i8>* %addr) { +; SSE2-LABEL: sext_2i8_to_2i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movzwl (%rdi), %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm0 +; SSE2-NEXT: paddd %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_2i8_to_2i32: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movzwl (%rdi), %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: psrad $24, %xmm0 +; SSSE3-NEXT: paddd %xmm0, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_2i8_to_2i32: +; SSE41: # %bb.0: +; SSE41-NEXT: movsbl 1(%rdi), %eax +; SSE41-NEXT: movsbl (%rdi), %ecx +; SSE41-NEXT: movd %ecx, %xmm0 +; SSE41-NEXT: pinsrd $1, %eax, %xmm0 +; SSE41-NEXT: paddd %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: sext_2i8_to_2i32: +; AVX: # %bb.0: +; AVX-NEXT: movsbl 1(%rdi), %eax +; AVX-NEXT: movsbl (%rdi), %ecx +; AVX-NEXT: vmovd %ecx, %xmm0 +; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X32-SSE41-LABEL: sext_2i8_to_2i32: +; X32-SSE41: # %bb.0: +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: movsbl 1(%eax), %ecx 
+; X32-SSE41-NEXT: movsbl (%eax), %eax +; X32-SSE41-NEXT: movd %eax, %xmm0 +; X32-SSE41-NEXT: pinsrd $1, %ecx, %xmm0 +; X32-SSE41-NEXT: paddd %xmm0, %xmm0 +; X32-SSE41-NEXT: retl + %x = load <2 x i8>, <2 x i8>* %addr, align 1 + %y = sext <2 x i8> %x to <2 x i32> + %z = add <2 x i32>%y, %y + ret <2 x i32>%z +} + diff --git a/llvm/test/CodeGen/X86/vector-trunc-math-widen.ll b/llvm/test/CodeGen/X86/vector-trunc-math-widen.ll new file mode 100644 index 000000000000..c5095ae918ac --- /dev/null +++ b/llvm/test/CodeGen/X86/vector-trunc-math-widen.ll @@ -0,0 +1,5690 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-SLOW +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-FAST +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512BW +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512DQ + +; +; add +; + +define <4 x i32> @trunc_add_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { +; SSE-LABEL: trunc_add_v4i64_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: paddq %xmm3, %xmm1 +; SSE-NEXT: paddq %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_add_v4i64_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_add_v4i64_v4i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_add_v4i64_v4i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_add_v4i64_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = add <4 x i64> %a0, %a1 + %2 
= trunc <4 x i64> %1 to <4 x i32> + ret <4 x i32> %2 +} + +define <8 x i16> @trunc_add_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { +; SSE-LABEL: trunc_add_v8i64_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: paddq %xmm6, %xmm2 +; SSE-NEXT: paddq %xmm7, %xmm3 +; SSE-NEXT: paddq %xmm4, %xmm0 +; SSE-NEXT: paddq %xmm5, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_add_v8i64_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7] +; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_add_v8i64_v8i16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpaddq %ymm3, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_add_v8i64_v8i16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpaddq %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_add_v8i64_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = add <8 x 
i64> %a0, %a1 + %2 = trunc <8 x i64> %1 to <8 x i16> + ret <8 x i16> %2 +} + +define <8 x i16> @trunc_add_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { +; SSE-LABEL: trunc_add_v8i32_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: paddd %xmm2, %xmm0 +; SSE-NEXT: paddd %xmm3, %xmm1 +; SSE-NEXT: pslld $16, %xmm1 +; SSE-NEXT: psrad $16, %xmm1 +; SSE-NEXT: pslld $16, %xmm0 +; SSE-NEXT: psrad $16, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_add_v8i32_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_add_v8i32_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_add_v8i32_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = add <8 x i32> %a0, %a1 + %2 = trunc <8 x i32> %1 to <8 x i16> + ret <8 x i16> %2 +} + +define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind { +; SSE-LABEL: trunc_add_v16i64_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm4 +; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm5 +; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSE-NEXT: pand %xmm8, %xmm7 +; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: packuswb %xmm7, %xmm6 +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: packuswb %xmm5, %xmm4 +; SSE-NEXT: packuswb %xmm6, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm4, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_add_v16i64_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm8 +; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm6, %xmm2, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vpaddq %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vpaddq %xmm7, %xmm3, %xmm6 +; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vpaddq %xmm7, %xmm3, %xmm3 +; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = 
[1.2598673968951787E-321,1.2598673968951787E-321] +; AVX1-NEXT: # xmm7 = mem[0,0] +; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 +; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6 +; AVX1-NEXT: vpackusdw %xmm3, %xmm6, %xmm3 +; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 +; AVX1-NEXT: vpackusdw %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm3 +; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm3 +; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_add_v16i64_v16i8: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpaddq %ymm5, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpaddq %ymm4, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpaddq %ymm7, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpaddq %ymm6, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_add_v16i64_v16i8: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpaddq %ymm5, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpaddq %ymm4, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpaddq %ymm7, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpaddq %ymm6, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_add_v16i64_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: 
vpaddq %zmm3, %zmm1, %zmm1 +; AVX512-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = add <16 x i64> %a0, %a1 + %2 = trunc <16 x i64> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <16 x i8> @trunc_add_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind { +; SSE-LABEL: trunc_add_v16i32_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: paddd %xmm4, %xmm0 +; SSE-NEXT: paddd %xmm5, %xmm1 +; SSE-NEXT: paddd %xmm6, %xmm2 +; SSE-NEXT: paddd %xmm7, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_add_v16i32_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43] +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2 +; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_add_v16i32_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_add_v16i32_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = add <16 x i32> %a0, %a1 + %2 = trunc <16 x i32> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <16 x i8> @trunc_add_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind { +; SSE-LABEL: trunc_add_v16i16_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: paddw %xmm2, %xmm0 +; SSE-NEXT: paddw %xmm3, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_add_v16i16_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; 
AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_add_v16i16_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_add_v16i16_v16i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: trunc_add_v16i16_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: trunc_add_v16i16_v16i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq + %1 = add <16 x i16> %a0, %a1 + %2 = trunc <16 x i16> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <8 x i16> @trunc_add_v8i32_v8i16_sext_8i8(<16 x i8> %a0, <8 x i32> %a1) { +; SSE-LABEL: trunc_add_v8i32_v8i16_sext_8i8: +; SSE: # %bb.0: +; SSE-NEXT: pslld $16, %xmm2 +; SSE-NEXT: psrad $16, %xmm2 +; SSE-NEXT: pslld $16, %xmm1 +; SSE-NEXT: psrad $16, %xmm1 +; SSE-NEXT: packssdw %xmm2, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: psraw $8, %xmm0 +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_add_v8i32_v8i16_sext_8i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_add_v8i32_v8i16_sext_8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovsxbw %xmm0, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_add_v8i32_v8i16_sext_8i8: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; AVX512-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512-NEXT: vpmovsxbw %xmm0, %xmm0 +; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> + %2 = sext <8 x i8> %1 to <8 x i32> + %3 = add <8 x i32> %2, %a1 + %4 = trunc <8 x 
i32> %3 to <8 x i16> + ret <8 x i16> %4 +} + +; +; add to constant +; + +define <4 x i32> @trunc_add_const_v4i64_v4i32(<4 x i64> %a0) nounwind { +; SSE-LABEL: trunc_add_const_v4i64_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: paddd {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_add_const_v4i64_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_add_const_v4i64_v4i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_add_const_v4i64_v4i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_add_const_v4i64_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = add <4 x i64> %a0, + %2 = trunc <4 x i64> %1 to <4 x i32> + ret <4 x i32> %2 +} + +define <8 x i16> @trunc_add_const_v8i64_v8i16(<8 x i64> %a0) nounwind { +; SSE-LABEL: trunc_add_const_v8i64_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE-NEXT: paddw {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_add_const_v8i64_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7] +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_add_const_v8i64_v8i16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_add_const_v8i64_v8i16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_add_const_v8i64_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = add <8 x i64> %a0, + %2 = trunc <8 x i64> %1 to <8 x i16> + ret <8 x i16> %2 +} + +define <8 x i16> @trunc_add_const_v8i32_v8i16(<8 x i32> %a0) nounwind { +; SSE-LABEL: trunc_add_const_v8i32_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pslld $16, %xmm1 +; SSE-NEXT: psrad $16, %xmm1 +; SSE-NEXT: pslld $16, %xmm0 +; SSE-NEXT: psrad $16, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: paddw {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_add_const_v8i32_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_add_const_v8i32_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_add_const_v8i32_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = add <8 x i32> %a0, + %2 = trunc <8 x i32> %1 to <8 x i16> + ret <8 x i16> %2 +} + +define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind { +; SSE-LABEL: trunc_add_const_v16i64_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSE-NEXT: pand %xmm8, %xmm7 +; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: packuswb %xmm7, %xmm6 +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: packuswb %xmm5, %xmm4 +; SSE-NEXT: packuswb %xmm6, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm4, %xmm0 +; SSE-NEXT: paddb {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_add_const_v16i64_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [1.2598673968951787E-321,1.2598673968951787E-321] +; 
AVX1-NEXT: # xmm5 = mem[0,0] +; AVX1-NEXT: vandpd %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vandpd %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vandpd %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vandpd %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vandpd %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vandpd %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vandpd %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vandpd %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_add_const_v16i64_v16i8: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-SLOW-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_add_const_v16i64_v16i8: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-FAST-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_add_const_v16i64_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = add <16 x i64> %a0, + %2 = trunc <16 x i64> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind { +; SSE-LABEL: trunc_add_const_v16i32_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: paddb {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_add_const_v16i32_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43] +; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_add_const_v16i32_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_add_const_v16i32_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = add <16 x i32> %a0, + %2 = trunc <16 x i32> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <16 x i8> @trunc_add_const_v16i16_v16i8(<16 x i16> %a0) nounwind { +; SSE-LABEL: trunc_add_const_v16i16_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: paddb {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_add_const_v16i16_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_add_const_v16i16_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: 
retq +; +; AVX512F-LABEL: trunc_add_const_v16i16_v16i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: trunc_add_const_v16i16_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: trunc_add_const_v16i16_v16i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq + %1 = add <16 x i16> %a0, + %2 = trunc <16 x i16> %1 to <16 x i8> + ret <16 x i8> %2 +} + +; +; sub +; + +define <4 x i32> @trunc_sub_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { +; SSE-LABEL: trunc_sub_v4i64_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: psubq %xmm3, %xmm1 +; SSE-NEXT: psubq %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_sub_v4i64_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_sub_v4i64_v4i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_sub_v4i64_v4i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_sub_v4i64_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = sub <4 x i64> %a0, %a1 + %2 = trunc <4 x i64> %1 to <4 x i32> + ret <4 x i32> %2 +} + +define <8 x i16> @trunc_sub_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { +; SSE-LABEL: trunc_sub_v8i64_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: psubq %xmm6, %xmm2 +; SSE-NEXT: psubq %xmm7, %xmm3 +; SSE-NEXT: psubq %xmm4, %xmm0 +; SSE-NEXT: psubq %xmm5, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} 
xmm0 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_sub_v8i64_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7] +; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_sub_v8i64_v8i16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpsubq %ymm3, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpsubq %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_sub_v8i64_v8i16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpsubq %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpsubq %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_sub_v8i64_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = sub <8 x i64> %a0, %a1 + %2 = trunc <8 x i64> %1 to <8 x i16> + ret <8 x i16> %2 +} + +define <8 x i16> @trunc_sub_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { +; SSE-LABEL: trunc_sub_v8i32_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: psubd %xmm2, %xmm0 +; SSE-NEXT: psubd %xmm3, %xmm1 +; SSE-NEXT: pslld $16, %xmm1 +; SSE-NEXT: psrad $16, %xmm1 +; SSE-NEXT: pslld $16, %xmm0 +; SSE-NEXT: psrad $16, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_sub_v8i32_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; 
AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_sub_v8i32_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_sub_v8i32_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = sub <8 x i32> %a0, %a1 + %2 = trunc <8 x i32> %1 to <8 x i16> + ret <8 x i16> %2 +} + +define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind { +; SSE-LABEL: trunc_sub_v16i64_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm4 +; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm5 +; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSE-NEXT: pand %xmm8, %xmm7 +; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: packuswb %xmm7, %xmm6 +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: packuswb %xmm5, %xmm4 +; SSE-NEXT: packuswb %xmm6, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm4, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_sub_v16i64_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsubq %xmm4, %xmm0, %xmm8 +; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm5, %xmm1, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpsubq %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpsubq %xmm6, %xmm2, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vpsubq %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vpsubq %xmm7, %xmm3, %xmm6 +; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vpsubq %xmm7, %xmm3, %xmm3 +; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [1.2598673968951787E-321,1.2598673968951787E-321] +; AVX1-NEXT: # xmm7 = mem[0,0] +; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 +; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6 +; AVX1-NEXT: vpackusdw %xmm3, %xmm6, %xmm3 +; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 +; AVX1-NEXT: vpackusdw %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm3 +; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm3 +; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; 
AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_sub_v16i64_v16i8: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpsubq %ymm5, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpsubq %ymm4, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpsubq %ymm7, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpsubq %ymm6, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_sub_v16i64_v16i8: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpsubq %ymm5, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpsubq %ymm4, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpsubq %ymm7, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpsubq %ymm6, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_sub_v16i64_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsubq %zmm3, %zmm1, %zmm1 +; AVX512-NEXT: vpsubq %zmm2, %zmm0, %zmm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = sub <16 x i64> %a0, %a1 + %2 = trunc <16 x i64> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <16 x i8> @trunc_sub_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind { +; SSE-LABEL: trunc_sub_v16i32_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: psubd %xmm4, %xmm0 +; SSE-NEXT: psubd %xmm5, %xmm1 +; SSE-NEXT: psubd %xmm6, %xmm2 +; SSE-NEXT: psubd %xmm7, %xmm3 +; SSE-NEXT: movdqa 
{{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_sub_v16i32_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43] +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2 +; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_sub_v16i32_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpsubd %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_sub_v16i32_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = sub <16 x i32> %a0, %a1 + %2 = trunc <16 x i32> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <16 x i8> @trunc_sub_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind { +; SSE-LABEL: trunc_sub_v16i16_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: psubw %xmm2, %xmm0 +; SSE-NEXT: psubw %xmm3, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_sub_v16i16_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_sub_v16i16_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_sub_v16i16_v16i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpsubw %ymm1, %ymm0, %ymm0 +; 
AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: trunc_sub_v16i16_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpsubw %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: trunc_sub_v16i16_v16i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpsubw %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq + %1 = sub <16 x i16> %a0, %a1 + %2 = trunc <16 x i16> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <16 x i8> @trunc_ext_sub_v16i16_v16i8(<16 x i8> %x, <16 x i8> %y) { +; SSE-LABEL: trunc_ext_sub_v16i16_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: psubb %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: trunc_ext_sub_v16i16_v16i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %a = zext <16 x i8> %x to <16 x i16> + %b = zext <16 x i8> %y to <16 x i16> + %c = sub <16 x i16> %a, %b + %d = trunc <16 x i16> %c to <16 x i8> + ret <16 x i8> %d +} + +; +; sub to constant +; + +define <4 x i32> @trunc_sub_const_v4i64_v4i32(<4 x i64> %a0) nounwind { +; SSE-LABEL: trunc_sub_const_v4i64_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: movl $1, %eax +; SSE-NEXT: movq %rax, %xmm2 +; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] +; SSE-NEXT: psubq %xmm2, %xmm0 +; SSE-NEXT: psubq {{.*}}(%rip), %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_sub_const_v4i64_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: movl $1, %eax +; AVX1-NEXT: vmovq %rax, %xmm1 +; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[0,2] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_sub_const_v4i64_v4i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_sub_const_v4i64_v4i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_sub_const_v4i64_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = sub 
<4 x i64> %a0, + %2 = trunc <4 x i64> %1 to <4 x i32> + ret <4 x i32> %2 +} + +define <8 x i16> @trunc_sub_const_v8i64_v8i16(<8 x i64> %a0) nounwind { +; SSE-LABEL: trunc_sub_const_v8i64_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: movl $1, %eax +; SSE-NEXT: movq %rax, %xmm4 +; SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7] +; SSE-NEXT: psubq %xmm4, %xmm0 +; SSE-NEXT: psubq {{.*}}(%rip), %xmm1 +; SSE-NEXT: psubq {{.*}}(%rip), %xmm2 +; SSE-NEXT: psubq {{.*}}(%rip), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_sub_const_v8i64_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: movl $1, %eax +; AVX1-NEXT: vmovq %rax, %xmm2 +; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] +; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm1, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7] +; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7] +; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_sub_const_v8i64_v8i16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpsubq {{.*}}(%rip), %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_sub_const_v8i64_v8i16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpsubq {{.*}}(%rip), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; 
AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_sub_const_v8i64_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsubq {{.*}}(%rip), %zmm0, %zmm0 +; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = sub <8 x i64> %a0, + %2 = trunc <8 x i64> %1 to <8 x i16> + ret <8 x i16> %2 +} + +define <8 x i16> @trunc_sub_const_v8i32_v8i16(<8 x i32> %a0) nounwind { +; SSE-LABEL: trunc_sub_const_v8i32_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: psubd {{.*}}(%rip), %xmm0 +; SSE-NEXT: psubd {{.*}}(%rip), %xmm1 +; SSE-NEXT: pslld $16, %xmm1 +; SSE-NEXT: psrad $16, %xmm1 +; SSE-NEXT: pslld $16, %xmm0 +; SSE-NEXT: psrad $16, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_sub_const_v8i32_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_sub_const_v8i32_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsubd {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_sub_const_v8i32_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsubd {{.*}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = sub <8 x i32> %a0, + %2 = trunc <8 x i32> %1 to <8 x i16> + ret <8 x i16> %2 +} + +define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind { +; SSE-LABEL: trunc_sub_const_v16i64_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: movl $1, %eax +; SSE-NEXT: movq %rax, %xmm8 +; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7] +; SSE-NEXT: psubq %xmm8, %xmm0 +; SSE-NEXT: psubq {{.*}}(%rip), %xmm1 +; SSE-NEXT: psubq {{.*}}(%rip), %xmm2 +; SSE-NEXT: psubq {{.*}}(%rip), %xmm3 +; SSE-NEXT: psubq {{.*}}(%rip), %xmm4 +; SSE-NEXT: psubq {{.*}}(%rip), %xmm5 +; SSE-NEXT: psubq {{.*}}(%rip), %xmm6 +; SSE-NEXT: psubq {{.*}}(%rip), %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSE-NEXT: pand %xmm8, %xmm7 +; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: packuswb %xmm7, %xmm6 +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: packuswb %xmm5, %xmm4 +; SSE-NEXT: packuswb %xmm6, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm4, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_sub_const_v16i64_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: movl $1, %eax +; AVX1-NEXT: vmovq %rax, %xmm4 +; AVX1-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7] +; AVX1-NEXT: vpsubq %xmm4, %xmm0, %xmm8 +; AVX1-NEXT: vextractf128 $1, %ymm0, 
%xmm0 +; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm1, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm2, %xmm6 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm3, %xmm7 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm3, %xmm3 +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [1.2598673968951787E-321,1.2598673968951787E-321] +; AVX1-NEXT: # xmm4 = mem[0,0] +; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm7 +; AVX1-NEXT: vpackusdw %xmm3, %xmm7, %xmm3 +; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm4, %xmm6, %xmm6 +; AVX1-NEXT: vpackusdw %xmm2, %xmm6, %xmm2 +; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm3 +; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm4, %xmm8, %xmm3 +; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_sub_const_v16i64_v16i8: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpsubq {{.*}}(%rip), %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpsubq {{.*}}(%rip), %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpsubq {{.*}}(%rip), %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_sub_const_v16i64_v16i8: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpsubq {{.*}}(%rip), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpsubq {{.*}}(%rip), %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpsubq {{.*}}(%rip), %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = 
<0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_sub_const_v16i64_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsubq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512-NEXT: vpsubq {{.*}}(%rip), %zmm0, %zmm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = sub <16 x i64> %a0, + %2 = trunc <16 x i64> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <16 x i8> @trunc_sub_const_v16i32_v16i8(<16 x i32> %a0) nounwind { +; SSE-LABEL: trunc_sub_const_v16i32_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: psubd {{.*}}(%rip), %xmm0 +; SSE-NEXT: psubd {{.*}}(%rip), %xmm1 +; SSE-NEXT: psubd {{.*}}(%rip), %xmm2 +; SSE-NEXT: psubd {{.*}}(%rip), %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_sub_const_v16i32_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm1, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43] +; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_sub_const_v16i32_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsubd {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpsubd {{.*}}(%rip), %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_sub_const_v16i32_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsubd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = sub <16 x i32> %a0, + %2 = trunc <16 x i32> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <16 x i8> @trunc_sub_const_v16i16_v16i8(<16 x i16> %a0) nounwind { +; SSE-LABEL: trunc_sub_const_v16i16_v16i8: +; SSE: # %bb.0: +; 
SSE-NEXT: psubw {{.*}}(%rip), %xmm0 +; SSE-NEXT: psubw {{.*}}(%rip), %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_sub_const_v16i16_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_sub_const_v16i16_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_sub_const_v16i16_v16i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: trunc_sub_const_v16i16_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: trunc_sub_const_v16i16_v16i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq + %1 = sub <16 x i16> %a0, + %2 = trunc <16 x i16> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <16 x i8> @trunc_ext_sub_const_rhs_v16i16_v16i8(<16 x i8> %x) { +; SSE-LABEL: trunc_ext_sub_const_rhs_v16i16_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: psubb {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: trunc_ext_sub_const_rhs_v16i16_v16i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: retq + %a = zext <16 x i8> %x to <16 x i16> + %b = sub <16 x i16> %a, + %c = trunc <16 x i16> %b to <16 x i8> + ret <16 x i8> %c +} + +define <16 x i8> @trunc_ext_sub_const_lhs_v16i16_v16i8(<16 x i8> %x) { +; SSE-LABEL: trunc_ext_sub_const_lhs_v16i16_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; SSE-NEXT: psubb %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: trunc_ext_sub_const_lhs_v16i16_v16i8: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX-NEXT: vpsubb %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq + %a = zext <16 x i8> %x to <16 x i16> + %b = sub <16 x i16> , %a + %c = trunc <16 x i16> %b to <16 x i8> + ret <16 x i8> %c 
+} + +; +; mul +; + +define <4 x i32> @trunc_mul_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { +; SSE-LABEL: trunc_mul_v4i64_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: psrlq $32, %xmm4 +; SSE-NEXT: pmuludq %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: psrlq $32, %xmm5 +; SSE-NEXT: pmuludq %xmm1, %xmm5 +; SSE-NEXT: paddq %xmm4, %xmm5 +; SSE-NEXT: psllq $32, %xmm5 +; SSE-NEXT: pmuludq %xmm3, %xmm1 +; SSE-NEXT: paddq %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: psrlq $32, %xmm3 +; SSE-NEXT: pmuludq %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: psrlq $32, %xmm4 +; SSE-NEXT: pmuludq %xmm0, %xmm4 +; SSE-NEXT: paddq %xmm3, %xmm4 +; SSE-NEXT: psllq $32, %xmm4 +; SSE-NEXT: pmuludq %xmm2, %xmm0 +; SSE-NEXT: paddq %xmm4, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_mul_v4i64_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_mul_v4i64_v4i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_mul_v4i64_v4i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512F-LABEL: trunc_mul_v4i64_v4i32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: trunc_mul_v4i64_v4i32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: trunc_mul_v4i64_v4i32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq + %1 = mul <4 x i64> %a0, %a1 + %2 = trunc <4 x i64> %1 to <4 x i32> + ret <4 x i32> %2 +} + +define <8 x i16> @trunc_mul_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { +; SSE-LABEL: trunc_mul_v8i64_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = 
xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE-NEXT: pmullw %xmm6, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_mul_v8i64_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4],xmm5[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1,2,3],xmm3[4],xmm5[5,6,7] +; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4],xmm5[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1,2,3],xmm2[4],xmm5[5,6,7] +; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1,2,3],xmm3[4],xmm5[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1,2,3],xmm1[4],xmm5[5,6,7] +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1,2,3],xmm3[4],xmm5[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1,2,3],xmm0[4],xmm5[5,6,7] +; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_mul_v8i64_v8i16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_mul_v8i64_v8i16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] +; 
AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512F-LABEL: trunc_mul_v8i64_v8i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpmovqw %zmm1, %xmm1 +; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512F-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: trunc_mul_v8i64_v8i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmovqw %zmm1, %xmm1 +; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: trunc_mul_v8i64_v8i16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq + %1 = mul <8 x i64> %a0, %a1 + %2 = trunc <8 x i64> %1 to <8 x i16> + ret <8 x i16> %2 +} + +define <8 x i16> @trunc_mul_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { +; SSE-LABEL: trunc_mul_v8i32_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE-NEXT: pmuludq %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE-NEXT: pmuludq %xmm4, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSE-NEXT: pmuludq %xmm3, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE-NEXT: pmuludq %xmm2, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: pslld $16, %xmm1 +; SSE-NEXT: psrad $16, %xmm1 +; SSE-NEXT: pslld $16, %xmm0 +; SSE-NEXT: psrad $16, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_mul_v8i32_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_mul_v8i32_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_mul_v8i32_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: # kill: def $xmm0 
killed $xmm0 killed $ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = mul <8 x i32> %a0, %a1 + %2 = trunc <8 x i32> %1 to <8 x i16> + ret <8 x i16> %2 +} + +define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind { +; SSE-LABEL: trunc_mul_v16i64_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: psrlq $32, %xmm9 +; SSE-NEXT: pmuludq %xmm8, %xmm9 +; SSE-NEXT: movdqa %xmm8, %xmm10 +; SSE-NEXT: psrlq $32, %xmm10 +; SSE-NEXT: pmuludq %xmm0, %xmm10 +; SSE-NEXT: paddq %xmm9, %xmm10 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: psllq $32, %xmm10 +; SSE-NEXT: pmuludq %xmm8, %xmm0 +; SSE-NEXT: paddq %xmm10, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: psrlq $32, %xmm8 +; SSE-NEXT: pmuludq %xmm9, %xmm8 +; SSE-NEXT: movdqa %xmm9, %xmm10 +; SSE-NEXT: psrlq $32, %xmm10 +; SSE-NEXT: pmuludq %xmm1, %xmm10 +; SSE-NEXT: paddq %xmm8, %xmm10 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 +; SSE-NEXT: psllq $32, %xmm10 +; SSE-NEXT: pmuludq %xmm9, %xmm1 +; SSE-NEXT: paddq %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: psrlq $32, %xmm9 +; SSE-NEXT: pmuludq %xmm8, %xmm9 +; SSE-NEXT: movdqa %xmm8, %xmm10 +; SSE-NEXT: psrlq $32, %xmm10 +; SSE-NEXT: pmuludq %xmm2, %xmm10 +; SSE-NEXT: paddq %xmm9, %xmm10 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: psllq $32, %xmm10 +; SSE-NEXT: pmuludq %xmm8, %xmm2 +; SSE-NEXT: paddq %xmm10, %xmm2 +; SSE-NEXT: movdqa %xmm3, %xmm8 +; SSE-NEXT: psrlq $32, %xmm8 +; SSE-NEXT: pmuludq %xmm9, %xmm8 +; SSE-NEXT: movdqa %xmm9, %xmm10 +; SSE-NEXT: psrlq $32, %xmm10 +; SSE-NEXT: pmuludq %xmm3, %xmm10 +; SSE-NEXT: paddq %xmm8, %xmm10 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 +; SSE-NEXT: psllq $32, %xmm10 +; SSE-NEXT: pmuludq %xmm9, %xmm3 +; SSE-NEXT: paddq %xmm10, %xmm3 +; SSE-NEXT: movdqa %xmm4, %xmm9 +; SSE-NEXT: psrlq $32, %xmm9 +; SSE-NEXT: pmuludq %xmm8, %xmm9 +; SSE-NEXT: movdqa %xmm8, %xmm10 +; SSE-NEXT: psrlq $32, %xmm10 +; SSE-NEXT: pmuludq %xmm4, %xmm10 +; SSE-NEXT: paddq %xmm9, %xmm10 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: psllq $32, %xmm10 +; SSE-NEXT: pmuludq %xmm8, %xmm4 +; SSE-NEXT: paddq %xmm10, %xmm4 +; SSE-NEXT: movdqa %xmm5, %xmm8 +; SSE-NEXT: psrlq $32, %xmm8 +; SSE-NEXT: pmuludq %xmm9, %xmm8 +; SSE-NEXT: movdqa %xmm9, %xmm10 +; SSE-NEXT: psrlq $32, %xmm10 +; SSE-NEXT: pmuludq %xmm5, %xmm10 +; SSE-NEXT: paddq %xmm8, %xmm10 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 +; SSE-NEXT: psllq $32, %xmm10 +; SSE-NEXT: pmuludq %xmm9, %xmm5 +; SSE-NEXT: paddq %xmm10, %xmm5 +; SSE-NEXT: movdqa %xmm6, %xmm9 +; SSE-NEXT: psrlq $32, %xmm9 +; SSE-NEXT: pmuludq %xmm8, %xmm9 +; SSE-NEXT: movdqa %xmm8, %xmm10 +; SSE-NEXT: psrlq $32, %xmm10 +; SSE-NEXT: pmuludq %xmm6, %xmm10 +; SSE-NEXT: paddq %xmm9, %xmm10 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: psllq $32, %xmm10 +; SSE-NEXT: pmuludq %xmm8, %xmm6 +; SSE-NEXT: paddq %xmm10, %xmm6 +; SSE-NEXT: movdqa %xmm7, %xmm8 +; SSE-NEXT: psrlq $32, %xmm8 +; SSE-NEXT: pmuludq %xmm9, %xmm8 +; SSE-NEXT: movdqa %xmm9, %xmm10 +; SSE-NEXT: psrlq $32, %xmm10 +; SSE-NEXT: pmuludq %xmm7, %xmm10 +; SSE-NEXT: paddq %xmm8, %xmm10 +; SSE-NEXT: pmuludq %xmm9, %xmm7 +; SSE-NEXT: psllq $32, %xmm10 +; SSE-NEXT: paddq %xmm10, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSE-NEXT: pand %xmm8, %xmm7 +; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: packuswb %xmm7, %xmm6 +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: packuswb %xmm5, %xmm4 +; 
SSE-NEXT: packuswb %xmm6, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm4, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_mul_v16i64_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm8 +; AVX1-NEXT: vpmuludq %xmm4, %xmm8, %xmm8 +; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm9 +; AVX1-NEXT: vpmuludq %xmm9, %xmm0, %xmm9 +; AVX1-NEXT: vpaddq %xmm8, %xmm9, %xmm8 +; AVX1-NEXT: vpsllq $32, %xmm8, %xmm8 +; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm9 +; AVX1-NEXT: vpaddq %xmm8, %xmm9, %xmm8 +; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm9 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm4 +; AVX1-NEXT: vpmuludq %xmm9, %xmm4, %xmm10 +; AVX1-NEXT: vpsrlq $32, %xmm9, %xmm4 +; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm4 +; AVX1-NEXT: vpaddq %xmm10, %xmm4, %xmm4 +; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4 +; AVX1-NEXT: vpmuludq %xmm9, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm9 +; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4 +; AVX1-NEXT: vpmuludq %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm0 +; AVX1-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0 +; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm4 +; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm10 +; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm5 +; AVX1-NEXT: vpmuludq %xmm0, %xmm5, %xmm5 +; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm4 +; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm4 +; AVX1-NEXT: vpaddq %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4 +; AVX1-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm1 +; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm0 +; AVX1-NEXT: vpmuludq %xmm6, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlq $32, %xmm6, %xmm4 +; AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm4 +; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0 +; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm4 +; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm4 +; AVX1-NEXT: vpmuludq %xmm0, %xmm4, %xmm4 +; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm6 +; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm6 +; AVX1-NEXT: vpaddq %xmm4, %xmm6, %xmm4 +; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4 +; AVX1-NEXT: vpmuludq %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm2 +; AVX1-NEXT: vpmuludq %xmm7, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlq $32, %xmm7, %xmm4 +; AVX1-NEXT: vpmuludq %xmm4, %xmm3, %xmm4 +; AVX1-NEXT: vpaddq %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 +; AVX1-NEXT: vpmuludq %xmm7, %xmm3, %xmm4 +; AVX1-NEXT: vpaddq %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm6 +; AVX1-NEXT: vpmuludq %xmm4, %xmm6, %xmm6 +; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm7 +; AVX1-NEXT: vpmuludq %xmm7, %xmm3, %xmm7 +; AVX1-NEXT: vpaddq %xmm6, %xmm7, %xmm6 +; AVX1-NEXT: vpsllq $32, %xmm6, %xmm6 +; AVX1-NEXT: vpmuludq %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpaddq %xmm6, %xmm3, %xmm3 +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [1.2598673968951787E-321,1.2598673968951787E-321] +; AVX1-NEXT: # xmm4 = mem[0,0] +; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 
+; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm3 +; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm4, %xmm10, %xmm2 +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpand %xmm4, %xmm9, %xmm2 +; AVX1-NEXT: vpand %xmm4, %xmm8, %xmm3 +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_mul_v16i64_v16i8: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-SLOW-NEXT: vpmulld %xmm7, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpmulld %xmm6, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vpmulld %xmm5, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpmulld %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_mul_v16i64_v16i8: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm8, %ymm7 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm8, %ymm3 +; AVX2-FAST-NEXT: vpmulld %xmm7, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm8, %ymm6 +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm8, %ymm2 +; AVX2-FAST-NEXT: vpmulld %xmm6, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm8, %ymm5 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm8, %ymm1 +; AVX2-FAST-NEXT: vpmulld %xmm5, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpermd 
%ymm4, %ymm8, %ymm4 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vpmulld %xmm4, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512F-LABEL: trunc_mul_v16i64_v16i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpmovqd %zmm3, %ymm3 +; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512F-NEXT: vpmulld %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpmovqd %zmm2, %ymm2 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vpmulld %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: trunc_mul_v16i64_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmovqd %zmm3, %ymm3 +; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512BW-NEXT: vpmulld %ymm3, %ymm1, %ymm1 +; AVX512BW-NEXT: vpmovqd %zmm2, %ymm2 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: vpmulld %ymm2, %ymm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: trunc_mul_v16i64_v16i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmullq %zmm3, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpmullq %zmm2, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq + %1 = mul <16 x i64> %a0, %a1 + %2 = trunc <16 x i64> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <16 x i8> @trunc_mul_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind { +; SSE-LABEL: trunc_mul_v16i32_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3] +; SSE-NEXT: pmuludq %xmm4, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE-NEXT: pmuludq %xmm8, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] +; SSE-NEXT: pmuludq %xmm5, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE-NEXT: pmuludq %xmm4, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; SSE-NEXT: pmuludq %xmm6, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] +; SSE-NEXT: pmuludq %xmm4, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; SSE-NEXT: pmuludq %xmm7, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE-NEXT: pmuludq %xmm4, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm4, 
%xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_mul_v16i32_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43] +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2 +; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_mul_v16i32_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpmulld %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_mul_v16i32_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = mul <16 x i32> %a0, %a1 + %2 = trunc <16 x i32> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <16 x i8> @trunc_mul_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind { +; SSE-LABEL: trunc_mul_v16i16_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: pmullw %xmm2, %xmm0 +; SSE-NEXT: pmullw %xmm3, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_mul_v16i16_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_mul_v16i16_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_mul_v16i16_v16i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: trunc_mul_v16i16_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: trunc_mul_v16i16_v16i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq + %1 = mul <16 x i16> %a0, %a1 + %2 = trunc <16 x i16> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <8 x i16> @trunc_mul_v8i32_v8i16_zext_8i8(<16 x i8> %a0, <8 x i32> %a1) { +; SSE-LABEL: trunc_mul_v8i32_v8i16_zext_8i8: +; SSE: # %bb.0: +; SSE-NEXT: pxor %xmm3, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE-NEXT: pslld $16, %xmm2 +; SSE-NEXT: psrad $16, %xmm2 +; SSE-NEXT: pslld $16, %xmm1 +; SSE-NEXT: psrad $16, %xmm1 +; SSE-NEXT: packssdw %xmm2, %xmm1 +; SSE-NEXT: pmullw %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_mul_v8i32_v8i16_zext_8i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_mul_v8i32_v8i16_zext_8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_mul_v8i32_v8i16_zext_8i8: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; AVX512-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> + %2 = zext <8 x i8> %1 to <8 x i32> + %3 = mul <8 x i32> %2, %a1 + %4 = trunc <8 x i32> %3 to <8 x i16> + ret <8 x i16> %4 +} + +; +; mul to constant +; + +define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind { +; SSE-LABEL: trunc_mul_const_v4i64_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2,3] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; 
SSE-NEXT: pmuludq %xmm2, %xmm3 +; SSE-NEXT: psrlq $32, %xmm1 +; SSE-NEXT: pmuludq %xmm2, %xmm1 +; SSE-NEXT: psllq $32, %xmm1 +; SSE-NEXT: paddq %xmm3, %xmm1 +; SSE-NEXT: movl $1, %eax +; SSE-NEXT: movq %rax, %xmm2 +; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pmuludq %xmm2, %xmm3 +; SSE-NEXT: psrlq $32, %xmm0 +; SSE-NEXT: pmuludq %xmm2, %xmm0 +; SSE-NEXT: psllq $32, %xmm0 +; SSE-NEXT: paddq %xmm3, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_mul_const_v4i64_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_mul_const_v4i64_v4i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_mul_const_v4i64_v4i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_mul_const_v4i64_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = mul <4 x i64> %a0, + %2 = trunc <4 x i64> %1 to <4 x i32> + ret <4 x i32> %2 +} + +define <8 x i16> @trunc_mul_const_v8i64_v8i16(<8 x i64> %a0) nounwind { +; SSE-LABEL: trunc_mul_const_v8i64_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_mul_const_v8i64_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7] +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_mul_const_v8i64_v8i16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: 
vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_mul_const_v8i64_v8i16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_mul_const_v8i64_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = mul <8 x i64> %a0, + %2 = trunc <8 x i64> %1 to <8 x i16> + ret <8 x i16> %2 +} + +define <8 x i16> @trunc_mul_const_v8i32_v8i16(<8 x i32> %a0) nounwind { +; SSE-LABEL: trunc_mul_const_v8i32_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pslld $16, %xmm1 +; SSE-NEXT: psrad $16, %xmm1 +; SSE-NEXT: pslld $16, %xmm0 +; SSE-NEXT: psrad $16, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_mul_const_v8i32_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_mul_const_v8i32_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_mul_const_v8i32_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = mul <8 x i32> %a0, + %2 = trunc <8 x i32> %1 to <8 x i16> + ret <8 x i16> %2 +} + +define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind { +; SSE-LABEL: trunc_mul_const_v16i64_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: movl $1, %eax +; SSE-NEXT: movq %rax, %xmm8 +; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: pmuludq %xmm8, %xmm9 +; SSE-NEXT: psrlq $32, %xmm0 +; SSE-NEXT: pmuludq %xmm8, %xmm0 +; SSE-NEXT: psllq $32, %xmm0 +; SSE-NEXT: paddq %xmm9, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [2,3] +; SSE-NEXT: movdqa %xmm1, %xmm9 +; SSE-NEXT: pmuludq %xmm8, %xmm9 +; SSE-NEXT: psrlq $32, %xmm1 +; SSE-NEXT: pmuludq %xmm8, %xmm1 +; SSE-NEXT: psllq $32, %xmm1 +; SSE-NEXT: paddq %xmm9, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = 
[4,5] +; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: pmuludq %xmm8, %xmm9 +; SSE-NEXT: psrlq $32, %xmm2 +; SSE-NEXT: pmuludq %xmm8, %xmm2 +; SSE-NEXT: psllq $32, %xmm2 +; SSE-NEXT: paddq %xmm9, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [6,7] +; SSE-NEXT: movdqa %xmm3, %xmm9 +; SSE-NEXT: pmuludq %xmm8, %xmm9 +; SSE-NEXT: psrlq $32, %xmm3 +; SSE-NEXT: pmuludq %xmm8, %xmm3 +; SSE-NEXT: psllq $32, %xmm3 +; SSE-NEXT: paddq %xmm9, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [8,9] +; SSE-NEXT: movdqa %xmm4, %xmm9 +; SSE-NEXT: pmuludq %xmm8, %xmm9 +; SSE-NEXT: psrlq $32, %xmm4 +; SSE-NEXT: pmuludq %xmm8, %xmm4 +; SSE-NEXT: psllq $32, %xmm4 +; SSE-NEXT: paddq %xmm9, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [10,11] +; SSE-NEXT: movdqa %xmm5, %xmm9 +; SSE-NEXT: pmuludq %xmm8, %xmm9 +; SSE-NEXT: psrlq $32, %xmm5 +; SSE-NEXT: pmuludq %xmm8, %xmm5 +; SSE-NEXT: psllq $32, %xmm5 +; SSE-NEXT: paddq %xmm9, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [12,13] +; SSE-NEXT: movdqa %xmm6, %xmm9 +; SSE-NEXT: pmuludq %xmm8, %xmm9 +; SSE-NEXT: psrlq $32, %xmm6 +; SSE-NEXT: pmuludq %xmm8, %xmm6 +; SSE-NEXT: psllq $32, %xmm6 +; SSE-NEXT: paddq %xmm9, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [14,15] +; SSE-NEXT: movdqa %xmm7, %xmm9 +; SSE-NEXT: pmuludq %xmm8, %xmm9 +; SSE-NEXT: psrlq $32, %xmm7 +; SSE-NEXT: pmuludq %xmm8, %xmm7 +; SSE-NEXT: psllq $32, %xmm7 +; SSE-NEXT: paddq %xmm9, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSE-NEXT: pand %xmm8, %xmm7 +; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: packuswb %xmm7, %xmm6 +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: packuswb %xmm5, %xmm4 +; SSE-NEXT: packuswb %xmm6, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm4, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_mul_const_v16i64_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: movl $1, %eax +; AVX1-NEXT: vmovq %rax, %xmm4 +; AVX1-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7] +; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm5 +; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm6 +; AVX1-NEXT: vpmuludq %xmm4, %xmm6, %xmm4 +; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4 +; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm8 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [2,3] +; AVX1-NEXT: vpmuludq %xmm5, %xmm0, %xmm6 +; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX1-NEXT: vpmuludq %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm0, %xmm6, %xmm9 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5] +; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm6 +; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm7 +; AVX1-NEXT: vpmuludq %xmm5, %xmm7, %xmm5 +; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5 +; AVX1-NEXT: vpaddq %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [6,7] +; AVX1-NEXT: vpmuludq %xmm6, %xmm1, %xmm7 +; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1 +; AVX1-NEXT: vpmuludq %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm1, %xmm7, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9] +; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm7 +; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm4 +; AVX1-NEXT: vpmuludq %xmm6, %xmm4, %xmm4 +; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4 +; AVX1-NEXT: vpaddq %xmm4, %xmm7, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} 
xmm6 = [10,11] +; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm7 +; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2 +; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 +; AVX1-NEXT: vpaddq %xmm2, %xmm7, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [12,13] +; AVX1-NEXT: vpmuludq %xmm6, %xmm3, %xmm7 +; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm0 +; AVX1-NEXT: vpmuludq %xmm6, %xmm0, %xmm0 +; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm0, %xmm7, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [14,15] +; AVX1-NEXT: vpmuludq %xmm6, %xmm3, %xmm7 +; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm3 +; AVX1-NEXT: vpmuludq %xmm6, %xmm3, %xmm3 +; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3 +; AVX1-NEXT: vpaddq %xmm3, %xmm7, %xmm3 +; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [1.2598673968951787E-321,1.2598673968951787E-321] +; AVX1-NEXT: # xmm6 = mem[0,0] +; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 +; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm3 +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm2 +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpand %xmm6, %xmm9, %xmm2 +; AVX1-NEXT: vpand %xmm6, %xmm8, %xmm3 +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_mul_const_v16i64_v16i8: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_mul_const_v16i64_v16i8: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpmulld {{.*}}(%rip), %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vpmulld {{.*}}(%rip), %xmm3, %xmm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = 
[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_mul_const_v16i64_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = mul <16 x i64> %a0, + %2 = trunc <16 x i64> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind { +; SSE-LABEL: trunc_mul_const_v16i32_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSE-NEXT: pmuludq %xmm4, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE-NEXT: pmuludq %xmm5, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] +; SSE-NEXT: pmuludq %xmm4, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE-NEXT: pmuludq %xmm5, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [8,9,10,11] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] +; SSE-NEXT: pmuludq %xmm4, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE-NEXT: pmuludq %xmm5, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [12,13,14,15] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] +; SSE-NEXT: pmuludq %xmm4, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE-NEXT: pmuludq %xmm5, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_mul_const_v16i32_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpmulld 
{{.*}}(%rip), %xmm1, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43] +; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_mul_const_v16i32_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_mul_const_v16i32_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmulld {{.*}}(%rip), %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = mul <16 x i32> %a0, + %2 = trunc <16 x i32> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <16 x i8> @trunc_mul_const_v16i16_v16i8(<16 x i16> %a0) nounwind { +; SSE-LABEL: trunc_mul_const_v16i16_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0 +; SSE-NEXT: pmullw {{.*}}(%rip), %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_mul_const_v16i16_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_mul_const_v16i16_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_mul_const_v16i16_v16i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: trunc_mul_const_v16i16_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def 
$xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: trunc_mul_const_v16i16_v16i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq + %1 = mul <16 x i16> %a0, + %2 = trunc <16 x i16> %1 to <16 x i8> + ret <16 x i8> %2 +} + +; +; and +; + +define <4 x i32> @trunc_and_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { +; SSE-LABEL: trunc_and_v4i64_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: andps %xmm3, %xmm1 +; SSE-NEXT: andps %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_and_v4i64_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_and_v4i64_v4i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_and_v4i64_v4i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_and_v4i64_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = and <4 x i64> %a0, %a1 + %2 = trunc <4 x i64> %1 to <4 x i32> + ret <4 x i32> %2 +} + +define <8 x i16> @trunc_and_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { +; SSE-LABEL: trunc_and_v8i64_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_and_v8i64_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7] +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, 
%xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_and_v8i64_v8i16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_and_v8i64_v8i16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_and_v8i64_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = and <8 x i64> %a0, %a1 + %2 = trunc <8 x i64> %1 to <8 x i16> + ret <8 x i16> %2 +} + +define <8 x i16> @trunc_and_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { +; SSE-LABEL: trunc_and_v8i32_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pslld $16, %xmm1 +; SSE-NEXT: psrad $16, %xmm1 +; SSE-NEXT: pslld $16, %xmm0 +; SSE-NEXT: psrad $16, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_and_v8i32_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_and_v8i32_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_and_v8i32_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = and <8 x i32> %a0, %a1 + %2 = trunc <8 x i32> %1 to <8 x i16> + 
ret <8 x i16> %2 +} + +define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind { +; SSE-LABEL: trunc_and_v16i64_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm4 +; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm5 +; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSE-NEXT: pand %xmm8, %xmm7 +; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: packuswb %xmm7, %xmm6 +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: packuswb %xmm5, %xmm4 +; SSE-NEXT: packuswb %xmm6, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm4, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_and_v16i64_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vandpd %ymm4, %ymm0, %ymm0 +; AVX1-NEXT: vandpd %ymm5, %ymm1, %ymm1 +; AVX1-NEXT: vandpd %ymm6, %ymm2, %ymm2 +; AVX1-NEXT: vandpd %ymm7, %ymm3, %ymm3 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [1.2598673968951787E-321,1.2598673968951787E-321] +; AVX1-NEXT: # xmm5 = mem[0,0] +; AVX1-NEXT: vandpd %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vandpd %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vandpd %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vandpd %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vandpd %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vandpd %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vandpd %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vandpd %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_and_v16i64_v16i8: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpand %ymm5, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpand %ymm7, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpand %ymm6, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; 
AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_and_v16i64_v16i8: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpand %ymm5, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpand %ymm7, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpand %ymm6, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_and_v16i64_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpandq %zmm3, %zmm1, %zmm1 +; AVX512-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = and <16 x i64> %a0, %a1 + %2 = trunc <16 x i64> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <16 x i8> @trunc_and_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind { +; SSE-LABEL: trunc_and_v16i32_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE-NEXT: pand %xmm8, %xmm7 +; SSE-NEXT: pand %xmm3, %xmm7 +; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: pand %xmm2, %xmm6 +; SSE-NEXT: packuswb %xmm7, %xmm6 +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: pand %xmm1, %xmm5 +; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: packuswb %xmm5, %xmm0 +; SSE-NEXT: packuswb %xmm6, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_and_v16i32_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43] +; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_and_v16i32_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: 
vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_and_v16i32_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = and <16 x i32> %a0, %a1 + %2 = trunc <16 x i32> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <16 x i8> @trunc_and_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind { +; SSE-LABEL: trunc_and_v16i16_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm3, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_and_v16i16_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_and_v16i16_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_and_v16i16_v16i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: trunc_and_v16i16_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: trunc_and_v16i16_v16i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq + %1 = and <16 x i16> %a0, %a1 + %2 = trunc <16 x i16> %1 to <16 x i8> + ret <16 x i8> %2 +} + +; +; and to constant +; + +define <4 x i32> @trunc_and_const_v4i64_v4i32(<4 x i64> %a0) nounwind { +; SSE-LABEL: trunc_and_const_v4i64_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: andps {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_and_const_v4i64_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: 
vandps {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_and_const_v4i64_v4i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_and_const_v4i64_v4i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_and_const_v4i64_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = and <4 x i64> %a0, + %2 = trunc <4 x i64> %1 to <4 x i32> + ret <4 x i32> %2 +} + +define <8 x i16> @trunc_and_const_v8i64_v8i16(<8 x i64> %a0) nounwind { +; SSE-LABEL: trunc_and_const_v8i64_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE-NEXT: andpd {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_and_const_v8i64_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7] +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_and_const_v8i64_v8i16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_and_const_v8i64_v8i16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb 
{{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_and_const_v8i64_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = and <8 x i64> %a0, + %2 = trunc <8 x i64> %1 to <8 x i16> + ret <8 x i16> %2 +} + +define <8 x i16> @trunc_and_const_v8i32_v8i16(<8 x i32> %a0) nounwind { +; SSE-LABEL: trunc_and_const_v8i32_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pslld $16, %xmm1 +; SSE-NEXT: psrad $16, %xmm1 +; SSE-NEXT: pslld $16, %xmm0 +; SSE-NEXT: psrad $16, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_and_const_v8i32_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_and_const_v8i32_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_and_const_v8i32_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = and <8 x i32> %a0, + %2 = trunc <8 x i32> %1 to <8 x i16> + ret <8 x i16> %2 +} + +define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind { +; SSE-LABEL: trunc_and_const_v16i64_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSE-NEXT: pand %xmm8, %xmm7 +; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: packuswb %xmm7, %xmm6 +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: packuswb %xmm5, %xmm4 +; SSE-NEXT: packuswb %xmm6, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm4, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_and_const_v16i64_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [1.2598673968951787E-321,1.2598673968951787E-321] +; AVX1-NEXT: # xmm5 = mem[0,0] +; AVX1-NEXT: vandpd %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vandpd %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vandpd %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vandpd %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vandpd %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vandpd %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, 
%ymm0, %xmm3 +; AVX1-NEXT: vandpd %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vandpd %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_and_const_v16i64_v16i8: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-SLOW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_and_const_v16i64_v16i8: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-FAST-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_and_const_v16i64_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = and <16 x i64> %a0, + %2 = trunc <16 x i64> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind { +; SSE-LABEL: trunc_and_const_v16i32_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: pand %xmm4, %xmm1 +; 
SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_and_const_v16i32_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43] +; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_and_const_v16i32_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_and_const_v16i32_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = and <16 x i32> %a0, + %2 = trunc <16 x i32> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <16 x i8> @trunc_and_const_v16i16_v16i8(<16 x i16> %a0) nounwind { +; SSE-LABEL: trunc_and_const_v16i16_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_and_const_v16i16_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_and_const_v16i16_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_and_const_v16i16_v16i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: trunc_and_const_v16i16_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $ymm0 
killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: trunc_and_const_v16i16_v16i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq + %1 = and <16 x i16> %a0, + %2 = trunc <16 x i16> %1 to <16 x i8> + ret <16 x i8> %2 +} + +; +; xor +; + +define <4 x i32> @trunc_xor_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { +; SSE-LABEL: trunc_xor_v4i64_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: xorps %xmm3, %xmm1 +; SSE-NEXT: xorps %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_xor_v4i64_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_xor_v4i64_v4i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vxorps %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_xor_v4i64_v4i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vxorps %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_xor_v4i64_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = xor <4 x i64> %a0, %a1 + %2 = trunc <4 x i64> %1 to <4 x i32> + ret <4 x i32> %2 +} + +define <8 x i16> @trunc_xor_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { +; SSE-LABEL: trunc_xor_v8i64_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pxor %xmm6, %xmm2 +; SSE-NEXT: pxor %xmm7, %xmm3 +; SSE-NEXT: pxor %xmm4, %xmm0 +; SSE-NEXT: pxor %xmm5, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_xor_v8i64_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7] +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_xor_v8i64_v8i16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_xor_v8i64_v8i16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpxor %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_xor_v8i64_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = xor <8 x i64> %a0, %a1 + %2 = trunc <8 x i64> %1 to <8 x i16> + ret <8 x i16> %2 +} + +define <8 x i16> @trunc_xor_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { +; SSE-LABEL: trunc_xor_v8i32_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pxor %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm3, %xmm1 +; SSE-NEXT: pslld $16, %xmm1 +; SSE-NEXT: psrad $16, %xmm1 +; SSE-NEXT: pslld $16, %xmm0 +; SSE-NEXT: psrad $16, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_xor_v8i32_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_xor_v8i32_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_xor_v8i32_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: 
retq + %1 = xor <8 x i32> %a0, %a1 + %2 = trunc <8 x i32> %1 to <8 x i16> + ret <8 x i16> %2 +} + +define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind { +; SSE-LABEL: trunc_xor_v16i64_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm4 +; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm5 +; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSE-NEXT: pand %xmm8, %xmm7 +; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: packuswb %xmm7, %xmm6 +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: packuswb %xmm5, %xmm4 +; SSE-NEXT: packuswb %xmm6, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm4, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_xor_v16i64_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorpd %ymm4, %ymm0, %ymm0 +; AVX1-NEXT: vxorpd %ymm5, %ymm1, %ymm1 +; AVX1-NEXT: vxorpd %ymm6, %ymm2, %ymm2 +; AVX1-NEXT: vxorpd %ymm7, %ymm3, %ymm3 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [1.2598673968951787E-321,1.2598673968951787E-321] +; AVX1-NEXT: # xmm5 = mem[0,0] +; AVX1-NEXT: vandpd %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vandpd %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vandpd %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vandpd %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vandpd %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vandpd %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vandpd %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vandpd %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_xor_v16i64_v16i8: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpxor %ymm5, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpxor %ymm4, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpxor %ymm7, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpxor %ymm6, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, 
%ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_xor_v16i64_v16i8: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpxor %ymm5, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpxor %ymm4, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpxor %ymm7, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpxor %ymm6, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_xor_v16i64_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxorq %zmm3, %zmm1, %zmm1 +; AVX512-NEXT: vpxorq %zmm2, %zmm0, %zmm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = xor <16 x i64> %a0, %a1 + %2 = trunc <16 x i64> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <16 x i8> @trunc_xor_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind { +; SSE-LABEL: trunc_xor_v16i32_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: pxor %xmm4, %xmm0 +; SSE-NEXT: pxor %xmm5, %xmm1 +; SSE-NEXT: pxor %xmm6, %xmm2 +; SSE-NEXT: pxor %xmm7, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_xor_v16i32_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43] +; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_xor_v16i32_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; 
AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_xor_v16i32_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = xor <16 x i32> %a0, %a1 + %2 = trunc <16 x i32> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <16 x i8> @trunc_xor_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind { +; SSE-LABEL: trunc_xor_v16i16_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: pxor %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm3, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_xor_v16i16_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_xor_v16i16_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_xor_v16i16_v16i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: trunc_xor_v16i16_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: trunc_xor_v16i16_v16i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq + %1 = xor <16 x i16> %a0, %a1 + %2 = trunc <16 x i16> %1 to <16 x i8> + ret <16 x i8> %2 +} + +; +; xor to constant +; + +define <4 x i32> @trunc_xor_const_v4i64_v4i32(<4 x i64> %a0) nounwind { +; SSE-LABEL: trunc_xor_const_v4i64_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: xorps {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_xor_const_v4i64_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; 
AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_xor_const_v4i64_v4i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_xor_const_v4i64_v4i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_xor_const_v4i64_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = xor <4 x i64> %a0, + %2 = trunc <4 x i64> %1 to <4 x i32> + ret <4 x i32> %2 +} + +define <8 x i16> @trunc_xor_const_v8i64_v8i16(<8 x i64> %a0) nounwind { +; SSE-LABEL: trunc_xor_const_v8i64_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE-NEXT: xorpd {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_xor_const_v8i64_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7] +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_xor_const_v8i64_v8i16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_xor_const_v8i64_v8i16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; 
AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_xor_const_v8i64_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = xor <8 x i64> %a0, + %2 = trunc <8 x i64> %1 to <8 x i16> + ret <8 x i16> %2 +} + +define <8 x i16> @trunc_xor_const_v8i32_v8i16(<8 x i32> %a0) nounwind { +; SSE-LABEL: trunc_xor_const_v8i32_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pslld $16, %xmm1 +; SSE-NEXT: psrad $16, %xmm1 +; SSE-NEXT: pslld $16, %xmm0 +; SSE-NEXT: psrad $16, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: pxor {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_xor_const_v8i32_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_xor_const_v8i32_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_xor_const_v8i32_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = xor <8 x i32> %a0, + %2 = trunc <8 x i32> %1 to <8 x i16> + ret <8 x i16> %2 +} + +define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind { +; SSE-LABEL: trunc_xor_const_v16i64_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSE-NEXT: pand %xmm8, %xmm7 +; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: packuswb %xmm7, %xmm6 +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: packuswb %xmm5, %xmm4 +; SSE-NEXT: packuswb %xmm6, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm4, %xmm0 +; SSE-NEXT: pxor {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_xor_const_v16i64_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [1.2598673968951787E-321,1.2598673968951787E-321] +; AVX1-NEXT: # xmm5 = mem[0,0] +; AVX1-NEXT: vandpd %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vandpd %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vandpd %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vandpd %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vandpd %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vandpd %xmm5, %xmm1, %xmm1 
+; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vandpd %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vandpd %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_xor_const_v16i64_v16i8: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-SLOW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_xor_const_v16i64_v16i8: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-FAST-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_xor_const_v16i64_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = xor <16 x i64> %a0, + %2 = trunc <16 x i64> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind { +; SSE-LABEL: trunc_xor_const_v16i32_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: pand %xmm4, 
%xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: pxor {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_xor_const_v16i32_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43] +; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_xor_const_v16i32_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_xor_const_v16i32_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = xor <16 x i32> %a0, + %2 = trunc <16 x i32> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <16 x i8> @trunc_xor_const_v16i16_v16i8(<16 x i16> %a0) nounwind { +; SSE-LABEL: trunc_xor_const_v16i16_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: pxor {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_xor_const_v16i16_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_xor_const_v16i16_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_xor_const_v16i16_v16i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: 
trunc_xor_const_v16i16_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: trunc_xor_const_v16i16_v16i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq + %1 = xor <16 x i16> %a0, + %2 = trunc <16 x i16> %1 to <16 x i8> + ret <16 x i8> %2 +} + +; +; or +; + +define <4 x i32> @trunc_or_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { +; SSE-LABEL: trunc_or_v4i64_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: orps %xmm3, %xmm1 +; SSE-NEXT: orps %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_or_v4i64_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_or_v4i64_v4i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_or_v4i64_v4i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_or_v4i64_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = or <4 x i64> %a0, %a1 + %2 = trunc <4 x i64> %1 to <4 x i32> + ret <4 x i32> %2 +} + +define <8 x i16> @trunc_or_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { +; SSE-LABEL: trunc_or_v8i64_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: por %xmm6, %xmm2 +; SSE-NEXT: por %xmm7, %xmm3 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: por %xmm5, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_or_v8i64_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = 
xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7] +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_or_v8i64_v8i16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpor %ymm3, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_or_v8i64_v8i16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpor %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_or_v8i64_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = or <8 x i64> %a0, %a1 + %2 = trunc <8 x i64> %1 to <8 x i16> + ret <8 x i16> %2 +} + +define <8 x i16> @trunc_or_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { +; SSE-LABEL: trunc_or_v8i32_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: pslld $16, %xmm1 +; SSE-NEXT: psrad $16, %xmm1 +; SSE-NEXT: pslld $16, %xmm0 +; SSE-NEXT: psrad $16, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_or_v8i32_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_or_v8i32_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_or_v8i32_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: # kill: def $xmm0 killed 
$xmm0 killed $ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = or <8 x i32> %a0, %a1 + %2 = trunc <8 x i32> %1 to <8 x i16> + ret <8 x i16> %2 +} + +define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind { +; SSE-LABEL: trunc_or_v16i64_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm4 +; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm5 +; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSE-NEXT: pand %xmm8, %xmm7 +; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: packuswb %xmm7, %xmm6 +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: packuswb %xmm5, %xmm4 +; SSE-NEXT: packuswb %xmm6, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm4, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_or_v16i64_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vorpd %ymm4, %ymm0, %ymm0 +; AVX1-NEXT: vorpd %ymm5, %ymm1, %ymm1 +; AVX1-NEXT: vorpd %ymm6, %ymm2, %ymm2 +; AVX1-NEXT: vorpd %ymm7, %ymm3, %ymm3 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [1.2598673968951787E-321,1.2598673968951787E-321] +; AVX1-NEXT: # xmm5 = mem[0,0] +; AVX1-NEXT: vandpd %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vandpd %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vandpd %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vandpd %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vandpd %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vandpd %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vandpd %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vandpd %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_or_v16i64_v16i8: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpor %ymm5, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpor %ymm4, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpor %ymm7, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpor %ymm6, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_or_v16i64_v16i8: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpor %ymm5, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpor %ymm4, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpor %ymm7, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpor %ymm6, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_or_v16i64_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vporq %zmm3, %zmm1, %zmm1 +; AVX512-NEXT: vporq %zmm2, %zmm0, %zmm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = or <16 x i64> %a0, %a1 + %2 = trunc <16 x i64> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <16 x i8> @trunc_or_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind { +; SSE-LABEL: trunc_or_v16i32_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: por %xmm5, %xmm1 +; SSE-NEXT: por %xmm6, %xmm2 +; SSE-NEXT: por %xmm7, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_or_v16i32_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43] +; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_or_v16i32_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpshufb %ymm2, 
%ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_or_v16i32_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = or <16 x i32> %a0, %a1 + %2 = trunc <16 x i32> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <16 x i8> @trunc_or_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind { +; SSE-LABEL: trunc_or_v16i16_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_or_v16i16_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_or_v16i16_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_or_v16i16_v16i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: trunc_or_v16i16_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: trunc_or_v16i16_v16i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq + %1 = or <16 x i16> %a0, %a1 + %2 = trunc <16 x i16> %1 to <16 x i8> + ret <16 x i8> %2 +} + +; +; or to constant +; + +define <4 x i32> @trunc_or_const_v4i64_v4i32(<4 x i64> %a0) nounwind { +; SSE-LABEL: trunc_or_const_v4i64_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: orps {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_or_const_v4i64_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: 
vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vorps {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_or_const_v4i64_v4i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vorps {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_or_const_v4i64_v4i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vorps {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_or_const_v4i64_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = or <4 x i64> %a0, + %2 = trunc <4 x i64> %1 to <4 x i32> + ret <4 x i32> %2 +} + +define <8 x i16> @trunc_or_const_v8i64_v8i16(<8 x i64> %a0) nounwind { +; SSE-LABEL: trunc_or_const_v8i64_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE-NEXT: orpd {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_or_const_v8i64_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7] +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_or_const_v8i64_v8i16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_or_const_v8i64_v8i16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, 
%xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_or_const_v8i64_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = or <8 x i64> %a0, + %2 = trunc <8 x i64> %1 to <8 x i16> + ret <8 x i16> %2 +} + +define <8 x i16> @trunc_or_const_v8i32_v8i16(<8 x i32> %a0) nounwind { +; SSE-LABEL: trunc_or_const_v8i32_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pslld $16, %xmm1 +; SSE-NEXT: psrad $16, %xmm1 +; SSE-NEXT: pslld $16, %xmm0 +; SSE-NEXT: psrad $16, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: por {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_or_const_v8i32_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_or_const_v8i32_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_or_const_v8i32_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = or <8 x i32> %a0, + %2 = trunc <8 x i32> %1 to <8 x i16> + ret <8 x i16> %2 +} + +define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind { +; SSE-LABEL: trunc_or_const_v16i64_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSE-NEXT: pand %xmm8, %xmm7 +; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: packuswb %xmm7, %xmm6 +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: packuswb %xmm5, %xmm4 +; SSE-NEXT: packuswb %xmm6, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm4, %xmm0 +; SSE-NEXT: por {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_or_const_v16i64_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [1.2598673968951787E-321,1.2598673968951787E-321] +; AVX1-NEXT: # xmm5 = mem[0,0] +; AVX1-NEXT: vandpd %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vandpd %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vandpd %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vandpd %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vandpd %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vandpd %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; 
AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vandpd %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vandpd %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_or_const_v16i64_v16i8: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-SLOW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_or_const_v16i64_v16i8: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-FAST-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_or_const_v16i64_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = or <16 x i64> %a0, + %2 = trunc <16 x i64> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind { +; SSE-LABEL: trunc_or_const_v16i32_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: pand 
%xmm4, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: por {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_or_const_v16i32_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43] +; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_or_const_v16i32_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_or_const_v16i32_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = or <16 x i32> %a0, + %2 = trunc <16 x i32> %1 to <16 x i8> + ret <16 x i8> %2 +} + +define <16 x i8> @trunc_or_const_v16i16_v16i8(<16 x i16> %a0) nounwind { +; SSE-LABEL: trunc_or_const_v16i16_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: por {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_or_const_v16i16_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_or_const_v16i16_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_or_const_v16i16_v16i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: trunc_or_const_v16i16_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $ymm0 
killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: trunc_or_const_v16i16_v16i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq + %1 = or <16 x i16> %a0, + %2 = trunc <16 x i16> %1 to <16 x i8> + ret <16 x i8> %2 +} + +; +; complex patterns - often created by vectorizer +; + +define <4 x i32> @mul_add_const_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind { +; SSE-LABEL: mul_add_const_v4i64_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,1,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] +; SSE-NEXT: pmuludq %xmm1, %xmm3 +; SSE-NEXT: pxor %xmm0, %xmm0 +; SSE-NEXT: pmuludq %xmm0, %xmm1 +; SSE-NEXT: psllq $32, %xmm1 +; SSE-NEXT: paddq %xmm3, %xmm1 +; SSE-NEXT: pmuludq %xmm4, %xmm2 +; SSE-NEXT: pmuludq %xmm4, %xmm0 +; SSE-NEXT: psllq $32, %xmm0 +; SSE-NEXT: paddq %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: paddd {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: mul_add_const_v4i64_v4i32: +; AVX: # %bb.0: +; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: retq + %1 = sext <4 x i32> %a0 to <4 x i64> + %2 = sext <4 x i32> %a1 to <4 x i64> + %3 = mul <4 x i64> %1, %2 + %4 = add <4 x i64> %3, + %5 = trunc <4 x i64> %4 to <4 x i32> + ret <4 x i32> %5 +} + +define <4 x i32> @mul_add_self_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind { +; SSE-LABEL: mul_add_self_v4i64_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: psrad $31, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: psrad $31, %xmm6 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: psrad $31, %xmm5 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: psrad $31, %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] +; SSE-NEXT: pxor %xmm8, %xmm8 +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1] +; SSE-NEXT: pmuludq %xmm1, %xmm6 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; SSE-NEXT: pmuludq %xmm0, %xmm7 +; SSE-NEXT: paddq %xmm6, %xmm7 +; SSE-NEXT: psllq $32, %xmm7 +; SSE-NEXT: pmuludq %xmm0, %xmm1 +; SSE-NEXT: paddq %xmm7, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] +; SSE-NEXT: pmuludq %xmm4, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1] +; SSE-NEXT: pmuludq %xmm2, %xmm5 +; SSE-NEXT: paddq %xmm3, %xmm5 +; SSE-NEXT: psllq $32, %xmm5 +; SSE-NEXT: pmuludq %xmm2, %xmm4 +; SSE-NEXT: paddq %xmm5, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2] +; SSE-NEXT: paddd %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: 
mul_add_self_v4i64_v4i32: +; AVX: # %bb.0: +; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: retq + %1 = sext <4 x i32> %a0 to <4 x i64> + %2 = sext <4 x i32> %a1 to <4 x i64> + %3 = mul <4 x i64> %1, %2 + %4 = add <4 x i64> %3, %3 + %5 = trunc <4 x i64> %4 to <4 x i32> + ret <4 x i32> %5 +} + +define <4 x i32> @mul_add_multiuse_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind { +; SSE-LABEL: mul_add_multiuse_v4i64_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,1,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] +; SSE-NEXT: pmuludq %xmm1, %xmm3 +; SSE-NEXT: pxor %xmm5, %xmm5 +; SSE-NEXT: pmuludq %xmm5, %xmm1 +; SSE-NEXT: psllq $32, %xmm1 +; SSE-NEXT: paddq %xmm3, %xmm1 +; SSE-NEXT: pmuludq %xmm4, %xmm2 +; SSE-NEXT: pmuludq %xmm4, %xmm5 +; SSE-NEXT: psllq $32, %xmm5 +; SSE-NEXT: paddq %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm1[0,2] +; SSE-NEXT: paddd %xmm5, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: mul_add_multiuse_v4i64_v4i32: +; AVX: # %bb.0: +; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %1 = sext <4 x i32> %a0 to <4 x i64> + %2 = sext <4 x i32> %a1 to <4 x i64> + %3 = mul <4 x i64> %1, %2 + %4 = add <4 x i64> %1, %3 + %5 = trunc <4 x i64> %4 to <4 x i32> + ret <4 x i32> %5 +} diff --git a/llvm/test/CodeGen/X86/vector-trunc-packus-widen.ll b/llvm/test/CodeGen/X86/vector-trunc-packus-widen.ll new file mode 100644 index 000000000000..82904ef49fa2 --- /dev/null +++ b/llvm/test/CodeGen/X86/vector-trunc-packus-widen.ll @@ -0,0 +1,3306 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL + +; +; PACKUS saturation 
truncation to vXi32 +; + +define <4 x i32> @trunc_packus_v4i64_v4i32(<4 x i64> %a0) { +; SSE2-LABEL: trunc_packus_v4i64_v4i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm8, %xmm3 +; SSE2-NEXT: por %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm8, %xmm4 +; SSE2-NEXT: por %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc_packus_v4i64_v4i32: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295] +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: pxor %xmm2, %xmm3 +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647] +; SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm0 +; SSSE3-NEXT: pandn %xmm8, %xmm3 +; SSSE3-NEXT: por %xmm0, %xmm3 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: pxor %xmm2, %xmm0 +; SSSE3-NEXT: movdqa %xmm5, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm0, %xmm4 +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pandn %xmm8, %xmm4 +; SSSE3-NEXT: por %xmm1, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, %xmm0 +; SSSE3-NEXT: pxor %xmm2, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; 
SSSE3-NEXT: pcmpgtd %xmm2, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm5, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: por %xmm0, %xmm1 +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: movdqa %xmm3, %xmm0 +; SSSE3-NEXT: pxor %xmm2, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm5, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm2, %xmm0 +; SSSE3-NEXT: pand %xmm3, %xmm0 +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_packus_v4i64_v4i32: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: movapd {{.*#+}} xmm4 = [4294967295,4294967295] +; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648] +; SSE41-NEXT: pxor %xmm8, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483647,2147483647] +; SSE41-NEXT: movdqa %xmm6, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm7, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] +; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: movapd %xmm4, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm8, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm3, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 +; SSE41-NEXT: xorpd %xmm1, %xmm1 +; SSE41-NEXT: movapd %xmm4, %xmm0 +; SSE41-NEXT: xorpd %xmm8, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm8, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm3, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 +; SSE41-NEXT: movapd %xmm5, %xmm0 +; SSE41-NEXT: xorpd %xmm8, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm8, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm4, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc_packus_v4i64_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [4294967295,4294967295] +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: 
vpcmpgtq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_packus_v4i64_v4i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295] +; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 +; AVX2-SLOW-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_packus_v4i64_v4i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295] +; AVX2-FAST-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 +; AVX2-FAST-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512F-LABEL: trunc_packus_v4i64_v4i32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295] +; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_packus_v4i64_v4i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VL-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpmovusqd %ymm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_packus_v4i64_v4i32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295] +; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_packus_v4i64_v4i32: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovusqd %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq + %1 = icmp slt <4 x i64> %a0, + %2 = select <4 x i1> %1, <4 x i64> %a0, <4 x i64> + %3 = icmp sgt <4 x i64> %2, zeroinitializer + %4 = select <4 x i1> %3, <4 x i64> %2, <4 x i64> zeroinitializer + %5 = trunc <4 x i64> %4 to <4 x i32> + ret <4 x i32> %5 +} + + +define <8 x i32> @trunc_packus_v8i64_v8i32(<8 x i64> %a0) { +; SSE2-LABEL: trunc_packus_v8i64_v8i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295] +; 
SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: pxor %xmm10, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483647,2147483647] +; SSE2-NEXT: movdqa %xmm9, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pandn %xmm8, %xmm5 +; SSE2-NEXT: por %xmm0, %xmm5 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm10, %xmm0 +; SSE2-NEXT: movdqa %xmm9, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pandn %xmm8, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm10, %xmm1 +; SSE2-NEXT: movdqa %xmm9, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm2 +; SSE2-NEXT: pandn %xmm8, %xmm6 +; SSE2-NEXT: por %xmm2, %xmm6 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pxor %xmm10, %xmm1 +; SSE2-NEXT: movdqa %xmm9, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm8, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm10, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm6, %xmm1 +; SSE2-NEXT: pxor %xmm10, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm1 +; SSE2-NEXT: pand %xmm6, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pxor %xmm10, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm5, %xmm0 +; SSE2-NEXT: pxor %xmm10, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = 
xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm0 +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc_packus_v8i64_v8i32: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295] +; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm0, %xmm5 +; SSSE3-NEXT: pxor %xmm10, %xmm5 +; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483647,2147483647] +; SSSE3-NEXT: movdqa %xmm9, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm0 +; SSSE3-NEXT: pandn %xmm8, %xmm5 +; SSSE3-NEXT: por %xmm0, %xmm5 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: pxor %xmm10, %xmm0 +; SSSE3-NEXT: movdqa %xmm9, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm0 +; SSSE3-NEXT: pand %xmm0, %xmm1 +; SSSE3-NEXT: pandn %xmm8, %xmm0 +; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: pxor %xmm10, %xmm1 +; SSSE3-NEXT: movdqa %xmm9, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm6 +; SSSE3-NEXT: pand %xmm6, %xmm2 +; SSSE3-NEXT: pandn %xmm8, %xmm6 +; SSSE3-NEXT: por %xmm2, %xmm6 +; SSSE3-NEXT: movdqa %xmm3, %xmm1 +; SSSE3-NEXT: pxor %xmm10, %xmm1 +; SSSE3-NEXT: movdqa %xmm9, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: pandn %xmm8, %xmm2 +; SSSE3-NEXT: por %xmm3, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: pxor %xmm10, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm10, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm3 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: movdqa %xmm6, %xmm1 +; SSSE3-NEXT: pxor %xmm10, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm10, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm1 +; SSSE3-NEXT: pand %xmm6, %xmm1 +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2] +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pxor %xmm10, %xmm2 +; 
SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm10, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm2, %xmm3 +; SSSE3-NEXT: pand %xmm0, %xmm3 +; SSSE3-NEXT: movdqa %xmm5, %xmm0 +; SSSE3-NEXT: pxor %xmm10, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm10, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSSE3-NEXT: por %xmm6, %xmm0 +; SSSE3-NEXT: pand %xmm5, %xmm0 +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_packus_v8i64_v8i32: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: movapd {{.*#+}} xmm7 = [4294967295,4294967295] +; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648] +; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm11 = [2147483647,2147483647] +; SSE41-NEXT: movdqa %xmm11, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm11, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm8, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: movapd %xmm7, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm8 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: movdqa %xmm11, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm11, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm7, %xmm9 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm9 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: movdqa %xmm11, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm11, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm4, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: movapd %xmm7, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: movdqa %xmm11, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm11, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm2, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7 +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: movapd %xmm7, %xmm0 +; SSE41-NEXT: xorpd %xmm10, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm3, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm3 +; SSE41-NEXT: movapd %xmm4, %xmm0 +; 
SSE41-NEXT: xorpd %xmm10, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1 +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2] +; SSE41-NEXT: movapd %xmm9, %xmm0 +; SSE41-NEXT: xorpd %xmm10, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm4, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm3 +; SSE41-NEXT: movapd %xmm8, %xmm0 +; SSE41-NEXT: xorpd %xmm10, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm2 +; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] +; SSE41-NEXT: movaps %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc_packus_v8i64_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovapd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [4294967295,4294967295] +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX1-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm6 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm7, %xmm2 +; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm1, %xmm6, %xmm1 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm2 +; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_packus_v8i64_v8i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295] +; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 +; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 +; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3 +; AVX2-SLOW-NEXT: vpand %ymm1, %ymm3, %ymm1 +; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpand %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} 
ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_packus_v8i64_v8i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295] +; AVX2-FAST-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3 +; AVX2-FAST-NEXT: vpand %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vpand %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_packus_v8i64_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovusqd %zmm0, %ymm0 +; AVX512-NEXT: retq + %1 = icmp slt <8 x i64> %a0, + %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> + %3 = icmp sgt <8 x i64> %2, zeroinitializer + %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer + %5 = trunc <8 x i64> %4 to <8 x i32> + ret <8 x i32> %5 +} + +; +; PACKUS saturation truncation to vXi16 +; + +define <8 x i16> @trunc_packus_v8i64_v8i16(<8 x i64> %a0) { +; SSE2-LABEL: trunc_packus_v8i64_v8i16: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535] +; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: pxor %xmm10, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147549183,2147549183] +; SSE2-NEXT: movdqa %xmm9, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pandn %xmm8, %xmm5 +; SSE2-NEXT: por %xmm1, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm10, %xmm1 +; SSE2-NEXT: movdqa %xmm9, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm8, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm10, %xmm0 +; SSE2-NEXT: movdqa %xmm9, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm3 +; SSE2-NEXT: pandn %xmm8, %xmm6 +; SSE2-NEXT: por %xmm3, %xmm6 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm10, %xmm0 +; SSE2-NEXT: movdqa %xmm9, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 +; 
SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pandn %xmm8, %xmm3 +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm10, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm6, %xmm2 +; SSE2-NEXT: pxor %xmm10, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm2 +; SSE2-NEXT: pand %xmm6, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pxor %xmm10, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm3, %xmm4 +; SSE2-NEXT: pand %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm5, %xmm1 +; SSE2-NEXT: pxor %xmm10, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm5, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc_packus_v8i64_v8i16: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535] +; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm1, %xmm5 +; SSSE3-NEXT: pxor %xmm10, %xmm5 +; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147549183,2147549183] +; SSSE3-NEXT: movdqa %xmm9, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm1 +; SSSE3-NEXT: pandn %xmm8, %xmm5 +; SSSE3-NEXT: por %xmm1, %xmm5 +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm10, %xmm1 +; SSSE3-NEXT: movdqa %xmm9, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd 
%xmm9, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm1 +; SSSE3-NEXT: pand %xmm1, %xmm0 +; SSSE3-NEXT: pandn %xmm8, %xmm1 +; SSSE3-NEXT: por %xmm0, %xmm1 +; SSSE3-NEXT: movdqa %xmm3, %xmm0 +; SSSE3-NEXT: pxor %xmm10, %xmm0 +; SSSE3-NEXT: movdqa %xmm9, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm0, %xmm6 +; SSSE3-NEXT: pand %xmm6, %xmm3 +; SSSE3-NEXT: pandn %xmm8, %xmm6 +; SSSE3-NEXT: por %xmm3, %xmm6 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: pxor %xmm10, %xmm0 +; SSSE3-NEXT: movdqa %xmm9, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm0, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm2 +; SSSE3-NEXT: pandn %xmm8, %xmm3 +; SSSE3-NEXT: por %xmm2, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, %xmm0 +; SSSE3-NEXT: pxor %xmm10, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm10, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm0 +; SSSE3-NEXT: pand %xmm3, %xmm0 +; SSSE3-NEXT: movdqa %xmm6, %xmm2 +; SSSE3-NEXT: pxor %xmm10, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm10, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm2 +; SSSE3-NEXT: pand %xmm6, %xmm2 +; SSSE3-NEXT: movdqa %xmm1, %xmm3 +; SSSE3-NEXT: pxor %xmm10, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm10, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm3, %xmm4 +; SSSE3-NEXT: pand %xmm1, %xmm4 +; SSSE3-NEXT: movdqa %xmm5, %xmm1 +; SSSE3-NEXT: pxor %xmm10, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm10, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm3 +; SSSE3-NEXT: pand %xmm5, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] +; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; SSSE3-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; SSSE3-NEXT: punpckldq 
{{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_packus_v8i64_v8i16: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm9 +; SSE41-NEXT: movapd {{.*#+}} xmm7 = [65535,65535] +; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648] +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm11 = [2147549183,2147549183] +; SSE41-NEXT: movdqa %xmm11, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm11, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm7, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm8 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: movdqa %xmm11, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm11, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm4, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: movapd %xmm7, %xmm2 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm9, %xmm0 +; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: movdqa %xmm11, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm11, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm4, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: movapd %xmm7, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm4 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: movdqa %xmm11, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm11, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7 +; SSE41-NEXT: pxor %xmm3, %xmm3 +; SSE41-NEXT: movapd %xmm7, %xmm0 +; SSE41-NEXT: xorpd %xmm10, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm5 +; SSE41-NEXT: movapd %xmm4, %xmm0 +; SSE41-NEXT: xorpd %xmm10, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm6, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1 +; SSE41-NEXT: packusdw %xmm5, %xmm1 +; SSE41-NEXT: movapd %xmm2, %xmm0 +; SSE41-NEXT: xorpd %xmm10, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 +; SSE41-NEXT: pshufd 
{{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 +; SSE41-NEXT: movapd %xmm8, %xmm0 +; SSE41-NEXT: xorpd %xmm10, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm3 +; SSE41-NEXT: packusdw %xmm4, %xmm3 +; SSE41-NEXT: packusdw %xmm3, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc_packus_v8i64_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovapd {{.*#+}} ymm2 = [65535,65535,65535,65535] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [65535,65535] +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX1-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm6 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm7, %xmm2 +; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm1, %xmm6, %xmm1 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm2 +; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_packus_v8i64_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [65535,65535,65535,65535] +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3 +; AVX2-NEXT: vpand %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm2 +; AVX2-NEXT: vpand %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_packus_v8i64_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovusqw %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = icmp slt <8 x i64> %a0, + %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> + %3 = icmp sgt <8 x i64> %2, zeroinitializer + %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer + %5 = trunc <8 x i64> %4 to <8 x i16> + ret <8 x i16> %5 +} + +define <8 x i16> @trunc_packus_v8i32_v8i16(<8 x i32> %a0) { +; SSE2-LABEL: trunc_packus_v8i32_v8i16: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535] +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: 
pcmpgtd %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pandn %xmm2, %xmm3 +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm2, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pslld $16, %xmm1 +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: pslld $16, %xmm0 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: packssdw %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc_packus_v8i32_v8i16: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535] +; SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm1 +; SSSE3-NEXT: pandn %xmm2, %xmm3 +; SSSE3-NEXT: por %xmm1, %xmm3 +; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 +; SSSE3-NEXT: pand %xmm1, %xmm0 +; SSSE3-NEXT: pandn %xmm2, %xmm1 +; SSSE3-NEXT: por %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0 +; SSSE3-NEXT: pand %xmm1, %xmm0 +; SSSE3-NEXT: movdqa %xmm3, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1 +; SSSE3-NEXT: pand %xmm3, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSSE3-NEXT: pshufb %xmm2, %xmm1 +; SSSE3-NEXT: pshufb %xmm2, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_packus_v8i32_v8i16: +; SSE41: # %bb.0: +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc_packus_v8i32_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_packus_v8i32_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_packus_v8i32_v8i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_packus_v8i32_v8i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpmovusdw %ymm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_packus_v8i32_v8i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_packus_v8i32_v8i16: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovusdw %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; 
AVX512BWVL-NEXT: retq + %1 = icmp slt <8 x i32> %a0, + %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> + %3 = icmp sgt <8 x i32> %2, zeroinitializer + %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer + %5 = trunc <8 x i32> %4 to <8 x i16> + ret <8 x i16> %5 +} + +define <16 x i16> @trunc_packus_v16i32_v16i16(<16 x i32> %a0) { +; SSE2-LABEL: trunc_packus_v16i32_v16i16: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535] +; SSE2-NEXT: movdqa %xmm6, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm6, %xmm4 +; SSE2-NEXT: por %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm6, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pandn %xmm6, %xmm5 +; SSE2-NEXT: por %xmm0, %xmm5 +; SSE2-NEXT: movdqa %xmm6, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm6, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm6, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pandn %xmm6, %xmm3 +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm5, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm5 +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: pslld $16, %xmm5 +; SSE2-NEXT: psrad $16, %xmm5 +; SSE2-NEXT: pslld $16, %xmm0 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: packssdw %xmm5, %xmm0 +; SSE2-NEXT: pslld $16, %xmm3 +; SSE2-NEXT: psrad $16, %xmm3 +; SSE2-NEXT: pslld $16, %xmm1 +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: packssdw %xmm3, %xmm1 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc_packus_v16i32_v16i16: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535] +; SSSE3-NEXT: movdqa %xmm6, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pandn %xmm6, %xmm4 +; SSSE3-NEXT: por %xmm1, %xmm4 +; SSSE3-NEXT: movdqa %xmm6, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm0 +; SSSE3-NEXT: pandn %xmm6, %xmm5 +; SSSE3-NEXT: por %xmm0, %xmm5 +; SSSE3-NEXT: movdqa %xmm6, %xmm0 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm0 +; SSSE3-NEXT: pand %xmm0, %xmm3 +; SSSE3-NEXT: pandn %xmm6, %xmm0 +; SSSE3-NEXT: por %xmm3, %xmm0 +; SSSE3-NEXT: movdqa %xmm6, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm2 +; SSSE3-NEXT: pandn %xmm6, %xmm3 +; SSSE3-NEXT: por %xmm2, %xmm3 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: movdqa %xmm3, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1 +; SSSE3-NEXT: pand %xmm3, %xmm1 +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 +; SSSE3-NEXT: pand %xmm0, %xmm3 +; SSSE3-NEXT: movdqa %xmm5, %xmm0 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0 +; SSSE3-NEXT: pand %xmm5, %xmm0 +; SSSE3-NEXT: movdqa %xmm4, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5 +; SSSE3-NEXT: pand %xmm4, %xmm5 +; SSSE3-NEXT: pslld $16, %xmm5 +; SSSE3-NEXT: psrad $16, %xmm5 +; SSSE3-NEXT: pslld $16, %xmm0 +; SSSE3-NEXT: psrad $16, %xmm0 +; SSSE3-NEXT: packssdw %xmm5, %xmm0 +; SSSE3-NEXT: pslld $16, %xmm3 +; SSSE3-NEXT: psrad $16, %xmm3 +; SSSE3-NEXT: pslld $16, %xmm1 +; SSSE3-NEXT: psrad $16, %xmm1 +; SSSE3-NEXT: packssdw %xmm3, %xmm1 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: 
trunc_packus_v16i32_v16i16: +; SSE41: # %bb.0: +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: packusdw %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc_packus_v16i32_v16i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_packus_v16i32_v16i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_packus_v16i32_v16i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovusdw %zmm0, %ymm0 +; AVX512-NEXT: retq + %1 = icmp slt <16 x i32> %a0, + %2 = select <16 x i1> %1, <16 x i32> %a0, <16 x i32> + %3 = icmp sgt <16 x i32> %2, zeroinitializer + %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer + %5 = trunc <16 x i32> %4 to <16 x i16> + ret <16 x i16> %5 +} + +; +; PACKUS saturation truncation to v16i8 +; + +define <8 x i8> @trunc_packus_v8i64_v8i8(<8 x i64> %a0) { +; SSE2-LABEL: trunc_packus_v8i64_v8i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255] +; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pxor %xmm10, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483903,2147483903] +; SSE2-NEXT: movdqa %xmm9, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm11 +; SSE2-NEXT: pand %xmm11, %xmm3 +; SSE2-NEXT: pandn %xmm8, %xmm11 +; SSE2-NEXT: por %xmm3, %xmm11 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pxor %xmm10, %xmm3 +; SSE2-NEXT: movdqa %xmm9, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pandn %xmm8, %xmm3 +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm10, %xmm2 +; SSE2-NEXT: movdqa %xmm9, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm8, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm10, %xmm1 +; SSE2-NEXT: movdqa %xmm9, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm8, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm10, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm10, 
%xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pxor %xmm10, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm7 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pxor %xmm10, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm4 +; SSE2-NEXT: movdqa %xmm11, %xmm5 +; SSE2-NEXT: pxor %xmm10, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm6 +; SSE2-NEXT: pand %xmm8, %xmm6 +; SSE2-NEXT: pand %xmm11, %xmm6 +; SSE2-NEXT: pand %xmm8, %xmm4 +; SSE2-NEXT: pand %xmm3, %xmm4 +; SSE2-NEXT: packuswb %xmm6, %xmm4 +; SSE2-NEXT: pand %xmm8, %xmm7 +; SSE2-NEXT: pand %xmm2, %xmm7 +; SSE2-NEXT: pand %xmm8, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm7, %xmm0 +; SSE2-NEXT: packuswb %xmm4, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc_packus_v8i64_v8i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255] +; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pxor %xmm10, %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483903,2147483903] +; SSSE3-NEXT: movdqa %xmm9, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm11 +; SSSE3-NEXT: pand %xmm11, %xmm3 +; SSSE3-NEXT: pandn %xmm8, %xmm11 +; SSSE3-NEXT: por %xmm3, %xmm11 +; SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSSE3-NEXT: pxor %xmm10, %xmm3 +; SSSE3-NEXT: movdqa %xmm9, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm5, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm2 +; SSSE3-NEXT: pandn %xmm8, %xmm3 +; SSSE3-NEXT: por %xmm2, %xmm3 +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pxor %xmm10, %xmm2 +; SSSE3-NEXT: movdqa %xmm9, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm5, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: pandn %xmm8, %xmm2 +; 
SSSE3-NEXT: por %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm10, %xmm1 +; SSSE3-NEXT: movdqa %xmm9, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm5, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm1 +; SSSE3-NEXT: pand %xmm1, %xmm0 +; SSSE3-NEXT: pandn %xmm8, %xmm1 +; SSSE3-NEXT: por %xmm0, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: pxor %xmm10, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm10, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm5, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm0 +; SSSE3-NEXT: movdqa %xmm2, %xmm4 +; SSSE3-NEXT: pxor %xmm10, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm10, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm7 +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pxor %xmm10, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm10, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm9, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; SSSE3-NEXT: por %xmm6, %xmm4 +; SSSE3-NEXT: movdqa %xmm11, %xmm5 +; SSSE3-NEXT: pxor %xmm10, %xmm5 +; SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm10, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSSE3-NEXT: pand %xmm9, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm5, %xmm6 +; SSSE3-NEXT: pand %xmm8, %xmm6 +; SSSE3-NEXT: pand %xmm11, %xmm6 +; SSSE3-NEXT: pand %xmm8, %xmm4 +; SSSE3-NEXT: pand %xmm3, %xmm4 +; SSSE3-NEXT: packuswb %xmm6, %xmm4 +; SSSE3-NEXT: pand %xmm8, %xmm7 +; SSSE3-NEXT: pand %xmm2, %xmm7 +; SSSE3-NEXT: pand %xmm8, %xmm0 +; SSSE3-NEXT: pand %xmm1, %xmm0 +; SSSE3-NEXT: packuswb %xmm7, %xmm0 +; SSSE3-NEXT: packuswb %xmm4, %xmm0 +; SSSE3-NEXT: packuswb %xmm0, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_packus_v8i64_v8i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm9 +; SSE41-NEXT: movapd {{.*#+}} xmm8 = [255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648] +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483903,2147483903] +; SSE41-NEXT: movdqa %xmm5, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm5, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm7, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm8, %xmm11 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm11 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: movdqa %xmm5, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm5, %xmm0 +; SSE41-NEXT: pshufd 
{{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm4, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm8, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: movdqa %xmm5, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm5, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm4, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm8, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 +; SSE41-NEXT: movdqa %xmm9, %xmm0 +; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: movdqa %xmm5, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm5, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm2, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: movapd %xmm8, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm5 +; SSE41-NEXT: movapd %xmm5, %xmm0 +; SSE41-NEXT: xorpd %xmm10, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm2, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 +; SSE41-NEXT: movapd %xmm4, %xmm0 +; SSE41-NEXT: xorpd %xmm10, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm6, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm5 +; SSE41-NEXT: movapd %xmm3, %xmm0 +; SSE41-NEXT: xorpd %xmm10, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm6, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm4 +; SSE41-NEXT: movapd %xmm11, %xmm0 +; SSE41-NEXT: xorpd %xmm10, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm6, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm2 +; SSE41-NEXT: andpd %xmm8, %xmm2 +; SSE41-NEXT: andpd %xmm8, %xmm4 +; SSE41-NEXT: packusdw %xmm2, %xmm4 +; SSE41-NEXT: andpd %xmm8, %xmm5 +; SSE41-NEXT: andpd %xmm8, %xmm1 +; SSE41-NEXT: packusdw %xmm5, %xmm1 +; SSE41-NEXT: packusdw %xmm4, %xmm1 +; SSE41-NEXT: packuswb %xmm1, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc_packus_v8i64_v8i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovapd {{.*#+}} ymm2 = [255,255,255,255] +; 
AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255] +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX1-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm8 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm6 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm7, %xmm2 +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [1.2598673968951787E-321,1.2598673968951787E-321] +; AVX1-NEXT: # xmm3 = mem[0,0] +; AVX1-NEXT: vpand %xmm3, %xmm7, %xmm7 +; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm6, %xmm1 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm8, %xmm0 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_packus_v8i64_v8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [255,255,255,255] +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3 +; AVX2-NEXT: vpand %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2 +; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_packus_v8i64_v8i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-NEXT: vmovq %xmm0, %rcx +; AVX512F-NEXT: vmovd %ecx, %xmm1 +; AVX512F-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-NEXT: vmovq %xmm2, %rax +; AVX512F-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: vpextrq $1, %xmm2, %rax +; AVX512F-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm2 +; AVX512F-NEXT: vmovq %xmm2, %rax +; AVX512F-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: vpextrq $1, %xmm2, %rax +; AVX512F-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512F-NEXT: vmovq 
%xmm0, %rax +; AVX512F-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-NEXT: vpinsrb $7, %eax, %xmm1, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_packus_v8i64_v8i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VL-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax +; AVX512VL-NEXT: vmovq %xmm0, %rcx +; AVX512VL-NEXT: vmovd %ecx, %xmm1 +; AVX512VL-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512VL-NEXT: vmovq %xmm2, %rax +; AVX512VL-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX512VL-NEXT: vpextrq $1, %xmm2, %rax +; AVX512VL-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; AVX512VL-NEXT: vextracti32x4 $2, %zmm0, %xmm2 +; AVX512VL-NEXT: vmovq %xmm2, %rax +; AVX512VL-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX512VL-NEXT: vpextrq $1, %xmm2, %rax +; AVX512VL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX512VL-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax +; AVX512VL-NEXT: vpinsrb $7, %eax, %xmm1, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_packus_v8i64_v8i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_packus_v8i64_v8i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq + %1 = icmp slt <8 x i64> %a0, + %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> + %3 = icmp sgt <8 x i64> %2, zeroinitializer + %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer + %5 = 
trunc <8 x i64> %4 to <8 x i8> + ret <8 x i8> %5 +} + +define void @trunc_packus_v8i64_v8i8_store(<8 x i64> %a0, <8 x i8> *%p1) { +; SSE2-LABEL: trunc_packus_v8i64_v8i8_store: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255] +; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pxor %xmm10, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483903,2147483903] +; SSE2-NEXT: movdqa %xmm9, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm11 +; SSE2-NEXT: pand %xmm11, %xmm3 +; SSE2-NEXT: pandn %xmm8, %xmm11 +; SSE2-NEXT: por %xmm3, %xmm11 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pxor %xmm10, %xmm3 +; SSE2-NEXT: movdqa %xmm9, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pandn %xmm8, %xmm3 +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm10, %xmm2 +; SSE2-NEXT: movdqa %xmm9, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm8, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm10, %xmm1 +; SSE2-NEXT: movdqa %xmm9, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm8, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm10, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pxor %xmm10, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm7 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pxor %xmm10, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm4 +; SSE2-NEXT: movdqa %xmm11, %xmm5 +; SSE2-NEXT: pxor %xmm10, %xmm5 +; SSE2-NEXT: 
movdqa %xmm5, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm6 +; SSE2-NEXT: pand %xmm8, %xmm6 +; SSE2-NEXT: pand %xmm11, %xmm6 +; SSE2-NEXT: pand %xmm8, %xmm4 +; SSE2-NEXT: pand %xmm3, %xmm4 +; SSE2-NEXT: packuswb %xmm6, %xmm4 +; SSE2-NEXT: pand %xmm8, %xmm7 +; SSE2-NEXT: pand %xmm2, %xmm7 +; SSE2-NEXT: pand %xmm8, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm7, %xmm0 +; SSE2-NEXT: packuswb %xmm4, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: movq %xmm0, (%rdi) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc_packus_v8i64_v8i8_store: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255] +; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pxor %xmm10, %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483903,2147483903] +; SSSE3-NEXT: movdqa %xmm9, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm11 +; SSSE3-NEXT: pand %xmm11, %xmm3 +; SSSE3-NEXT: pandn %xmm8, %xmm11 +; SSSE3-NEXT: por %xmm3, %xmm11 +; SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSSE3-NEXT: pxor %xmm10, %xmm3 +; SSSE3-NEXT: movdqa %xmm9, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm5, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm2 +; SSSE3-NEXT: pandn %xmm8, %xmm3 +; SSSE3-NEXT: por %xmm2, %xmm3 +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pxor %xmm10, %xmm2 +; SSSE3-NEXT: movdqa %xmm9, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm5, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: pandn %xmm8, %xmm2 +; SSSE3-NEXT: por %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm10, %xmm1 +; SSSE3-NEXT: movdqa %xmm9, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm5, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm1 +; SSSE3-NEXT: pand %xmm1, %xmm0 +; SSSE3-NEXT: pandn %xmm8, %xmm1 +; SSSE3-NEXT: por %xmm0, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: pxor %xmm10, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm10, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm5, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm0 +; SSSE3-NEXT: movdqa %xmm2, %xmm4 +; SSSE3-NEXT: pxor %xmm10, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm10, %xmm5 +; SSSE3-NEXT: pshufd 
{{.*#+}} xmm7 = xmm5[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm7 +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pxor %xmm10, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm10, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm9, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; SSSE3-NEXT: por %xmm6, %xmm4 +; SSSE3-NEXT: movdqa %xmm11, %xmm5 +; SSSE3-NEXT: pxor %xmm10, %xmm5 +; SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm10, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSSE3-NEXT: pand %xmm9, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm5, %xmm6 +; SSSE3-NEXT: pand %xmm8, %xmm6 +; SSSE3-NEXT: pand %xmm11, %xmm6 +; SSSE3-NEXT: pand %xmm8, %xmm4 +; SSSE3-NEXT: pand %xmm3, %xmm4 +; SSSE3-NEXT: packuswb %xmm6, %xmm4 +; SSSE3-NEXT: pand %xmm8, %xmm7 +; SSSE3-NEXT: pand %xmm2, %xmm7 +; SSSE3-NEXT: pand %xmm8, %xmm0 +; SSSE3-NEXT: pand %xmm1, %xmm0 +; SSSE3-NEXT: packuswb %xmm7, %xmm0 +; SSSE3-NEXT: packuswb %xmm4, %xmm0 +; SSSE3-NEXT: packuswb %xmm0, %xmm0 +; SSSE3-NEXT: movq %xmm0, (%rdi) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_packus_v8i64_v8i8_store: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm9 +; SSE41-NEXT: movapd {{.*#+}} xmm8 = [255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648] +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483903,2147483903] +; SSE41-NEXT: movdqa %xmm5, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm5, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm7, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm8, %xmm11 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm11 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: movdqa %xmm5, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm5, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm4, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm8, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: movdqa %xmm5, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm5, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm4, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm8, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 +; SSE41-NEXT: movdqa %xmm9, %xmm0 +; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: movdqa %xmm5, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm5, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm2, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} 
xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: movapd %xmm8, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm5 +; SSE41-NEXT: movapd %xmm5, %xmm0 +; SSE41-NEXT: xorpd %xmm10, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm2, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 +; SSE41-NEXT: movapd %xmm4, %xmm0 +; SSE41-NEXT: xorpd %xmm10, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm6, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm5 +; SSE41-NEXT: movapd %xmm3, %xmm0 +; SSE41-NEXT: xorpd %xmm10, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm6, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm4 +; SSE41-NEXT: movapd %xmm11, %xmm0 +; SSE41-NEXT: xorpd %xmm10, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm6, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm2 +; SSE41-NEXT: andpd %xmm8, %xmm2 +; SSE41-NEXT: andpd %xmm8, %xmm4 +; SSE41-NEXT: packusdw %xmm2, %xmm4 +; SSE41-NEXT: andpd %xmm8, %xmm5 +; SSE41-NEXT: andpd %xmm8, %xmm1 +; SSE41-NEXT: packusdw %xmm5, %xmm1 +; SSE41-NEXT: packusdw %xmm4, %xmm1 +; SSE41-NEXT: packuswb %xmm1, %xmm1 +; SSE41-NEXT: movq %xmm1, (%rdi) +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc_packus_v8i64_v8i8_store: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovapd {{.*#+}} ymm2 = [255,255,255,255] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255] +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX1-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm8 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm6 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm7, %xmm2 +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [1.2598673968951787E-321,1.2598673968951787E-321] +; AVX1-NEXT: # xmm3 = mem[0,0] +; AVX1-NEXT: vpand %xmm3, %xmm7, %xmm7 +; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 
+; AVX1-NEXT: vpand %xmm1, %xmm6, %xmm1 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm8, %xmm0 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, (%rdi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_packus_v8i64_v8i8_store: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [255,255,255,255] +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3 +; AVX2-NEXT: vpand %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2 +; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2-NEXT: vmovq %xmm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_packus_v8i64_v8i8_store: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovusqb %zmm0, (%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = icmp slt <8 x i64> %a0, + %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> + %3 = icmp sgt <8 x i64> %2, zeroinitializer + %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer + %5 = trunc <8 x i64> %4 to <8 x i8> + store <8 x i8> %5, <8 x i8> *%p1 + ret void +} + +define <16 x i8> @trunc_packus_v16i64_v16i8(<16 x i64> %a0) { +; SSE2-LABEL: trunc_packus_v16i64_v16i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [255,255] +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm6, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 +; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147483903,2147483903] +; SSE2-NEXT: movdqa %xmm11, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm9[1,1,3,3] +; SSE2-NEXT: pand %xmm13, %xmm14 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm12[1,1,3,3] +; SSE2-NEXT: por %xmm14, %xmm9 +; SSE2-NEXT: pand %xmm9, %xmm6 +; SSE2-NEXT: pandn %xmm10, %xmm9 +; SSE2-NEXT: por %xmm6, %xmm9 +; SSE2-NEXT: movdqa %xmm7, %xmm6 +; SSE2-NEXT: pxor %xmm8, %xmm6 +; SSE2-NEXT: movdqa %xmm11, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: pand %xmm13, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm12[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm12 +; SSE2-NEXT: pand %xmm12, %xmm7 +; SSE2-NEXT: pandn %xmm10, %xmm12 +; SSE2-NEXT: por %xmm7, %xmm12 +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; 
SSE2-NEXT: pxor %xmm8, %xmm6 +; SSE2-NEXT: movdqa %xmm11, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: pand %xmm13, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm13 +; SSE2-NEXT: pand %xmm13, %xmm4 +; SSE2-NEXT: pandn %xmm10, %xmm13 +; SSE2-NEXT: por %xmm4, %xmm13 +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: movdqa %xmm11, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm14 +; SSE2-NEXT: pand %xmm14, %xmm5 +; SSE2-NEXT: pandn %xmm10, %xmm14 +; SSE2-NEXT: por %xmm5, %xmm14 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: movdqa %xmm11, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pandn %xmm10, %xmm5 +; SSE2-NEXT: por %xmm2, %xmm5 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pxor %xmm8, %xmm2 +; SSE2-NEXT: movdqa %xmm11, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm10, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm8, %xmm3 +; SSE2-NEXT: movdqa %xmm11, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm10, %xmm3 +; SSE2-NEXT: por %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: movdqa %xmm11, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm10, %xmm4 +; SSE2-NEXT: por %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} 
xmm7 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm5, %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: packuswb %xmm3, %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm14, %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm14, %xmm2 +; SSE2-NEXT: movdqa %xmm13, %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: pand %xmm13, %xmm1 +; SSE2-NEXT: packuswb %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm12, %xmm2 +; SSE2-NEXT: pxor %xmm8, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm12, %xmm3 +; SSE2-NEXT: movdqa %xmm9, %xmm2 +; SSE2-NEXT: pxor %xmm8, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm4 +; SSE2-NEXT: pand %xmm9, %xmm4 +; SSE2-NEXT: packuswb %xmm3, %xmm4 +; SSE2-NEXT: packuswb %xmm4, %xmm1 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc_packus_v16i64_v16i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [255,255] +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm6, %xmm9 +; SSSE3-NEXT: pxor %xmm8, %xmm9 +; SSSE3-NEXT: movdqa {{.*#+}} xmm11 = [2147483903,2147483903] +; SSSE3-NEXT: movdqa %xmm11, %xmm12 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm12 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm11, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm9[1,1,3,3] +; SSSE3-NEXT: pand %xmm13, %xmm14 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = 
xmm12[1,1,3,3] +; SSSE3-NEXT: por %xmm14, %xmm9 +; SSSE3-NEXT: pand %xmm9, %xmm6 +; SSSE3-NEXT: pandn %xmm10, %xmm9 +; SSSE3-NEXT: por %xmm6, %xmm9 +; SSSE3-NEXT: movdqa %xmm7, %xmm6 +; SSSE3-NEXT: pxor %xmm8, %xmm6 +; SSSE3-NEXT: movdqa %xmm11, %xmm12 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm12 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm11, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSSE3-NEXT: pand %xmm13, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm12[1,1,3,3] +; SSSE3-NEXT: por %xmm6, %xmm12 +; SSSE3-NEXT: pand %xmm12, %xmm7 +; SSSE3-NEXT: pandn %xmm10, %xmm12 +; SSSE3-NEXT: por %xmm7, %xmm12 +; SSSE3-NEXT: movdqa %xmm4, %xmm6 +; SSSE3-NEXT: pxor %xmm8, %xmm6 +; SSSE3-NEXT: movdqa %xmm11, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm7[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm11, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSSE3-NEXT: pand %xmm13, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm6, %xmm13 +; SSSE3-NEXT: pand %xmm13, %xmm4 +; SSSE3-NEXT: pandn %xmm10, %xmm13 +; SSSE3-NEXT: por %xmm4, %xmm13 +; SSSE3-NEXT: movdqa %xmm5, %xmm4 +; SSSE3-NEXT: pxor %xmm8, %xmm4 +; SSSE3-NEXT: movdqa %xmm11, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm11, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm14 +; SSSE3-NEXT: pand %xmm14, %xmm5 +; SSSE3-NEXT: pandn %xmm10, %xmm14 +; SSSE3-NEXT: por %xmm5, %xmm14 +; SSSE3-NEXT: movdqa %xmm2, %xmm4 +; SSSE3-NEXT: pxor %xmm8, %xmm4 +; SSSE3-NEXT: movdqa %xmm11, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm11, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm2 +; SSSE3-NEXT: pandn %xmm10, %xmm5 +; SSSE3-NEXT: por %xmm2, %xmm5 +; SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSSE3-NEXT: pxor %xmm8, %xmm2 +; SSSE3-NEXT: movdqa %xmm11, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm11, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm6, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: pandn %xmm10, %xmm2 +; SSSE3-NEXT: por %xmm3, %xmm2 +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: pxor %xmm8, %xmm3 +; SSSE3-NEXT: movdqa %xmm11, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm11, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm0 +; SSSE3-NEXT: pandn %xmm10, %xmm3 +; SSSE3-NEXT: por %xmm0, %xmm3 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: pxor %xmm8, %xmm0 +; SSSE3-NEXT: movdqa %xmm11, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm0, 
%xmm4 +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pandn %xmm10, %xmm4 +; SSSE3-NEXT: por %xmm1, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, %xmm0 +; SSSE3-NEXT: pxor %xmm8, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm8, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: por %xmm0, %xmm1 +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: movdqa %xmm3, %xmm0 +; SSSE3-NEXT: pxor %xmm8, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm8, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm0 +; SSSE3-NEXT: pand %xmm3, %xmm0 +; SSSE3-NEXT: packuswb %xmm1, %xmm0 +; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: pxor %xmm8, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm8, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm3 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: movdqa %xmm5, %xmm1 +; SSSE3-NEXT: pxor %xmm8, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm8, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm5, %xmm2 +; SSSE3-NEXT: packuswb %xmm3, %xmm2 +; SSSE3-NEXT: packuswb %xmm2, %xmm0 +; SSSE3-NEXT: movdqa %xmm14, %xmm1 +; SSSE3-NEXT: pxor %xmm8, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm8, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm3, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm14, %xmm2 +; SSSE3-NEXT: movdqa %xmm13, %xmm1 +; SSSE3-NEXT: pxor %xmm8, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm8, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm5, %xmm1 +; SSSE3-NEXT: pand %xmm13, %xmm1 +; SSSE3-NEXT: packuswb %xmm2, %xmm1 +; SSSE3-NEXT: movdqa %xmm12, %xmm2 +; SSSE3-NEXT: pxor %xmm8, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm8, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm2, %xmm3 +; SSSE3-NEXT: pand %xmm12, %xmm3 +; SSSE3-NEXT: movdqa %xmm9, %xmm2 +; SSSE3-NEXT: pxor %xmm8, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm8, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; 
SSSE3-NEXT: pand %xmm5, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm2, %xmm4 +; SSSE3-NEXT: pand %xmm9, %xmm4 +; SSSE3-NEXT: packuswb %xmm3, %xmm4 +; SSSE3-NEXT: packuswb %xmm4, %xmm1 +; SSSE3-NEXT: packuswb %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_packus_v16i64_v16i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm8 +; SSE41-NEXT: movapd {{.*#+}} xmm11 = [255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648] +; SSE41-NEXT: movdqa %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm12 = [2147483903,2147483903] +; SSE41-NEXT: movdqa %xmm12, %xmm10 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm10[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm12, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm13, %xmm14 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3] +; SSE41-NEXT: por %xmm14, %xmm0 +; SSE41-NEXT: movapd %xmm11, %xmm10 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm10 +; SSE41-NEXT: movdqa %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm12, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm6[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm12, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm13, %xmm14 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSE41-NEXT: por %xmm14, %xmm0 +; SSE41-NEXT: movapd %xmm11, %xmm13 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm13 +; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm12, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm6[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm12, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm14, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: movapd %xmm11, %xmm14 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm14 +; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm12, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm12, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm6, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: movapd %xmm11, %xmm15 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm15 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm12, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm12, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm6, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: movapd %xmm11, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm12, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm12, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm6, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: movapd %xmm11, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm6 +; SSE41-NEXT: movdqa %xmm8, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm12, %xmm2 +; 
SSE41-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm12, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm3, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: movapd %xmm11, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm3 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm12, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm12, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm7, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm11 +; SSE41-NEXT: xorpd %xmm8, %xmm8 +; SSE41-NEXT: movapd %xmm11, %xmm0 +; SSE41-NEXT: xorpd %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm4, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm4 +; SSE41-NEXT: movapd %xmm3, %xmm0 +; SSE41-NEXT: xorpd %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm7, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 +; SSE41-NEXT: packusdw %xmm4, %xmm1 +; SSE41-NEXT: movapd %xmm6, %xmm0 +; SSE41-NEXT: xorpd %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm3, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm2 +; SSE41-NEXT: movapd %xmm5, %xmm0 +; SSE41-NEXT: xorpd %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm4, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm3 +; SSE41-NEXT: packusdw %xmm2, %xmm3 +; SSE41-NEXT: packusdw %xmm3, %xmm1 +; SSE41-NEXT: movapd %xmm15, %xmm0 +; SSE41-NEXT: xorpd %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm3, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: blendvpd %xmm0, %xmm15, %xmm2 +; SSE41-NEXT: movapd %xmm14, %xmm0 +; SSE41-NEXT: xorpd %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE41-NEXT: pcmpeqd 
%xmm9, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm4, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm14, %xmm3 +; SSE41-NEXT: packusdw %xmm2, %xmm3 +; SSE41-NEXT: movapd %xmm13, %xmm0 +; SSE41-NEXT: xorpd %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm4, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: blendvpd %xmm0, %xmm13, %xmm2 +; SSE41-NEXT: movapd %xmm10, %xmm0 +; SSE41-NEXT: xorpd %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm8 +; SSE41-NEXT: packusdw %xmm2, %xmm8 +; SSE41-NEXT: packusdw %xmm8, %xmm3 +; SSE41-NEXT: packuswb %xmm3, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc_packus_v16i64_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovapd {{.*#+}} ymm4 = [255,255,255,255] +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255] +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm6, %xmm7 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5 +; AVX1-NEXT: vblendvpd %ymm5, %ymm3, %ymm4, %ymm15 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm6, %xmm7 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5 +; AVX1-NEXT: vblendvpd %ymm5, %ymm2, %ymm4, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm6, %xmm7 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5 +; AVX1-NEXT: vblendvpd %ymm5, %ymm1, %ymm4, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm6, %xmm6 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX1-NEXT: vblendvpd %ymm5, %ymm0, %ymm4, %ymm0 +; AVX1-NEXT: vxorpd %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm8 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm14 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm14, %xmm9 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm10 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm7, %xmm11 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm12 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm13 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm15, %xmm6 +; AVX1-NEXT: vextractf128 $1, %ymm15, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm4 +; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpand %xmm15, %xmm6, %xmm4 +; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpand %xmm5, %xmm13, %xmm4 +; AVX1-NEXT: vpand %xmm2, %xmm12, %xmm2 +; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm7, %xmm11, %xmm3 +; AVX1-NEXT: vpand %xmm1, %xmm10, %xmm1 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm14, %xmm9, %xmm3 +; AVX1-NEXT: vpand %xmm0, %xmm8, %xmm0 +; 
AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_packus_v16i64_v16i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
+; AVX2-NEXT: vpcmpgtq %ymm2, %ymm4, %ymm5
+; AVX2-NEXT: vblendvpd %ymm5, %ymm2, %ymm4, %ymm2
+; AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm5
+; AVX2-NEXT: vblendvpd %ymm5, %ymm3, %ymm4, %ymm3
+; AVX2-NEXT: vpcmpgtq %ymm0, %ymm4, %ymm5
+; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm4, %ymm0
+; AVX2-NEXT: vpcmpgtq %ymm1, %ymm4, %ymm5
+; AVX2-NEXT: vblendvpd %ymm5, %ymm1, %ymm4, %ymm1
+; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT: vpcmpgtq %ymm4, %ymm1, %ymm5
+; AVX2-NEXT: vpand %ymm1, %ymm5, %ymm1
+; AVX2-NEXT: vpcmpgtq %ymm4, %ymm0, %ymm5
+; AVX2-NEXT: vpand %ymm0, %ymm5, %ymm0
+; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm1
+; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpgtq %ymm4, %ymm2, %ymm3
+; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_packus_v16i64_v16i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255]
+; AVX512-NEXT: vpminsq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT: vpminsq %zmm2, %zmm1, %zmm1
+; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vpmaxsq %zmm2, %zmm1, %zmm1
+; AVX512-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+  %1 = icmp slt <16 x i64> %a0, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
+  %2 = select <16 x i1> %1, <16 x i64> %a0, <16 x i64> <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
+  %3 = icmp sgt <16 x i64> %2, zeroinitializer
+  %4 = select <16 x i1> %3, <16 x i64> %2, <16 x i64> zeroinitializer
+  %5 = trunc <16 x i64> %4 to <16 x i8>
+  ret <16 x i8> %5
+}
+
+define <8 x i8> @trunc_packus_v8i32_v8i8(<8 x i32> %a0) {
+; SSE-LABEL: trunc_packus_v8i32_v8i8:
+; SSE: # %bb.0:
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_packus_v8i32_v8i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_packus_v8i32_v8i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc_packus_v8i32_v8i8:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc_packus_v8i32_v8i8:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0
+; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
%ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_packus_v8i32_v8i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_packus_v8i32_v8i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq + %1 = icmp slt <8 x i32> %a0, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255> + %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255> + %3 = icmp sgt <8 x i32> %2, zeroinitializer + %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer + %5 = trunc <8 x i32> %4 to <8 x i8> + ret <8 x i8> %5 +} + +define void @trunc_packus_v8i32_v8i8_store(<8 x i32> %a0, <8 x i8> *%p1) { +; SSE-LABEL: trunc_packus_v8i32_v8i8_store: +; SSE: # %bb.0: +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movq %xmm0, (%rdi) +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_packus_v8i32_v8i8_store: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, (%rdi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_packus_v8i32_v8i8_store: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_packus_v8i32_v8i8_store: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vmovq %xmm0, (%rdi) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_packus_v8i32_v8i8_store: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpmovusdb %ymm0, (%rdi) +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_packus_v8i32_v8i8_store: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rdi) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_packus_v8i32_v8i8_store: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovusdb %ymm0, (%rdi) +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq + %1 = icmp slt <8 x i32> %a0, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255> + %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255> + %3 = icmp sgt <8 x i32> %2, zeroinitializer + %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer + %5 = trunc <8 x i32> %4 to <8 x i8> + store <8 x i8> %5, <8 x i8> *%p1 + ret void +} + +define <16 x i8>
@trunc_packus_v16i32_v16i8(<16 x i32> %a0) { +; SSE-LABEL: trunc_packus_v16i32_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: packssdw %xmm3, %xmm2 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_packus_v16i32_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_packus_v16i32_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_packus_v16i32_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovusdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = icmp slt <16 x i32> %a0, + %2 = select <16 x i1> %1, <16 x i32> %a0, <16 x i32> + %3 = icmp sgt <16 x i32> %2, zeroinitializer + %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer + %5 = trunc <16 x i32> %4 to <16 x i8> + ret <16 x i8> %5 +} + +define <16 x i8> @trunc_packus_v16i16_v16i8(<16 x i16> %a0) { +; SSE-LABEL: trunc_packus_v16i16_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_packus_v16i16_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_packus_v16i16_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_packus_v16i16_v16i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_packus_v16i16_v16i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VL-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_packus_v16i16_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_packus_v16i16_v16i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmaxsw %ymm1, 
%ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovuswb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq + %1 = icmp slt <16 x i16> %a0, + %2 = select <16 x i1> %1, <16 x i16> %a0, <16 x i16> + %3 = icmp sgt <16 x i16> %2, zeroinitializer + %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> zeroinitializer + %5 = trunc <16 x i16> %4 to <16 x i8> + ret <16 x i8> %5 +} + +define <32 x i8> @trunc_packus_v32i16_v32i8(<32 x i16> %a0) { +; SSE-LABEL: trunc_packus_v32i16_v32i8: +; SSE: # %bb.0: +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_packus_v32i16_v32i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_packus_v32i16_v32i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_packus_v32i16_v32i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpminsw %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpminsw %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vpmaxsw %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpmaxsw %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_packus_v32i16_v32i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VL-NEXT: vpminsw %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vpminsw %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VL-NEXT: vpmaxsw %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vpmaxsw %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_packus_v32i16_v32i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovuswb %zmm0, %ymm0 +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_packus_v32i16_v32i8: +; AVX512BWVL: # %bb.0: +; 
AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpmovuswb %zmm0, %ymm0 +; AVX512BWVL-NEXT: retq + %1 = icmp slt <32 x i16> %a0, + %2 = select <32 x i1> %1, <32 x i16> %a0, <32 x i16> + %3 = icmp sgt <32 x i16> %2, zeroinitializer + %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer + %5 = trunc <32 x i16> %4 to <32 x i8> + ret <32 x i8> %5 +} diff --git a/llvm/test/CodeGen/X86/vector-trunc-ssat-widen.ll b/llvm/test/CodeGen/X86/vector-trunc-ssat-widen.ll new file mode 100644 index 000000000000..11ed156316ab --- /dev/null +++ b/llvm/test/CodeGen/X86/vector-trunc-ssat-widen.ll @@ -0,0 +1,3243 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL + +; +; Signed saturation truncation to vXi32 +; + +define <4 x i32> @trunc_ssat_v4i64_v4i32(<4 x i64> %a0) { +; SSE2-LABEL: trunc_ssat_v4i64_v4i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483647,2147483647] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [4294967295,4294967295] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm8, %xmm3 +; SSE2-NEXT: por %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm0 +; 
SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm8, %xmm4 +; SSE2-NEXT: por %xmm1, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] +; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [18446744069414584320,18446744069414584320] +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm4 +; SSE2-NEXT: pandn %xmm1, %xmm6 +; SSE2-NEXT: por %xmm4, %xmm6 +; SSE2-NEXT: pxor %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm1, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,2] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc_ssat_v4i64_v4i32: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483647,2147483647] +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: pxor %xmm2, %xmm3 +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [4294967295,4294967295] +; SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm0 +; SSSE3-NEXT: pandn %xmm8, %xmm3 +; SSSE3-NEXT: por %xmm0, %xmm3 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: pxor %xmm2, %xmm0 +; SSSE3-NEXT: movdqa %xmm5, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm0, %xmm4 +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pandn %xmm8, %xmm4 +; SSSE3-NEXT: por %xmm1, %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] +; SSSE3-NEXT: movdqa %xmm4, %xmm0 +; SSSE3-NEXT: pxor %xmm2, %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [18446744069414584320,18446744069414584320] +; SSSE3-NEXT: movdqa %xmm0, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm0, %xmm6 +; SSSE3-NEXT: pand %xmm6, %xmm4 +; SSSE3-NEXT: pandn %xmm1, %xmm6 +; SSSE3-NEXT: por %xmm4, %xmm6 +; SSSE3-NEXT: pxor %xmm3, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = 
xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: por %xmm2, %xmm0 +; SSSE3-NEXT: pand %xmm0, %xmm3 +; SSSE3-NEXT: pandn %xmm1, %xmm0 +; SSSE3-NEXT: por %xmm3, %xmm0 +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,2] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_ssat_v4i64_v4i32: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: movapd {{.*#+}} xmm4 = [2147483647,2147483647] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295] +; SSE41-NEXT: movdqa %xmm8, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm8, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm7, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm4, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm8, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm8, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm6, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 +; SSE41-NEXT: movapd {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] +; SSE41-NEXT: movapd %xmm4, %xmm0 +; SSE41-NEXT: xorpd %xmm3, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [18446744069414584320,18446744069414584320] +; SSE41-NEXT: movapd %xmm0, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm8, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm7, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: movapd %xmm1, %xmm2 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 +; SSE41-NEXT: xorpd %xmm5, %xmm3 +; SSE41-NEXT: movapd %xmm3, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm8, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE41-NEXT: pand %xmm4, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc_ssat_v4i64_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [2147483647,2147483647,2147483647,2147483647] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2147483647,2147483647] +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [18446744071562067968,18446744071562067968] +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; 
AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_ssat_v4i64_v4i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2147483647,2147483647,2147483647,2147483647] +; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 +; AVX2-SLOW-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968] +; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_ssat_v4i64_v4i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2147483647,2147483647,2147483647,2147483647] +; AVX2-FAST-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 +; AVX2-FAST-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968] +; AVX2-FAST-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovapd {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512F-LABEL: trunc_ssat_v4i64_v4i32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2147483647,2147483647,2147483647,2147483647] +; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968] +; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_ssat_v4i64_v4i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovsqd %ymm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_ssat_v4i64_v4i32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2147483647,2147483647,2147483647,2147483647] +; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968] +; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_ssat_v4i64_v4i32: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovsqd %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq + %1 = icmp slt <4 x i64> %a0, <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647> + %2 = select <4 x i1> %1, <4 x i64> %a0, <4 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647> + %3 = icmp sgt <4 x i64> %2, <i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648> + %4 = select <4 x i1> %3, <4 x i64> %2, <4 x i64> <i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648> + %5 = trunc <4 x i64> %4 to <4 x i32> + ret <4 x i32> %5 +} + + +define <8 x i32> @trunc_ssat_v8i64_v8i32(<8 x i64> %a0) { +; SSE2-LABEL: trunc_ssat_v8i64_v8i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483647,2147483647] +; SSE2-NEXT: movdqa {{.*#+}} xmm4 =
[2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [4294967295,4294967295] +; SSE2-NEXT: movdqa %xmm9, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pandn %xmm8, %xmm5 +; SSE2-NEXT: por %xmm0, %xmm5 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm9, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pandn %xmm8, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm9, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm7 +; SSE2-NEXT: pand %xmm7, %xmm2 +; SSE2-NEXT: pandn %xmm8, %xmm7 +; SSE2-NEXT: por %xmm2, %xmm7 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm9, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm3 +; SSE2-NEXT: pandn %xmm8, %xmm6 +; SSE2-NEXT: por %xmm3, %xmm6 +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [18446744071562067968,18446744071562067968] +; SSE2-NEXT: movdqa %xmm6, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744069414584320,18446744069414584320] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm6 +; SSE2-NEXT: pandn %xmm8, %xmm2 +; SSE2-NEXT: por %xmm6, %xmm2 +; SSE2-NEXT: movdqa %xmm7, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm7 +; SSE2-NEXT: pandn %xmm8, %xmm1 +; SSE2-NEXT: por %xmm7, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = 
xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm8, %xmm3 +; SSE2-NEXT: por %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm5, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm5 +; SSE2-NEXT: pandn %xmm8, %xmm0 +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc_ssat_v8i64_v8i32: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483647,2147483647] +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm0, %xmm5 +; SSSE3-NEXT: pxor %xmm4, %xmm5 +; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [4294967295,4294967295] +; SSSE3-NEXT: movdqa %xmm9, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm6, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm0 +; SSSE3-NEXT: pandn %xmm8, %xmm5 +; SSSE3-NEXT: por %xmm0, %xmm5 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: pxor %xmm4, %xmm0 +; SSSE3-NEXT: movdqa %xmm9, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm0 +; SSSE3-NEXT: pand %xmm0, %xmm1 +; SSSE3-NEXT: pandn %xmm8, %xmm0 +; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: pxor %xmm4, %xmm1 +; SSSE3-NEXT: movdqa %xmm9, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm7 +; SSSE3-NEXT: pand %xmm7, %xmm2 +; SSSE3-NEXT: pandn %xmm8, %xmm7 +; SSSE3-NEXT: por %xmm2, %xmm7 +; SSSE3-NEXT: movdqa %xmm3, %xmm1 +; SSSE3-NEXT: pxor %xmm4, %xmm1 +; SSSE3-NEXT: movdqa %xmm9, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm6 +; SSSE3-NEXT: pand %xmm6, %xmm3 +; SSSE3-NEXT: pandn %xmm8, %xmm6 +; SSSE3-NEXT: por %xmm3, %xmm6 +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [18446744071562067968,18446744071562067968] +; SSSE3-NEXT: movdqa %xmm6, %xmm1 +; SSSE3-NEXT: pxor %xmm4, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [18446744069414584320,18446744069414584320] +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm3, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm6 +; SSSE3-NEXT: pandn %xmm8, %xmm2 +; SSSE3-NEXT: por %xmm6, 
%xmm2 +; SSSE3-NEXT: movdqa %xmm7, %xmm1 +; SSSE3-NEXT: pxor %xmm4, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm6, %xmm1 +; SSSE3-NEXT: pand %xmm1, %xmm7 +; SSSE3-NEXT: pandn %xmm8, %xmm1 +; SSSE3-NEXT: por %xmm7, %xmm1 +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pxor %xmm4, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm2, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm0 +; SSSE3-NEXT: pandn %xmm8, %xmm3 +; SSSE3-NEXT: por %xmm0, %xmm3 +; SSSE3-NEXT: pxor %xmm5, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, %xmm0 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm2, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm0 +; SSSE3-NEXT: pand %xmm0, %xmm5 +; SSSE3-NEXT: pandn %xmm8, %xmm0 +; SSSE3-NEXT: por %xmm5, %xmm0 +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_ssat_v8i64_v8i32: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm9 +; SSE41-NEXT: movapd {{.*#+}} xmm7 = [2147483647,2147483647] +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [4294967295,4294967295] +; SSE41-NEXT: movdqa %xmm10, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm8, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: movapd %xmm7, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm8 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm10, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm9, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm7, %xmm9 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm9 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm10, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm4, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm7, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm10, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] 
+; SSE41-NEXT: pand %xmm2, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7 +; SSE41-NEXT: movapd {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968] +; SSE41-NEXT: movapd %xmm7, %xmm0 +; SSE41-NEXT: xorpd %xmm5, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [18446744069414584320,18446744069414584320] +; SSE41-NEXT: movapd %xmm0, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm6, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm3 +; SSE41-NEXT: movapd %xmm4, %xmm0 +; SSE41-NEXT: xorpd %xmm5, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm6, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1 +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2] +; SSE41-NEXT: movapd %xmm9, %xmm0 +; SSE41-NEXT: xorpd %xmm5, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm4, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm3 +; SSE41-NEXT: xorpd %xmm8, %xmm5 +; SSE41-NEXT: movapd %xmm5, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm10, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE41-NEXT: pand %xmm4, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm2 +; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] +; SSE41-NEXT: movaps %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc_ssat_v8i64_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovapd {{.*#+}} ymm2 = [2147483647,2147483647,2147483647,2147483647] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [2147483647,2147483647] +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX1-NEXT: vmovapd {{.*#+}} ymm2 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [18446744071562067968,18446744071562067968] +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtq 
%xmm4, %xmm1, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_ssat_v8i64_v8i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2147483647,2147483647,2147483647,2147483647] +; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 +; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 +; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968] +; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3 +; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_ssat_v8i64_v8i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2147483647,2147483647,2147483647,2147483647] +; AVX2-FAST-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968] +; AVX2-FAST-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3 +; AVX2-FAST-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_ssat_v8i64_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsqd %zmm0, %ymm0 +; AVX512-NEXT: retq + %1 = icmp slt <8 x i64> %a0, + %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> + %3 = icmp sgt <8 x i64> %2, + %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> + %5 = trunc <8 x i64> %4 to <8 x i32> + ret <8 x i32> %5 +} + +; +; Signed saturation truncation to vXi16 +; + +define <8 x i16> @trunc_ssat_v8i64_v8i16(<8 x i64> %a0) { +; SSE2-LABEL: trunc_ssat_v8i64_v8i16: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767] +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147516415,2147516415] +; SSE2-NEXT: movdqa %xmm9, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pandn %xmm8, %xmm5 +; SSE2-NEXT: por %xmm2, %xmm5 
+; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm9, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm8, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm4, %xmm3 +; SSE2-NEXT: movdqa %xmm9, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm8, %xmm3 +; SSE2-NEXT: por %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm9, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm7 +; SSE2-NEXT: pand %xmm7, %xmm1 +; SSE2-NEXT: pandn %xmm8, %xmm7 +; SSE2-NEXT: por %xmm1, %xmm7 +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709518848,18446744073709518848] +; SSE2-NEXT: movdqa %xmm7, %xmm0 +; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562035200,18446744071562035200] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm7 +; SSE2-NEXT: pandn %xmm8, %xmm1 +; SSE2-NEXT: por %xmm7, %xmm1 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm8, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: packssdw %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pandn %xmm8, %xmm3 +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: pxor %xmm5, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm5 +; SSE2-NEXT: pandn %xmm8, %xmm1 +; SSE2-NEXT: por %xmm5, %xmm1 +; 
SSE2-NEXT: packssdw %xmm3, %xmm1 +; SSE2-NEXT: packssdw %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc_ssat_v8i64_v8i16: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767] +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm2, %xmm5 +; SSSE3-NEXT: pxor %xmm4, %xmm5 +; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147516415,2147516415] +; SSSE3-NEXT: movdqa %xmm9, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm6, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm2 +; SSSE3-NEXT: pandn %xmm8, %xmm5 +; SSSE3-NEXT: por %xmm2, %xmm5 +; SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSSE3-NEXT: pxor %xmm4, %xmm2 +; SSSE3-NEXT: movdqa %xmm9, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: pandn %xmm8, %xmm2 +; SSSE3-NEXT: por %xmm3, %xmm2 +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: pxor %xmm4, %xmm3 +; SSSE3-NEXT: movdqa %xmm9, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm0 +; SSSE3-NEXT: pandn %xmm8, %xmm3 +; SSSE3-NEXT: por %xmm0, %xmm3 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: pxor %xmm4, %xmm0 +; SSSE3-NEXT: movdqa %xmm9, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm0, %xmm7 +; SSSE3-NEXT: pand %xmm7, %xmm1 +; SSSE3-NEXT: pandn %xmm8, %xmm7 +; SSSE3-NEXT: por %xmm1, %xmm7 +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709518848,18446744073709518848] +; SSSE3-NEXT: movdqa %xmm7, %xmm0 +; SSSE3-NEXT: pxor %xmm4, %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562035200,18446744071562035200] +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: por %xmm0, %xmm1 +; SSSE3-NEXT: pand %xmm1, %xmm7 +; SSSE3-NEXT: pandn %xmm8, %xmm1 +; SSSE3-NEXT: por %xmm7, %xmm1 +; SSSE3-NEXT: movdqa %xmm3, %xmm0 +; SSSE3-NEXT: pxor %xmm4, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm0 +; SSSE3-NEXT: pand %xmm0, %xmm3 +; SSSE3-NEXT: pandn %xmm8, %xmm0 +; SSSE3-NEXT: por %xmm3, %xmm0 +; SSSE3-NEXT: packssdw %xmm1, %xmm0 +; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: 
pxor %xmm4, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm2 +; SSSE3-NEXT: pandn %xmm8, %xmm3 +; SSSE3-NEXT: por %xmm2, %xmm3 +; SSSE3-NEXT: pxor %xmm5, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm2, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm1 +; SSSE3-NEXT: pand %xmm1, %xmm5 +; SSSE3-NEXT: pandn %xmm8, %xmm1 +; SSSE3-NEXT: por %xmm5, %xmm1 +; SSSE3-NEXT: packssdw %xmm3, %xmm1 +; SSSE3-NEXT: packssdw %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_ssat_v8i64_v8i16: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm10 +; SSE41-NEXT: movapd {{.*#+}} xmm7 = [32767,32767] +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm11 = [2147516415,2147516415] +; SSE41-NEXT: movdqa %xmm11, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm11, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm8, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: movapd %xmm7, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm8 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm11, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm11, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm4, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm7, %xmm9 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm9 +; SSE41-NEXT: movdqa %xmm10, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm11, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm11, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm3, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: movapd %xmm7, %xmm2 +; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm2 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm11, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm11, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm4, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7 +; SSE41-NEXT: movapd {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848] +; SSE41-NEXT: movapd %xmm7, %xmm0 +; SSE41-NEXT: xorpd %xmm5, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [18446744071562035200,18446744071562035200] +; SSE41-NEXT: movapd %xmm0, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 +; 
SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm6, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: movapd %xmm3, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm4 +; SSE41-NEXT: movapd %xmm2, %xmm0 +; SSE41-NEXT: xorpd %xmm5, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm6, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: movapd %xmm3, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: packssdw %xmm4, %xmm1 +; SSE41-NEXT: movapd %xmm9, %xmm0 +; SSE41-NEXT: xorpd %xmm5, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm4, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm3, %xmm2 +; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm2 +; SSE41-NEXT: xorpd %xmm8, %xmm5 +; SSE41-NEXT: movapd %xmm5, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm10, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE41-NEXT: pand %xmm4, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm3 +; SSE41-NEXT: packssdw %xmm2, %xmm3 +; SSE41-NEXT: packssdw %xmm3, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc_ssat_v8i64_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovapd {{.*#+}} ymm2 = [32767,32767,32767,32767] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32767,32767] +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX1-NEXT: vmovapd {{.*#+}} ymm2 = [18446744073709518848,18446744073709518848,18446744073709518848,18446744073709518848] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [18446744073709518848,18446744073709518848] +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_ssat_v8i64_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [32767,32767,32767,32767] +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm0, 
%ymm2, %ymm0 +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744073709518848,18446744073709518848,18446744073709518848,18446744073709518848] +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_ssat_v8i64_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsqw %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = icmp slt <8 x i64> %a0, + %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> + %3 = icmp sgt <8 x i64> %2, + %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> + %5 = trunc <8 x i64> %4 to <8 x i16> + ret <8 x i16> %5 +} + +define <8 x i16> @trunc_ssat_v8i32_v8i16(<8 x i32> %a0) { +; SSE-LABEL: trunc_ssat_v8i32_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_ssat_v8i32_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_ssat_v8i32_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_ssat_v8i32_v8i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [32767,32767,32767,32767,32767,32767,32767,32767] +; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528] +; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_ssat_v8i32_v8i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovsdw %ymm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_ssat_v8i32_v8i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [32767,32767,32767,32767,32767,32767,32767,32767] +; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528] +; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_ssat_v8i32_v8i16: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovsdw %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq + %1 = icmp slt <8 x i32> %a0, + %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> + %3 = icmp sgt <8 x i32> %2, + %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> + %5 = trunc <8 x i32> %4 to <8 x i16> + ret <8 x i16> %5 +} + +define <16 x i16> @trunc_ssat_v16i32_v16i16(<16 x i32> %a0) { +; SSE-LABEL: trunc_ssat_v16i32_v16i16: +; SSE: # %bb.0: +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: packssdw %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_ssat_v16i32_v16i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, 
%ymm1, %xmm2 +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_ssat_v16i32_v16i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_ssat_v16i32_v16i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsdw %zmm0, %ymm0 +; AVX512-NEXT: retq + %1 = icmp slt <16 x i32> %a0, + %2 = select <16 x i1> %1, <16 x i32> %a0, <16 x i32> + %3 = icmp sgt <16 x i32> %2, + %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> + %5 = trunc <16 x i32> %4 to <16 x i16> + ret <16 x i16> %5 +} + +; +; Signed saturation truncation to v16i8 +; + +define <8 x i8> @trunc_ssat_v8i64_v8i8(<8 x i64> %a0) { +; SSE2-LABEL: trunc_ssat_v8i64_v8i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [127,127] +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483775,2147483775] +; SSE2-NEXT: movdqa %xmm9, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm3 +; SSE2-NEXT: pandn %xmm8, %xmm5 +; SSE2-NEXT: por %xmm3, %xmm5 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pxor %xmm4, %xmm3 +; SSE2-NEXT: movdqa %xmm9, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pandn %xmm8, %xmm3 +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm9, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm8, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm9, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm7 +; SSE2-NEXT: pand %xmm7, %xmm0 +; SSE2-NEXT: pandn %xmm8, %xmm7 +; SSE2-NEXT: por %xmm0, %xmm7 +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709551488,18446744073709551488] +; SSE2-NEXT: movdqa %xmm7, %xmm0 +; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562067840,18446744071562067840] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE2-NEXT: por 
%xmm6, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm7 +; SSE2-NEXT: pandn %xmm8, %xmm0 +; SSE2-NEXT: por %xmm7, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm7 +; SSE2-NEXT: pand %xmm7, %xmm2 +; SSE2-NEXT: pandn %xmm8, %xmm7 +; SSE2-NEXT: por %xmm2, %xmm7 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm8, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: pxor %xmm5, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm3, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm5 +; SSE2-NEXT: pandn %xmm8, %xmm1 +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: packuswb %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm3, %xmm7 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: packuswb %xmm7, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc_ssat_v8i64_v8i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [127,127] +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm3, %xmm5 +; SSSE3-NEXT: pxor %xmm4, %xmm5 +; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483775,2147483775] +; SSSE3-NEXT: movdqa %xmm9, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm6, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm3 +; SSSE3-NEXT: pandn %xmm8, %xmm5 +; SSSE3-NEXT: por %xmm3, %xmm5 +; SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSSE3-NEXT: pxor %xmm4, %xmm3 +; SSSE3-NEXT: movdqa %xmm9, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm2 +; SSSE3-NEXT: pandn %xmm8, %xmm3 +; SSSE3-NEXT: por %xmm2, %xmm3 +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pxor %xmm4, %xmm2 +; SSSE3-NEXT: movdqa %xmm9, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm1 +; 
SSSE3-NEXT: pandn %xmm8, %xmm2 +; SSSE3-NEXT: por %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm4, %xmm1 +; SSSE3-NEXT: movdqa %xmm9, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm7 +; SSSE3-NEXT: pand %xmm7, %xmm0 +; SSSE3-NEXT: pandn %xmm8, %xmm7 +; SSSE3-NEXT: por %xmm0, %xmm7 +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709551488,18446744073709551488] +; SSSE3-NEXT: movdqa %xmm7, %xmm0 +; SSSE3-NEXT: pxor %xmm4, %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562067840,18446744071562067840] +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSSE3-NEXT: por %xmm6, %xmm0 +; SSSE3-NEXT: pand %xmm0, %xmm7 +; SSSE3-NEXT: pandn %xmm8, %xmm0 +; SSSE3-NEXT: por %xmm7, %xmm0 +; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: pxor %xmm4, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm7 +; SSSE3-NEXT: pand %xmm7, %xmm2 +; SSSE3-NEXT: pandn %xmm8, %xmm7 +; SSSE3-NEXT: por %xmm2, %xmm7 +; SSSE3-NEXT: movdqa %xmm3, %xmm1 +; SSSE3-NEXT: pxor %xmm4, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: pandn %xmm8, %xmm2 +; SSSE3-NEXT: por %xmm3, %xmm2 +; SSSE3-NEXT: pxor %xmm5, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm3, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm1 +; SSSE3-NEXT: pand %xmm1, %xmm5 +; SSSE3-NEXT: pandn %xmm8, %xmm1 +; SSSE3-NEXT: por %xmm5, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSSE3-NEXT: pand %xmm3, %xmm1 +; SSSE3-NEXT: pand %xmm3, %xmm2 +; SSSE3-NEXT: packuswb %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm3, %xmm7 +; SSSE3-NEXT: pand %xmm3, %xmm0 +; SSSE3-NEXT: packuswb %xmm7, %xmm0 +; SSSE3-NEXT: packuswb %xmm2, %xmm0 +; SSSE3-NEXT: packuswb %xmm0, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_ssat_v8i64_v8i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm8 +; SSE41-NEXT: movapd {{.*#+}} xmm7 = [127,127] +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [2147483775,2147483775] +; SSE41-NEXT: movdqa %xmm10, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 +; 
SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm9, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: movapd %xmm7, %xmm9 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm9 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm10, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm4, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm7, %xmm11 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm11 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm10, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm4, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm7, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6 +; SSE41-NEXT: movdqa %xmm8, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm10, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm2, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm7 +; SSE41-NEXT: movapd {{.*#+}} xmm2 = [18446744073709551488,18446744073709551488] +; SSE41-NEXT: movapd %xmm7, %xmm0 +; SSE41-NEXT: xorpd %xmm5, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [18446744071562067840,18446744071562067840] +; SSE41-NEXT: movapd %xmm0, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm8, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm3, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 +; SSE41-NEXT: movapd %xmm6, %xmm0 +; SSE41-NEXT: xorpd %xmm5, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm8, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm4, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm7 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7 +; SSE41-NEXT: movapd %xmm11, %xmm0 +; SSE41-NEXT: xorpd %xmm5, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm8, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm4, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm3 +; SSE41-NEXT: xorpd %xmm9, %xmm5 +; SSE41-NEXT: movapd %xmm5, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm8, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE41-NEXT: pand %xmm4, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = 
xmm0[1,1,3,3] +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm2 +; SSE41-NEXT: movapd {{.*#+}} xmm0 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSE41-NEXT: andpd %xmm0, %xmm2 +; SSE41-NEXT: andpd %xmm0, %xmm3 +; SSE41-NEXT: packusdw %xmm2, %xmm3 +; SSE41-NEXT: andpd %xmm0, %xmm7 +; SSE41-NEXT: andpd %xmm0, %xmm1 +; SSE41-NEXT: packusdw %xmm7, %xmm1 +; SSE41-NEXT: packusdw %xmm3, %xmm1 +; SSE41-NEXT: packuswb %xmm1, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc_ssat_v8i64_v8i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovapd {{.*#+}} ymm2 = [127,127,127,127] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [127,127] +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX1-NEXT: vmovapd {{.*#+}} ymm2 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [18446744073709551488,18446744073709551488] +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [1.2598673968951787E-321,1.2598673968951787E-321] +; AVX1-NEXT: # xmm3 = mem[0,0] +; AVX1-NEXT: vandpd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandpd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vandpd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandpd %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_ssat_v8i64_v8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [127,127,127,127] +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488] +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_ssat_v8i64_v8i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512F-NEXT: vpmaxsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-NEXT: vmovq %xmm0, %rcx +; AVX512F-NEXT: vmovd %ecx, %xmm1 +; AVX512F-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-NEXT: vmovq %xmm2, %rax +; AVX512F-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: vpextrq $1, %xmm2, %rax +; AVX512F-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm2 +; AVX512F-NEXT: vmovq %xmm2, %rax +; AVX512F-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: vpextrq $1, %xmm2, %rax +; AVX512F-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-NEXT: vpinsrb $7, %eax, %xmm1, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_ssat_v8i64_v8i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512VL-NEXT: vpmaxsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax +; AVX512VL-NEXT: vmovq %xmm0, %rcx +; AVX512VL-NEXT: vmovd %ecx, %xmm1 +; AVX512VL-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512VL-NEXT: vmovq %xmm2, %rax +; AVX512VL-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX512VL-NEXT: vpextrq $1, %xmm2, %rax +; AVX512VL-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; AVX512VL-NEXT: vextracti32x4 $2, %zmm0, %xmm2 +; AVX512VL-NEXT: vmovq %xmm2, %rax +; AVX512VL-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX512VL-NEXT: vpextrq $1, %xmm2, %rax +; AVX512VL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX512VL-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax +; AVX512VL-NEXT: vpinsrb $7, %eax, %xmm1, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_ssat_v8i64_v8i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmaxsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_ssat_v8i64_v8i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpmaxsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, 
%xmm2 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq + %1 = icmp slt <8 x i64> %a0, + %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> + %3 = icmp sgt <8 x i64> %2, + %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> + %5 = trunc <8 x i64> %4 to <8 x i8> + ret <8 x i8> %5 +} + +define void @trunc_ssat_v8i64_v8i8_store(<8 x i64> %a0, <8 x i8> *%p1) { +; SSE2-LABEL: trunc_ssat_v8i64_v8i8_store: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [127,127] +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483775,2147483775] +; SSE2-NEXT: movdqa %xmm9, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm3 +; SSE2-NEXT: pandn %xmm8, %xmm5 +; SSE2-NEXT: por %xmm3, %xmm5 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pxor %xmm4, %xmm3 +; SSE2-NEXT: movdqa %xmm9, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pandn %xmm8, %xmm3 +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm9, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm8, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm9, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm7 +; SSE2-NEXT: pand %xmm7, %xmm0 +; SSE2-NEXT: pandn %xmm8, %xmm7 +; SSE2-NEXT: por %xmm0, %xmm7 +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709551488,18446744073709551488] +; SSE2-NEXT: movdqa %xmm7, %xmm0 +; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562067840,18446744071562067840] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2] +; SSE2-NEXT: pcmpeqd 
%xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm7 +; SSE2-NEXT: pandn %xmm8, %xmm0 +; SSE2-NEXT: por %xmm7, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm7 +; SSE2-NEXT: pand %xmm7, %xmm2 +; SSE2-NEXT: pandn %xmm8, %xmm7 +; SSE2-NEXT: por %xmm2, %xmm7 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm8, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: pxor %xmm5, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm3, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm5 +; SSE2-NEXT: pandn %xmm8, %xmm1 +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: packuswb %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm3, %xmm7 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: packuswb %xmm7, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: movq %xmm0, (%rdi) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc_ssat_v8i64_v8i8_store: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [127,127] +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm3, %xmm5 +; SSSE3-NEXT: pxor %xmm4, %xmm5 +; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483775,2147483775] +; SSSE3-NEXT: movdqa %xmm9, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm6, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm3 +; SSSE3-NEXT: pandn %xmm8, %xmm5 +; SSSE3-NEXT: por %xmm3, %xmm5 +; SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSSE3-NEXT: pxor %xmm4, %xmm3 +; SSSE3-NEXT: movdqa %xmm9, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm2 +; SSSE3-NEXT: pandn %xmm8, %xmm3 +; SSSE3-NEXT: por %xmm2, %xmm3 +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pxor %xmm4, %xmm2 +; SSSE3-NEXT: movdqa %xmm9, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2 +; 
SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: pandn %xmm8, %xmm2 +; SSSE3-NEXT: por %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm4, %xmm1 +; SSSE3-NEXT: movdqa %xmm9, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm7 +; SSSE3-NEXT: pand %xmm7, %xmm0 +; SSSE3-NEXT: pandn %xmm8, %xmm7 +; SSSE3-NEXT: por %xmm0, %xmm7 +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709551488,18446744073709551488] +; SSSE3-NEXT: movdqa %xmm7, %xmm0 +; SSSE3-NEXT: pxor %xmm4, %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562067840,18446744071562067840] +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSSE3-NEXT: por %xmm6, %xmm0 +; SSSE3-NEXT: pand %xmm0, %xmm7 +; SSSE3-NEXT: pandn %xmm8, %xmm0 +; SSSE3-NEXT: por %xmm7, %xmm0 +; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: pxor %xmm4, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm7 +; SSSE3-NEXT: pand %xmm7, %xmm2 +; SSSE3-NEXT: pandn %xmm8, %xmm7 +; SSSE3-NEXT: por %xmm2, %xmm7 +; SSSE3-NEXT: movdqa %xmm3, %xmm1 +; SSSE3-NEXT: pxor %xmm4, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: pandn %xmm8, %xmm2 +; SSSE3-NEXT: por %xmm3, %xmm2 +; SSSE3-NEXT: pxor %xmm5, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm3, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm1 +; SSSE3-NEXT: pand %xmm1, %xmm5 +; SSSE3-NEXT: pandn %xmm8, %xmm1 +; SSSE3-NEXT: por %xmm5, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSSE3-NEXT: pand %xmm3, %xmm1 +; SSSE3-NEXT: pand %xmm3, %xmm2 +; SSSE3-NEXT: packuswb %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm3, %xmm7 +; SSSE3-NEXT: pand %xmm3, %xmm0 +; SSSE3-NEXT: packuswb %xmm7, %xmm0 +; SSSE3-NEXT: packuswb %xmm2, %xmm0 +; SSSE3-NEXT: packuswb %xmm0, %xmm0 +; SSSE3-NEXT: movq %xmm0, (%rdi) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_ssat_v8i64_v8i8_store: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm8 +; SSE41-NEXT: movapd {{.*#+}} xmm7 = [127,127] +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pxor 
%xmm5, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [2147483775,2147483775] +; SSE41-NEXT: movdqa %xmm10, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm9, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: movapd %xmm7, %xmm9 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm9 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm10, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm4, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm7, %xmm11 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm11 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm10, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm4, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm7, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6 +; SSE41-NEXT: movdqa %xmm8, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm10, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm2, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm7 +; SSE41-NEXT: movapd {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] +; SSE41-NEXT: movapd %xmm7, %xmm0 +; SSE41-NEXT: xorpd %xmm5, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [18446744071562067840,18446744071562067840] +; SSE41-NEXT: movapd %xmm0, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm8, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm3, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: movapd %xmm1, %xmm2 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm2 +; SSE41-NEXT: movapd %xmm6, %xmm0 +; SSE41-NEXT: xorpd %xmm5, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm8, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm4, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: movapd %xmm1, %xmm7 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7 +; SSE41-NEXT: movapd %xmm11, %xmm0 +; SSE41-NEXT: xorpd %xmm5, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm8, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm4, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm1, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm3 +; SSE41-NEXT: xorpd %xmm9, %xmm5 +; SSE41-NEXT: movapd %xmm5, %xmm0 +; 
SSE41-NEXT: pcmpgtd %xmm8, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm8, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE41-NEXT: pand %xmm4, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm1 +; SSE41-NEXT: movapd {{.*#+}} xmm0 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSE41-NEXT: andpd %xmm0, %xmm1 +; SSE41-NEXT: andpd %xmm0, %xmm3 +; SSE41-NEXT: packusdw %xmm1, %xmm3 +; SSE41-NEXT: andpd %xmm0, %xmm7 +; SSE41-NEXT: andpd %xmm0, %xmm2 +; SSE41-NEXT: packusdw %xmm7, %xmm2 +; SSE41-NEXT: packusdw %xmm3, %xmm2 +; SSE41-NEXT: packuswb %xmm2, %xmm2 +; SSE41-NEXT: movq %xmm2, (%rdi) +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc_ssat_v8i64_v8i8_store: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovapd {{.*#+}} ymm2 = [127,127,127,127] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [127,127] +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX1-NEXT: vmovapd {{.*#+}} ymm2 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [18446744073709551488,18446744073709551488] +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [1.2598673968951787E-321,1.2598673968951787E-321] +; AVX1-NEXT: # xmm3 = mem[0,0] +; AVX1-NEXT: vandpd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandpd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vandpd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandpd %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, (%rdi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_ssat_v8i64_v8i8_store: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [127,127,127,127] +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488] +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = 
xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2-NEXT: vmovq %xmm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_ssat_v8i64_v8i8_store: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsqb %zmm0, (%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = icmp slt <8 x i64> %a0, + %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> + %3 = icmp sgt <8 x i64> %2, + %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> + %5 = trunc <8 x i64> %4 to <8 x i8> + store <8 x i8> %5, <8 x i8> *%p1 + ret void +} + +define <16 x i8> @trunc_ssat_v16i64_v16i8(<16 x i64> %a0) { +; SSE2-LABEL: trunc_ssat_v16i64_v16i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [127,127] +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm6, %xmm9 +; SSE2-NEXT: pxor %xmm8, %xmm9 +; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147483775,2147483775] +; SSE2-NEXT: movdqa %xmm11, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm9[1,1,3,3] +; SSE2-NEXT: pand %xmm13, %xmm14 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm12[1,1,3,3] +; SSE2-NEXT: por %xmm14, %xmm9 +; SSE2-NEXT: pand %xmm9, %xmm6 +; SSE2-NEXT: pandn %xmm10, %xmm9 +; SSE2-NEXT: por %xmm6, %xmm9 +; SSE2-NEXT: movdqa %xmm7, %xmm6 +; SSE2-NEXT: pxor %xmm8, %xmm6 +; SSE2-NEXT: movdqa %xmm11, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: pand %xmm13, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm12[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm12 +; SSE2-NEXT: pand %xmm12, %xmm7 +; SSE2-NEXT: pandn %xmm10, %xmm12 +; SSE2-NEXT: por %xmm7, %xmm12 +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: pxor %xmm8, %xmm6 +; SSE2-NEXT: movdqa %xmm11, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: pand %xmm13, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm13 +; SSE2-NEXT: pand %xmm13, %xmm4 +; SSE2-NEXT: pandn %xmm10, %xmm13 +; SSE2-NEXT: por %xmm4, %xmm13 +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: movdqa %xmm11, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm14 +; SSE2-NEXT: pand %xmm14, %xmm5 +; SSE2-NEXT: pandn %xmm10, %xmm14 +; SSE2-NEXT: por %xmm5, %xmm14 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: movdqa %xmm11, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: 
por %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pandn %xmm10, %xmm5 +; SSE2-NEXT: por %xmm2, %xmm5 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pxor %xmm8, %xmm2 +; SSE2-NEXT: movdqa %xmm11, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm3 +; SSE2-NEXT: pandn %xmm10, %xmm6 +; SSE2-NEXT: por %xmm3, %xmm6 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pxor %xmm8, %xmm2 +; SSE2-NEXT: movdqa %xmm11, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm10, %xmm3 +; SSE2-NEXT: por %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: movdqa %xmm11, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm10, %xmm4 +; SSE2-NEXT: por %xmm1, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [18446744073709551488,18446744073709551488] +; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [18446744071562067840,18446744071562067840] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm11, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm4 +; SSE2-NEXT: pandn %xmm10, %xmm1 +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm11, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm10, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: packssdw %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm6, %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm11, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm6 +; SSE2-NEXT: pandn %xmm10, %xmm2 +; SSE2-NEXT: por %xmm6, %xmm2 +; SSE2-NEXT: movdqa %xmm5, %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm11, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pshufd 
{{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm5 +; SSE2-NEXT: pandn %xmm10, %xmm3 +; SSE2-NEXT: por %xmm5, %xmm3 +; SSE2-NEXT: packssdw %xmm2, %xmm3 +; SSE2-NEXT: packssdw %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm14, %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm11, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm14 +; SSE2-NEXT: pandn %xmm10, %xmm2 +; SSE2-NEXT: por %xmm14, %xmm2 +; SSE2-NEXT: movdqa %xmm13, %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm11, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm13 +; SSE2-NEXT: pandn %xmm10, %xmm3 +; SSE2-NEXT: por %xmm13, %xmm3 +; SSE2-NEXT: packssdw %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm12, %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm11, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm12 +; SSE2-NEXT: pandn %xmm10, %xmm2 +; SSE2-NEXT: por %xmm12, %xmm2 +; SSE2-NEXT: pxor %xmm9, %xmm8 +; SSE2-NEXT: movdqa %xmm8, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm11, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm11, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm9 +; SSE2-NEXT: pandn %xmm10, %xmm1 +; SSE2-NEXT: por %xmm9, %xmm1 +; SSE2-NEXT: packssdw %xmm2, %xmm1 +; SSE2-NEXT: packssdw %xmm1, %xmm3 +; SSE2-NEXT: packsswb %xmm3, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc_ssat_v16i64_v16i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [127,127] +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm6, %xmm9 +; SSSE3-NEXT: pxor %xmm8, %xmm9 +; SSSE3-NEXT: movdqa {{.*#+}} xmm11 = [2147483775,2147483775] +; SSSE3-NEXT: movdqa %xmm11, %xmm12 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm12 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm11, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm9[1,1,3,3] +; SSSE3-NEXT: pand %xmm13, %xmm14 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm12[1,1,3,3] +; SSSE3-NEXT: por %xmm14, %xmm9 +; SSSE3-NEXT: pand %xmm9, %xmm6 +; SSSE3-NEXT: pandn %xmm10, %xmm9 +; SSSE3-NEXT: por %xmm6, %xmm9 +; SSSE3-NEXT: movdqa %xmm7, %xmm6 +; SSSE3-NEXT: pxor %xmm8, %xmm6 +; SSSE3-NEXT: movdqa %xmm11, %xmm12 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm12 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm11, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSSE3-NEXT: pand %xmm13, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm12[1,1,3,3] +; SSSE3-NEXT: por %xmm6, %xmm12 +; SSSE3-NEXT: pand %xmm12, %xmm7 +; SSSE3-NEXT: pandn %xmm10, %xmm12 +; SSSE3-NEXT: por %xmm7, 
%xmm12 +; SSSE3-NEXT: movdqa %xmm4, %xmm6 +; SSSE3-NEXT: pxor %xmm8, %xmm6 +; SSSE3-NEXT: movdqa %xmm11, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm7[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm11, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSSE3-NEXT: pand %xmm13, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm6, %xmm13 +; SSSE3-NEXT: pand %xmm13, %xmm4 +; SSSE3-NEXT: pandn %xmm10, %xmm13 +; SSSE3-NEXT: por %xmm4, %xmm13 +; SSSE3-NEXT: movdqa %xmm5, %xmm4 +; SSSE3-NEXT: pxor %xmm8, %xmm4 +; SSSE3-NEXT: movdqa %xmm11, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm11, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm14 +; SSSE3-NEXT: pand %xmm14, %xmm5 +; SSSE3-NEXT: pandn %xmm10, %xmm14 +; SSSE3-NEXT: por %xmm5, %xmm14 +; SSSE3-NEXT: movdqa %xmm2, %xmm4 +; SSSE3-NEXT: pxor %xmm8, %xmm4 +; SSSE3-NEXT: movdqa %xmm11, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm11, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm2 +; SSSE3-NEXT: pandn %xmm10, %xmm5 +; SSSE3-NEXT: por %xmm2, %xmm5 +; SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSSE3-NEXT: pxor %xmm8, %xmm2 +; SSSE3-NEXT: movdqa %xmm11, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm11, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm2, %xmm6 +; SSSE3-NEXT: pand %xmm6, %xmm3 +; SSSE3-NEXT: pandn %xmm10, %xmm6 +; SSSE3-NEXT: por %xmm3, %xmm6 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pxor %xmm8, %xmm2 +; SSSE3-NEXT: movdqa %xmm11, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm11, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm2, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm0 +; SSSE3-NEXT: pandn %xmm10, %xmm3 +; SSSE3-NEXT: por %xmm0, %xmm3 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: pxor %xmm8, %xmm0 +; SSSE3-NEXT: movdqa %xmm11, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; SSSE3-NEXT: por %xmm0, %xmm4 +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pandn %xmm10, %xmm4 +; SSSE3-NEXT: por %xmm1, %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [18446744073709551488,18446744073709551488] +; SSSE3-NEXT: movdqa %xmm4, %xmm0 +; SSSE3-NEXT: pxor %xmm8, %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm11 = [18446744071562067840,18446744071562067840] +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm11, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: 
por %xmm0, %xmm1 +; SSSE3-NEXT: pand %xmm1, %xmm4 +; SSSE3-NEXT: pandn %xmm10, %xmm1 +; SSSE3-NEXT: por %xmm4, %xmm1 +; SSSE3-NEXT: movdqa %xmm3, %xmm0 +; SSSE3-NEXT: pxor %xmm8, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm11, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm0 +; SSSE3-NEXT: pand %xmm0, %xmm3 +; SSSE3-NEXT: pandn %xmm10, %xmm0 +; SSSE3-NEXT: por %xmm3, %xmm0 +; SSSE3-NEXT: packssdw %xmm1, %xmm0 +; SSSE3-NEXT: movdqa %xmm6, %xmm1 +; SSSE3-NEXT: pxor %xmm8, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm11, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm11, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm3, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm6 +; SSSE3-NEXT: pandn %xmm10, %xmm2 +; SSSE3-NEXT: por %xmm6, %xmm2 +; SSSE3-NEXT: movdqa %xmm5, %xmm1 +; SSSE3-NEXT: pxor %xmm8, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm11, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm11, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm5 +; SSSE3-NEXT: pandn %xmm10, %xmm3 +; SSSE3-NEXT: por %xmm5, %xmm3 +; SSSE3-NEXT: packssdw %xmm2, %xmm3 +; SSSE3-NEXT: packssdw %xmm3, %xmm0 +; SSSE3-NEXT: movdqa %xmm14, %xmm1 +; SSSE3-NEXT: pxor %xmm8, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm11, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm11, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm3, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm14 +; SSSE3-NEXT: pandn %xmm10, %xmm2 +; SSSE3-NEXT: por %xmm14, %xmm2 +; SSSE3-NEXT: movdqa %xmm13, %xmm1 +; SSSE3-NEXT: pxor %xmm8, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm11, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm11, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm13 +; SSSE3-NEXT: pandn %xmm10, %xmm3 +; SSSE3-NEXT: por %xmm13, %xmm3 +; SSSE3-NEXT: packssdw %xmm2, %xmm3 +; SSSE3-NEXT: movdqa %xmm12, %xmm1 +; SSSE3-NEXT: pxor %xmm8, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm11, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm11, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm12 +; SSSE3-NEXT: pandn %xmm10, %xmm2 +; SSSE3-NEXT: por %xmm12, %xmm2 +; SSSE3-NEXT: pxor %xmm9, %xmm8 +; SSSE3-NEXT: movdqa %xmm8, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm11, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm11, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm5 +; 
SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: por %xmm5, %xmm1 +; SSSE3-NEXT: pand %xmm1, %xmm9 +; SSSE3-NEXT: pandn %xmm10, %xmm1 +; SSSE3-NEXT: por %xmm9, %xmm1 +; SSSE3-NEXT: packssdw %xmm2, %xmm1 +; SSSE3-NEXT: packssdw %xmm1, %xmm3 +; SSSE3-NEXT: packsswb %xmm3, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_ssat_v16i64_v16i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm8 +; SSE41-NEXT: movapd {{.*#+}} xmm11 = [127,127] +; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648] +; SSE41-NEXT: movdqa %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm12 = [2147483775,2147483775] +; SSE41-NEXT: movdqa %xmm12, %xmm10 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm10[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm12, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm13, %xmm14 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3] +; SSE41-NEXT: por %xmm14, %xmm0 +; SSE41-NEXT: movapd %xmm11, %xmm10 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm10 +; SSE41-NEXT: movdqa %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm12, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm6[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm12, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm13, %xmm14 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSE41-NEXT: por %xmm14, %xmm0 +; SSE41-NEXT: movapd %xmm11, %xmm13 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm13 +; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm12, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm6[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm12, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm14, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: movapd %xmm11, %xmm14 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm14 +; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm12, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm12, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm6, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: movapd %xmm11, %xmm15 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm15 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm12, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm12, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm6, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: movapd %xmm11, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm12, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm12, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm6, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: movapd %xmm11, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm6 +; SSE41-NEXT: movdqa %xmm8, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 +; 
SSE41-NEXT: movdqa %xmm12, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm12, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm3, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: movapd %xmm11, %xmm7 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm7 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm12, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm12, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm3, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm11 +; SSE41-NEXT: movapd {{.*#+}} xmm2 = [18446744073709551488,18446744073709551488] +; SSE41-NEXT: movapd %xmm11, %xmm0 +; SSE41-NEXT: xorpd %xmm9, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [18446744071562067840,18446744071562067840] +; SSE41-NEXT: movapd %xmm0, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm8, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm4, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm3 +; SSE41-NEXT: movapd %xmm7, %xmm0 +; SSE41-NEXT: xorpd %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm8, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm11, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 +; SSE41-NEXT: packssdw %xmm3, %xmm1 +; SSE41-NEXT: movapd %xmm6, %xmm0 +; SSE41-NEXT: xorpd %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm8, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm4, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm3 +; SSE41-NEXT: movapd %xmm5, %xmm0 +; SSE41-NEXT: xorpd %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm8, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm6, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4 +; SSE41-NEXT: packssdw %xmm3, %xmm4 +; SSE41-NEXT: packssdw %xmm4, %xmm1 +; SSE41-NEXT: movapd %xmm15, %xmm0 +; SSE41-NEXT: xorpd %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm8, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm4, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm15, %xmm3 +; SSE41-NEXT: movapd %xmm14, %xmm0 +; 
SSE41-NEXT: xorpd %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm8, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm14, %xmm4 +; SSE41-NEXT: packssdw %xmm3, %xmm4 +; SSE41-NEXT: movapd %xmm13, %xmm0 +; SSE41-NEXT: xorpd %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm0, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm8, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm13, %xmm3 +; SSE41-NEXT: xorpd %xmm10, %xmm9 +; SSE41-NEXT: movapd %xmm9, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm8, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm9[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm2 +; SSE41-NEXT: packssdw %xmm3, %xmm2 +; SSE41-NEXT: packssdw %xmm2, %xmm4 +; SSE41-NEXT: packsswb %xmm4, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc_ssat_v16i64_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovapd {{.*#+}} ymm4 = [127,127,127,127] +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [127,127] +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm6, %xmm7 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5 +; AVX1-NEXT: vblendvpd %ymm5, %ymm3, %ymm4, %ymm3 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm6, %xmm7 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5 +; AVX1-NEXT: vblendvpd %ymm5, %ymm2, %ymm4, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm6, %xmm7 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5 +; AVX1-NEXT: vblendvpd %ymm5, %ymm1, %ymm4, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm6, %xmm6 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX1-NEXT: vblendvpd %ymm5, %ymm0, %ymm4, %ymm0 +; AVX1-NEXT: vmovapd {{.*#+}} ymm4 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [18446744073709551488,18446744073709551488] +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm0, %xmm7 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5 +; AVX1-NEXT: vblendvpd %ymm5, %ymm0, %ymm4, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm1, %xmm7 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5 +; AVX1-NEXT: vblendvpd %ymm5, %ymm1, %ymm4, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm2, %xmm7 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5 +; AVX1-NEXT: vblendvpd %ymm5, %ymm2, %ymm4, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm3, 
%xmm5 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm6 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX1-NEXT: vblendvpd %ymm5, %ymm3, %ymm4, %ymm3 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vpackssdw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackssdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_ssat_v16i64_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [127,127,127,127] +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm4, %ymm5 +; AVX2-NEXT: vblendvpd %ymm5, %ymm2, %ymm4, %ymm2 +; AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm5 +; AVX2-NEXT: vblendvpd %ymm5, %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm4, %ymm5 +; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm4, %ymm0 +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm4, %ymm5 +; AVX2-NEXT: vblendvpd %ymm5, %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488] +; AVX2-NEXT: vpcmpgtq %ymm4, %ymm1, %ymm5 +; AVX2-NEXT: vblendvpd %ymm5, %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vpcmpgtq %ymm4, %ymm0, %ymm5 +; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm4, %ymm0 +; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm1 +; AVX2-NEXT: vblendvpd %ymm1, %ymm3, %ymm4, %ymm1 +; AVX2-NEXT: vpcmpgtq %ymm4, %ymm2, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm2, %ymm4, %ymm2 +; AVX2-NEXT: vpackssdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_ssat_v16i64_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [127,127,127,127,127,127,127,127] +; AVX512-NEXT: vpminsq %zmm2, %zmm0, %zmm0 +; AVX512-NEXT: vpminsq %zmm2, %zmm1, %zmm1 +; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488] +; AVX512-NEXT: vpmaxsq %zmm2, %zmm1, %zmm1 +; AVX512-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = icmp slt <16 x i64> %a0, + %2 = select <16 x i1> %1, <16 x i64> %a0, <16 x i64> + %3 = icmp sgt <16 x i64> %2, + %4 = select <16 x i1> %3, <16 x i64> %2, <16 x i64> + %5 = trunc <16 x i64> %4 to <16 x i8> + ret <16 x i8> %5 +} + +define <8 x i8> @trunc_ssat_v8i32_v8i8(<8 x i32> %a0) { +; SSE-LABEL: trunc_ssat_v8i32_v8i8: +; SSE: # %bb.0: +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: packsswb %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_ssat_v8i32_v8i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: 
retq +; +; AVX2-LABEL: trunc_ssat_v8i32_v8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_ssat_v8i32_v8i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168] +; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_ssat_v8i32_v8i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0 +; AVX512VL-NEXT: vpmaxsd {{.*}}(%rip){1to8}, %ymm0, %ymm0 +; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_ssat_v8i32_v8i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [127,127,127,127,127,127,127,127] +; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168] +; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_ssat_v8i32_v8i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmaxsd {{.*}}(%rip){1to8}, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq + %1 = icmp slt <8 x i32> %a0, + %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> + %3 = icmp sgt <8 x i32> %2, + %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> + %5 = trunc <8 x i32> %4 to <8 x i8> + ret <8 x i8> %5 +} + +define void @trunc_ssat_v8i32_v8i8_store(<8 x i32> %a0, <8 x i8> *%p1) { +; SSE-LABEL: trunc_ssat_v8i32_v8i8_store: +; SSE: # %bb.0: +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: packsswb %xmm0, %xmm0 +; SSE-NEXT: movq %xmm0, (%rdi) +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_ssat_v8i32_v8i8_store: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, (%rdi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_ssat_v8i32_v8i8_store: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_ssat_v8i32_v8i8_store: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168] +; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vmovq %xmm0, (%rdi) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_ssat_v8i32_v8i8_store: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovsdb %ymm0, (%rdi) +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_ssat_v8i32_v8i8_store: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = 
[127,127,127,127,127,127,127,127] +; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168] +; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rdi) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_ssat_v8i32_v8i8_store: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovsdb %ymm0, (%rdi) +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq + %1 = icmp slt <8 x i32> %a0, + %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> + %3 = icmp sgt <8 x i32> %2, + %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> + %5 = trunc <8 x i32> %4 to <8 x i8> + store <8 x i8> %5, <8 x i8> *%p1 + ret void +} + +define <16 x i8> @trunc_ssat_v16i32_v16i8(<16 x i32> %a0) { +; SSE-LABEL: trunc_ssat_v16i32_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: packssdw %xmm3, %xmm2 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: packsswb %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_ssat_v16i32_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_ssat_v16i32_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_ssat_v16i32_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = icmp slt <16 x i32> %a0, + %2 = select <16 x i1> %1, <16 x i32> %a0, <16 x i32> + %3 = icmp sgt <16 x i32> %2, + %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> + %5 = trunc <16 x i32> %4 to <16 x i8> + ret <16 x i8> %5 +} + +define <16 x i8> @trunc_ssat_v16i16_v16i8(<16 x i16> %a0) { +; SSE-LABEL: trunc_ssat_v16i16_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: packsswb %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_ssat_v16i16_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_ssat_v16i16_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_ssat_v16i16_v16i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512F-NEXT: vpmaxsw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_ssat_v16i16_v16i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VL-NEXT: vpmaxsw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_ssat_v16i16_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vpmaxsw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_ssat_v16i16_v16i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovswb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq + %1 = icmp slt <16 x i16> %a0, + %2 = select <16 x i1> %1, <16 x i16> %a0, <16 x i16> + %3 = icmp sgt <16 x i16> %2, + %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> + %5 = trunc <16 x i16> %4 to <16 x i8> + ret <16 x i8> %5 +} + +define <32 x i8> @trunc_ssat_v32i16_v32i8(<32 x i16> %a0) { +; SSE-LABEL: trunc_ssat_v32i16_v32i8: +; SSE: # %bb.0: +; SSE-NEXT: packsswb %xmm1, %xmm0 +; SSE-NEXT: packsswb %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc_ssat_v32i16_v32i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_ssat_v32i16_v32i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_ssat_v32i16_v32i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpminsw %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpminsw %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408] +; AVX512F-NEXT: vpmaxsw %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpmaxsw %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_ssat_v32i16_v32i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VL-NEXT: vpminsw %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vpminsw %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408] +; AVX512VL-NEXT: vpmaxsw %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vpmaxsw %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_ssat_v32i16_v32i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmovswb %zmm0, %ymm0 +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_ssat_v32i16_v32i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovswb %zmm0, %ymm0 +; AVX512BWVL-NEXT: retq + %1 = icmp slt <32 x i16> %a0, + %2 = select <32 x i1> %1, <32 x i16> %a0, <32 x i16> + %3 = icmp sgt <32 x i16> %2, + %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> + %5 = trunc <32 x i16> %4 to <32 x i8> + ret <32 x i8> %5 +} diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat-widen.ll b/llvm/test/CodeGen/X86/vector-trunc-usat-widen.ll new file mode 100644 index 000000000000..3dfa844ff829 --- /dev/null +++ b/llvm/test/CodeGen/X86/vector-trunc-usat-widen.ll @@ -0,0 +1,2587 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL + +; +; Unsigned saturation truncation to vXi32 +; + +define <4 x i32> @trunc_usat_v4i64_v4i32(<4 x i64> %a0) { +; SSE2-LABEL: trunc_usat_v4i64_v4i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = 
[9223372039002259455,9223372039002259455] +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm3, %xmm5 +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm4, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pandn {{.*}}(%rip), %xmm3 +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pandn {{.*}}(%rip), %xmm5 +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc_usat_v4i64_v4i32: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: pxor %xmm2, %xmm3 +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259455,9223372039002259455] +; SSSE3-NEXT: movdqa %xmm4, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSSE3-NEXT: por %xmm3, %xmm5 +; SSSE3-NEXT: pxor %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm4, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm2, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm1 +; SSSE3-NEXT: pandn {{.*}}(%rip), %xmm3 +; SSSE3-NEXT: por %xmm1, %xmm3 +; SSSE3-NEXT: pand %xmm5, %xmm0 +; SSSE3-NEXT: pandn {{.*}}(%rip), %xmm5 +; SSSE3-NEXT: por %xmm5, %xmm0 +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_usat_v4i64_v4i32: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: pxor %xmm0, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259455,9223372039002259455] +; SSE41-NEXT: movdqa %xmm4, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] +; SSE41-NEXT: pand %xmm6, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] +; SSE41-NEXT: por %xmm7, %xmm3 +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm6, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: movapd {{.*#+}} xmm4 = [4294967295,4294967295] +; SSE41-NEXT: movapd {{.*#+}} xmm5 = [4294967295,429496729] +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 +; SSE41-NEXT: shufps 
{{.*#+}} xmm4 = xmm4[0,2],xmm5[0,2] +; SSE41-NEXT: movaps %xmm4, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc_usat_v4i64_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103] +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vmovapd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,429496729] +; AVX1-NEXT: vblendvpd %ymm1, %ymm0, %ymm2, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_usat_v4i64_v4i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-SLOW-NEXT: vpxor %ymm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] +; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovapd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,429496729] +; AVX2-SLOW-NEXT: vblendvpd %ymm1, %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_usat_v4i64_v4i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-FAST-NEXT: vpxor %ymm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] +; AVX2-FAST-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,429496729] +; AVX2-FAST-NEXT: vblendvpd %ymm1, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovapd {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512F-LABEL: trunc_usat_v4i64_v4i32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295] +; AVX512F-NEXT: vpcmpltuq %zmm1, %zmm0, %k1 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,429496729] +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-NEXT: vpmovqd %zmm1, %ymm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_usat_v4i64_v4i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpcmpltuq {{.*}}(%rip){1to4}, %ymm0, %k1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,429496729] +; AVX512VL-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1} +; AVX512VL-NEXT: vpmovqd %ymm1, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_usat_v4i64_v4i32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = 
[4294967295,4294967295,4294967295,4294967295] +; AVX512BW-NEXT: vpcmpltuq %zmm1, %zmm0, %k1 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,429496729] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vpmovqd %zmm1, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_usat_v4i64_v4i32: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpcmpltuq {{.*}}(%rip){1to4}, %ymm0, %k1 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,429496729] +; AVX512BWVL-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1} +; AVX512BWVL-NEXT: vpmovqd %ymm1, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq + %1 = icmp ult <4 x i64> %a0, + %2 = select <4 x i1> %1, <4 x i64> %a0, <4 x i64> + %3 = trunc <4 x i64> %2 to <4 x i32> + ret <4 x i32> %3 +} + +define <8 x i32> @trunc_usat_v8i64_v8i32(<8 x i64> %a0) { +; SSE2-LABEL: trunc_usat_v8i64_v8i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295] +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: pxor %xmm5, %xmm7 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259455,9223372039002259455] +; SSE2-NEXT: movdqa %xmm9, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: pandn %xmm8, %xmm4 +; SSE2-NEXT: por %xmm3, %xmm4 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pxor %xmm5, %xmm3 +; SSE2-NEXT: movdqa %xmm9, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pandn %xmm8, %xmm3 +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm5, %xmm2 +; SSE2-NEXT: movdqa %xmm9, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm8, %xmm4 +; SSE2-NEXT: por %xmm1, %xmm4 +; SSE2-NEXT: pxor %xmm0, %xmm5 +; SSE2-NEXT: movdqa %xmm9, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm2, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm8, %xmm1 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2] +; SSE2-NEXT: movaps %xmm3, %xmm1 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc_usat_v8i64_v8i32: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295] +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] +; SSSE3-NEXT: movdqa %xmm3, %xmm7 +; SSSE3-NEXT: pxor 
%xmm5, %xmm7 +; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259455,9223372039002259455] +; SSSE3-NEXT: movdqa %xmm9, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm4 +; SSSE3-NEXT: pand %xmm4, %xmm3 +; SSSE3-NEXT: pandn %xmm8, %xmm4 +; SSSE3-NEXT: por %xmm3, %xmm4 +; SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSSE3-NEXT: pxor %xmm5, %xmm3 +; SSSE3-NEXT: movdqa %xmm9, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm2 +; SSSE3-NEXT: pandn %xmm8, %xmm3 +; SSSE3-NEXT: por %xmm2, %xmm3 +; SSSE3-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pxor %xmm5, %xmm2 +; SSSE3-NEXT: movdqa %xmm9, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm2, %xmm4 +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pandn %xmm8, %xmm4 +; SSSE3-NEXT: por %xmm1, %xmm4 +; SSSE3-NEXT: pxor %xmm0, %xmm5 +; SSSE3-NEXT: movdqa %xmm9, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSSE3-NEXT: pand %xmm2, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: por %xmm5, %xmm1 +; SSSE3-NEXT: pand %xmm1, %xmm0 +; SSSE3-NEXT: pandn %xmm8, %xmm1 +; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2] +; SSSE3-NEXT: movaps %xmm3, %xmm1 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_usat_v8i64_v8i32: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm8 +; SSE41-NEXT: movapd {{.*#+}} xmm6 = [4294967295,4294967295] +; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm7, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259455,9223372039002259455] +; SSE41-NEXT: movdqa %xmm9, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm10, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: movapd %xmm6, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm7, %xmm0 +; SSE41-NEXT: movdqa %xmm9, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm10, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: movapd %xmm6, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 +; SSE41-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm5[0,2] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm7, %xmm0 +; 
SSE41-NEXT: movdqa %xmm9, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm3, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: movapd %xmm6, %xmm2 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; SSE41-NEXT: pxor %xmm8, %xmm7 +; SSE41-NEXT: movdqa %xmm9, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm7, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3] +; SSE41-NEXT: pand %xmm1, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm6 +; SSE41-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm2[0,2] +; SSE41-NEXT: movaps %xmm6, %xmm0 +; SSE41-NEXT: movaps %xmm4, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc_usat_v8i64_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovapd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372041149743103,9223372041149743103] +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm6 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm5, %xmm6 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 +; AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_usat_v8i64_v8i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295] +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm1, %ymm4 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] +; AVX2-SLOW-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 +; AVX2-SLOW-NEXT: vblendvpd %ymm4, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 +; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_usat_v8i64_v8i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-FAST-NEXT: vpxor %ymm3, %ymm1, %ymm4 +; 
AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] +; AVX2-FAST-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vblendvpd %ymm4, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpxor %ymm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc_usat_v8i64_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovusqd %zmm0, %ymm0 +; AVX512-NEXT: retq + %1 = icmp ult <8 x i64> %a0, + %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> + %3 = trunc <8 x i64> %2 to <8 x i32> + ret <8 x i32> %3 +} + +; +; Unsigned saturation truncation to vXi16 +; + +define <8 x i16> @trunc_usat_v8i64_v8i16(<8 x i64> %a0) { +; SSE2-LABEL: trunc_usat_v8i64_v8i16: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535] +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm6, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002324991,9223372039002324991] +; SSE2-NEXT: movdqa %xmm9, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pandn %xmm8, %xmm5 +; SSE2-NEXT: por %xmm2, %xmm5 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pxor %xmm6, %xmm2 +; SSE2-NEXT: movdqa %xmm9, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm8, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm6, %xmm3 +; SSE2-NEXT: movdqa %xmm9, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm3, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: pandn %xmm8, %xmm4 +; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: pxor %xmm1, %xmm6 +; SSE2-NEXT: movdqa %xmm9, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: pand %xmm3, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pandn %xmm8, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = 
xmm0[0,1,0,2,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc_usat_v8i64_v8i16: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535] +; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456] +; SSSE3-NEXT: movdqa %xmm2, %xmm5 +; SSSE3-NEXT: pxor %xmm6, %xmm5 +; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002324991,9223372039002324991] +; SSSE3-NEXT: movdqa %xmm9, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm2 +; SSSE3-NEXT: pandn %xmm8, %xmm5 +; SSSE3-NEXT: por %xmm2, %xmm5 +; SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSSE3-NEXT: pxor %xmm6, %xmm2 +; SSSE3-NEXT: movdqa %xmm9, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: pandn %xmm8, %xmm2 +; SSSE3-NEXT: por %xmm3, %xmm2 +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: pxor %xmm6, %xmm3 +; SSSE3-NEXT: movdqa %xmm9, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm3, %xmm4 +; SSSE3-NEXT: pand %xmm4, %xmm0 +; SSSE3-NEXT: pandn %xmm8, %xmm4 +; SSSE3-NEXT: por %xmm0, %xmm4 +; SSSE3-NEXT: pxor %xmm1, %xmm6 +; SSSE3-NEXT: movdqa %xmm9, %xmm0 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSSE3-NEXT: pand %xmm3, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: por %xmm6, %xmm0 +; SSSE3-NEXT: pand %xmm0, %xmm1 +; SSSE3-NEXT: pandn %xmm8, %xmm0 +; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] +; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,1,0,2,4,5,6,7] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3] +; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_usat_v8i64_v8i16: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm8 +; SSE41-NEXT: movapd {{.*#+}} xmm5 = [65535,65535] +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm6, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002324991,9223372039002324991] +; 
SSE41-NEXT: movdqa %xmm9, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm10, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: movapd %xmm5, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 +; SSE41-NEXT: movdqa %xmm8, %xmm0 +; SSE41-NEXT: pxor %xmm6, %xmm0 +; SSE41-NEXT: movdqa %xmm9, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm10, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: movapd %xmm5, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1 +; SSE41-NEXT: packusdw %xmm4, %xmm1 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm6, %xmm0 +; SSE41-NEXT: movdqa %xmm9, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm8, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: movapd %xmm5, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm4 +; SSE41-NEXT: pxor %xmm2, %xmm6 +; SSE41-NEXT: movdqa %xmm9, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm6, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE41-NEXT: pand %xmm3, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 +; SSE41-NEXT: packusdw %xmm4, %xmm5 +; SSE41-NEXT: packusdw %xmm5, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc_usat_v8i64_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovapd {{.*#+}} ymm2 = [65535,65535,65535,65535] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854841343,9223372036854841343] +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm6 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm5, %xmm6 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 +; AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_usat_v8i64_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm4 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372036854841343,9223372036854841343,9223372036854841343,9223372036854841343] +; AVX2-NEXT: vpcmpgtq %ymm4, %ymm5, 
%ymm4 +; AVX2-NEXT: vblendvpd %ymm4, %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm3 +; AVX2-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_usat_v8i64_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovusqw %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = icmp ult <8 x i64> %a0, + %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> + %3 = trunc <8 x i64> %2 to <8 x i16> + ret <8 x i16> %3 +} + +define <8 x i16> @trunc_usat_v8i32_v8i16(<8 x i32> %a0) { +; SSE2-LABEL: trunc_usat_v8i32_v8i16: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183,2147549183,2147549183] +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pxor %xmm3, %xmm5 +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pxor %xmm4, %xmm3 +; SSE2-NEXT: pand %xmm1, %xmm4 +; SSE2-NEXT: por %xmm3, %xmm4 +; SSE2-NEXT: pslld $16, %xmm4 +; SSE2-NEXT: psrad $16, %xmm4 +; SSE2-NEXT: pslld $16, %xmm0 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: packssdw %xmm4, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc_usat_v8i32_v8i16: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535] +; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: pxor %xmm3, %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183] +; SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSSE3-NEXT: pand %xmm6, %xmm0 +; SSSE3-NEXT: pandn %xmm2, %xmm6 +; SSSE3-NEXT: por %xmm6, %xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm1 +; SSSE3-NEXT: pandn %xmm2, %xmm5 +; SSSE3-NEXT: por %xmm1, %xmm5 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSSE3-NEXT: pshufb %xmm1, %xmm5 +; SSSE3-NEXT: pshufb %xmm1, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_usat_v8i32_v8i16: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535] +; SSE41-NEXT: pminud %xmm2, %xmm1 +; SSE41-NEXT: pminud %xmm2, %xmm0 +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc_usat_v8i32_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [65535,65535,65535,65535] +; AVX1-NEXT: vpminud %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpminud %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_usat_v8i32_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535] +; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_usat_v8i32_v8i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastd 
{{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_usat_v8i32_v8i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovusdw %ymm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_usat_v8i32_v8i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512BW-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_usat_v8i32_v8i16: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovusdw %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq + %1 = icmp ult <8 x i32> %a0, + %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> + %3 = trunc <8 x i32> %2 to <8 x i16> + ret <8 x i16> %3 +} + +define <16 x i16> @trunc_usat_v16i32_v16i16(<16 x i32> %a0) { +; SSE2-LABEL: trunc_usat_v16i32_v16i16: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm1, %xmm8 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm2, %xmm7 +; SSE2-NEXT: pxor %xmm6, %xmm7 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183] +; SSE2-NEXT: movdqa %xmm5, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm7, %xmm7 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pxor %xmm7, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pxor %xmm6, %xmm4 +; SSE2-NEXT: movdqa %xmm5, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: pxor %xmm7, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm6, %xmm3 +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm7, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm5 +; SSE2-NEXT: pxor %xmm5, %xmm7 +; SSE2-NEXT: pand %xmm8, %xmm5 +; SSE2-NEXT: por %xmm7, %xmm5 +; SSE2-NEXT: pslld $16, %xmm5 +; SSE2-NEXT: psrad $16, %xmm5 +; SSE2-NEXT: pslld $16, %xmm0 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: packssdw %xmm5, %xmm0 +; SSE2-NEXT: pslld $16, %xmm2 +; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: pslld $16, %xmm1 +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: packssdw %xmm2, %xmm1 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc_usat_v16i32_v16i16: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa %xmm1, %xmm8 +; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm2, %xmm7 +; SSSE3-NEXT: pxor %xmm6, %xmm7 +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183] +; SSSE3-NEXT: movdqa %xmm5, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm1 +; SSSE3-NEXT: pcmpeqd %xmm7, %xmm7 +; SSSE3-NEXT: pand %xmm1, %xmm2 +; SSSE3-NEXT: pxor %xmm7, %xmm1 +; SSSE3-NEXT: por %xmm2, %xmm1 +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pxor %xmm6, %xmm4 +; SSSE3-NEXT: movdqa %xmm5, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: pxor %xmm7, %xmm2 +; SSSE3-NEXT: por %xmm3, %xmm2 +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: pxor %xmm6, %xmm3 +; SSSE3-NEXT: movdqa %xmm5, %xmm4 +; SSSE3-NEXT: pcmpgtd 
%xmm3, %xmm4 +; SSSE3-NEXT: pand %xmm4, %xmm0 +; SSSE3-NEXT: pxor %xmm7, %xmm4 +; SSSE3-NEXT: por %xmm4, %xmm0 +; SSSE3-NEXT: pxor %xmm8, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm5 +; SSSE3-NEXT: pxor %xmm5, %xmm7 +; SSSE3-NEXT: pand %xmm8, %xmm5 +; SSSE3-NEXT: por %xmm7, %xmm5 +; SSSE3-NEXT: pslld $16, %xmm5 +; SSSE3-NEXT: psrad $16, %xmm5 +; SSSE3-NEXT: pslld $16, %xmm0 +; SSSE3-NEXT: psrad $16, %xmm0 +; SSSE3-NEXT: packssdw %xmm5, %xmm0 +; SSSE3-NEXT: pslld $16, %xmm2 +; SSSE3-NEXT: psrad $16, %xmm2 +; SSSE3-NEXT: pslld $16, %xmm1 +; SSSE3-NEXT: psrad $16, %xmm1 +; SSSE3-NEXT: packssdw %xmm2, %xmm1 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_usat_v16i32_v16i16: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535] +; SSE41-NEXT: pminud %xmm4, %xmm3 +; SSE41-NEXT: pminud %xmm4, %xmm2 +; SSE41-NEXT: packusdw %xmm3, %xmm2 +; SSE41-NEXT: pminud %xmm4, %xmm1 +; SSE41-NEXT: pminud %xmm4, %xmm0 +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc_usat_v16i32_v16i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65535,65535,65535,65535] +; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpminud %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_usat_v16i32_v16i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535] +; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpminud %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_usat_v16i32_v16i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovusdw %zmm0, %ymm0 +; AVX512-NEXT: retq + %1 = icmp ult <16 x i32> %a0, + %2 = select <16 x i1> %1, <16 x i32> %a0, <16 x i32> + %3 = trunc <16 x i32> %2 to <16 x i16> + ret <16 x i16> %3 +} + +; +; Unsigned saturation truncation to v16i8 +; + +define <8 x i8> @trunc_usat_v8i64_v8i8(<8 x i64> %a0) { +; SSE2-LABEL: trunc_usat_v8i64_v8i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255] +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: pxor %xmm6, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259711,9223372039002259711] +; SSE2-NEXT: movdqa %xmm9, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm4 +; SSE2-NEXT: pandn %xmm8, %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: pxor %xmm6, %xmm4 +; SSE2-NEXT: movdqa %xmm9, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm8, %xmm4 +; SSE2-NEXT: por %xmm1, %xmm4 +; SSE2-NEXT: movdqa 
%xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm6, %xmm1 +; SSE2-NEXT: movdqa %xmm9, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm8, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm6 +; SSE2-NEXT: movdqa %xmm9, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm8, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm8, %xmm2 +; SSE2-NEXT: pand %xmm8, %xmm1 +; SSE2-NEXT: packuswb %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm8, %xmm4 +; SSE2-NEXT: pand %xmm8, %xmm0 +; SSE2-NEXT: packuswb %xmm4, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc_usat_v8i64_v8i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255] +; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456] +; SSSE3-NEXT: pxor %xmm6, %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259711,9223372039002259711] +; SSSE3-NEXT: movdqa %xmm9, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm5, %xmm0 +; SSSE3-NEXT: pand %xmm0, %xmm4 +; SSSE3-NEXT: pandn %xmm8, %xmm0 +; SSSE3-NEXT: por %xmm4, %xmm0 +; SSSE3-NEXT: movdqa %xmm1, %xmm4 +; SSSE3-NEXT: pxor %xmm6, %xmm4 +; SSSE3-NEXT: movdqa %xmm9, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm5[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm4 +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pandn %xmm8, %xmm4 +; SSSE3-NEXT: por %xmm1, %xmm4 +; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: pxor %xmm6, %xmm1 +; SSSE3-NEXT: movdqa %xmm9, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm5[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm1 +; SSSE3-NEXT: pand %xmm1, %xmm2 +; SSSE3-NEXT: pandn %xmm8, %xmm1 +; SSSE3-NEXT: por %xmm2, %xmm1 +; SSSE3-NEXT: pxor %xmm3, %xmm6 +; SSSE3-NEXT: movdqa %xmm9, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSSE3-NEXT: pand %xmm5, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: por %xmm6, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: pandn %xmm8, %xmm2 +; SSSE3-NEXT: por %xmm3, %xmm2 +; SSSE3-NEXT: pand %xmm8, %xmm2 +; SSSE3-NEXT: pand %xmm8, %xmm1 +; SSSE3-NEXT: packuswb %xmm2, %xmm1 +; SSSE3-NEXT: pand %xmm8, 
%xmm4 +; SSSE3-NEXT: pand %xmm8, %xmm0 +; SSSE3-NEXT: packuswb %xmm4, %xmm0 +; SSSE3-NEXT: packuswb %xmm1, %xmm0 +; SSSE3-NEXT: packuswb %xmm0, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_usat_v8i64_v8i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: movapd {{.*#+}} xmm8 = [255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: pxor %xmm7, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259711,9223372039002259711] +; SSE41-NEXT: movdqa %xmm9, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm10, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm8, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm7, %xmm0 +; SSE41-NEXT: movdqa %xmm9, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm5[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm10, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm8, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm7, %xmm0 +; SSE41-NEXT: movdqa %xmm9, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm10, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm8, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: pxor %xmm3, %xmm7 +; SSE41-NEXT: movdqa %xmm9, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm7, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] +; SSE41-NEXT: pand %xmm2, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm8, %xmm2 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: andpd %xmm8, %xmm2 +; SSE41-NEXT: andpd %xmm8, %xmm1 +; SSE41-NEXT: packusdw %xmm2, %xmm1 +; SSE41-NEXT: andpd %xmm8, %xmm5 +; SSE41-NEXT: andpd %xmm8, %xmm4 +; SSE41-NEXT: packusdw %xmm5, %xmm4 +; SSE41-NEXT: packusdw %xmm1, %xmm4 +; SSE41-NEXT: packuswb %xmm4, %xmm4 +; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc_usat_v8i64_v8i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovapd {{.*#+}} ymm2 = [255,255,255,255] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854776063,9223372036854776063] +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm6 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm5, %xmm6 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 +; AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; 
AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [1.2598673968951787E-321,1.2598673968951787E-321] +; AVX1-NEXT: # xmm3 = mem[0,0] +; AVX1-NEXT: vandpd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandpd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vandpd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandpd %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_usat_v8i64_v8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [255,255,255,255] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm4 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372036854776063,9223372036854776063,9223372036854776063,9223372036854776063] +; AVX2-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 +; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm3 +; AVX2-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_usat_v8i64_v8i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpminuq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-NEXT: vmovq %xmm0, %rcx +; AVX512F-NEXT: vmovd %ecx, %xmm1 +; AVX512F-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-NEXT: vmovq %xmm2, %rax +; AVX512F-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: vpextrq $1, %xmm2, %rax +; AVX512F-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm2 +; AVX512F-NEXT: vmovq %xmm2, %rax +; AVX512F-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: vpextrq $1, %xmm2, %rax +; AVX512F-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-NEXT: vpinsrb $7, %eax, %xmm1, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_usat_v8i64_v8i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpminuq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax +; AVX512VL-NEXT: vmovq %xmm0, %rcx +; AVX512VL-NEXT: vmovd %ecx, %xmm1 +; AVX512VL-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512VL-NEXT: vmovq %xmm2, %rax +; AVX512VL-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX512VL-NEXT: vpextrq $1, %xmm2, %rax +; AVX512VL-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; AVX512VL-NEXT: vextracti32x4 $2, %zmm0, %xmm2 +; AVX512VL-NEXT: vmovq %xmm2, %rax +; AVX512VL-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; 
AVX512VL-NEXT: vpextrq $1, %xmm2, %rax +; AVX512VL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX512VL-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax +; AVX512VL-NEXT: vpinsrb $7, %eax, %xmm1, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_usat_v8i64_v8i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpminuq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_usat_v8i64_v8i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpminuq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq + %1 = icmp ult <8 x i64> %a0, + %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> + %3 = trunc <8 x i64> %2 to <8 x i8> + ret <8 x i8> %3 +} + +define void @trunc_usat_v8i64_v8i8_store(<8 x i64> %a0, <8 x i8> *%p1) { +; SSE2-LABEL: trunc_usat_v8i64_v8i8_store: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255] +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: pxor %xmm6, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259711,9223372039002259711] +; SSE2-NEXT: movdqa %xmm9, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pandn %xmm8, %xmm5 +; SSE2-NEXT: por %xmm0, %xmm5 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm6, %xmm0 +; SSE2-NEXT: movdqa %xmm9, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; 
SSE2-NEXT: por %xmm7, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pandn %xmm8, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm6, %xmm1 +; SSE2-NEXT: movdqa %xmm9, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm8, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm6 +; SSE2-NEXT: movdqa %xmm9, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm8, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm8, %xmm2 +; SSE2-NEXT: pand %xmm8, %xmm1 +; SSE2-NEXT: packuswb %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm8, %xmm0 +; SSE2-NEXT: pand %xmm8, %xmm5 +; SSE2-NEXT: packuswb %xmm0, %xmm5 +; SSE2-NEXT: packuswb %xmm1, %xmm5 +; SSE2-NEXT: packuswb %xmm5, %xmm5 +; SSE2-NEXT: movq %xmm5, (%rdi) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc_usat_v8i64_v8i8_store: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255] +; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456] +; SSSE3-NEXT: movdqa %xmm0, %xmm5 +; SSSE3-NEXT: pxor %xmm6, %xmm5 +; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259711,9223372039002259711] +; SSSE3-NEXT: movdqa %xmm9, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm0 +; SSSE3-NEXT: pandn %xmm8, %xmm5 +; SSSE3-NEXT: por %xmm0, %xmm5 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: pxor %xmm6, %xmm0 +; SSSE3-NEXT: movdqa %xmm9, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm0 +; SSSE3-NEXT: pand %xmm0, %xmm1 +; SSSE3-NEXT: pandn %xmm8, %xmm0 +; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: pxor %xmm6, %xmm1 +; SSSE3-NEXT: movdqa %xmm9, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm1 +; SSSE3-NEXT: pand %xmm1, %xmm2 +; SSSE3-NEXT: pandn %xmm8, %xmm1 +; SSSE3-NEXT: por %xmm2, %xmm1 +; SSSE3-NEXT: pxor %xmm3, %xmm6 +; SSSE3-NEXT: movdqa %xmm9, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: por %xmm6, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: 
pandn %xmm8, %xmm2 +; SSSE3-NEXT: por %xmm3, %xmm2 +; SSSE3-NEXT: pand %xmm8, %xmm2 +; SSSE3-NEXT: pand %xmm8, %xmm1 +; SSSE3-NEXT: packuswb %xmm2, %xmm1 +; SSSE3-NEXT: pand %xmm8, %xmm0 +; SSSE3-NEXT: pand %xmm8, %xmm5 +; SSSE3-NEXT: packuswb %xmm0, %xmm5 +; SSSE3-NEXT: packuswb %xmm1, %xmm5 +; SSSE3-NEXT: packuswb %xmm5, %xmm5 +; SSSE3-NEXT: movq %xmm5, (%rdi) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_usat_v8i64_v8i8_store: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: movapd {{.*#+}} xmm8 = [255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: pxor %xmm7, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259711,9223372039002259711] +; SSE41-NEXT: movdqa %xmm9, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm10, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: movapd %xmm8, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm6 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm7, %xmm0 +; SSE41-NEXT: movdqa %xmm9, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm10, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: movapd %xmm8, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm7, %xmm0 +; SSE41-NEXT: movdqa %xmm9, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm10, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: movapd %xmm8, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: pxor %xmm3, %xmm7 +; SSE41-NEXT: movdqa %xmm9, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm7, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE41-NEXT: pand %xmm2, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: movapd %xmm8, %xmm2 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: andpd %xmm8, %xmm2 +; SSE41-NEXT: andpd %xmm8, %xmm1 +; SSE41-NEXT: packusdw %xmm2, %xmm1 +; SSE41-NEXT: andpd %xmm8, %xmm4 +; SSE41-NEXT: andpd %xmm8, %xmm6 +; SSE41-NEXT: packusdw %xmm4, %xmm6 +; SSE41-NEXT: packusdw %xmm1, %xmm6 +; SSE41-NEXT: packuswb %xmm6, %xmm6 +; SSE41-NEXT: movq %xmm6, (%rdi) +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc_usat_v8i64_v8i8_store: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovapd {{.*#+}} ymm2 = [255,255,255,255] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854776063,9223372036854776063] +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm6 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm5, %xmm6 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 +; AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; 
AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [1.2598673968951787E-321,1.2598673968951787E-321] +; AVX1-NEXT: # xmm3 = mem[0,0] +; AVX1-NEXT: vandpd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandpd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vandpd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandpd %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, (%rdi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_usat_v8i64_v8i8_store: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [255,255,255,255] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm4 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372036854776063,9223372036854776063,9223372036854776063,9223372036854776063] +; AVX2-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 +; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm3 +; AVX2-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2-NEXT: vmovq %xmm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_usat_v8i64_v8i8_store: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovusqb %zmm0, (%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = icmp ult <8 x i64> %a0, + %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> + %3 = trunc <8 x i64> %2 to <8 x i8> + store <8 x i8> %3, <8 x i8> *%p1 + ret void +} + +define <16 x i8> @trunc_usat_v16i64_v16i8(<16 x i64> %a0) { +; SSE2-LABEL: trunc_usat_v16i64_v16i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255] +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: movdqa %xmm1, %xmm11 +; SSE2-NEXT: pxor %xmm9, %xmm11 +; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259711,9223372039002259711] +; SSE2-NEXT: movdqa %xmm10, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm11, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3] +; SSE2-NEXT: pand %xmm13, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm12[1,1,3,3] +; SSE2-NEXT: por %xmm11, %xmm12 +; SSE2-NEXT: pand %xmm12, %xmm1 +; SSE2-NEXT: pandn %xmm8, %xmm12 +; SSE2-NEXT: por %xmm1, %xmm12 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm9, %xmm1 +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = 
xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm13, %xmm14 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,3,3] +; SSE2-NEXT: por %xmm14, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm8, %xmm1 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm12, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pxor %xmm9, %xmm1 +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,3,3] +; SSE2-NEXT: por %xmm13, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: pandn %xmm8, %xmm1 +; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pxor %xmm9, %xmm3 +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm11[1,1,3,3] +; SSE2-NEXT: por %xmm13, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pandn %xmm8, %xmm3 +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: packuswb %xmm1, %xmm3 +; SSE2-NEXT: packuswb %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm5, %xmm1 +; SSE2-NEXT: pxor %xmm9, %xmm1 +; SSE2-NEXT: movdqa %xmm10, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm5 +; SSE2-NEXT: pandn %xmm8, %xmm2 +; SSE2-NEXT: por %xmm5, %xmm2 +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: pxor %xmm9, %xmm1 +; SSE2-NEXT: movdqa %xmm10, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm11, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm4 +; SSE2-NEXT: pandn %xmm8, %xmm1 +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: packuswb %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm7, %xmm2 +; SSE2-NEXT: pxor %xmm9, %xmm2 +; SSE2-NEXT: movdqa %xmm10, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm7 +; SSE2-NEXT: pandn %xmm8, %xmm3 +; SSE2-NEXT: por %xmm7, %xmm3 +; SSE2-NEXT: pxor %xmm6, %xmm9 +; SSE2-NEXT: movdqa %xmm10, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm6 +; SSE2-NEXT: pandn %xmm8, %xmm2 +; SSE2-NEXT: por %xmm6, %xmm2 +; SSE2-NEXT: packuswb %xmm3, %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm1 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc_usat_v16i64_v16i8: +; SSSE3: # %bb.0: +; 
SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255] +; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456] +; SSSE3-NEXT: movdqa %xmm1, %xmm11 +; SSSE3-NEXT: pxor %xmm9, %xmm11 +; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259711,9223372039002259711] +; SSSE3-NEXT: movdqa %xmm10, %xmm12 +; SSSE3-NEXT: pcmpgtd %xmm11, %xmm12 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3] +; SSSE3-NEXT: pand %xmm13, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm12[1,1,3,3] +; SSSE3-NEXT: por %xmm11, %xmm12 +; SSSE3-NEXT: pand %xmm12, %xmm1 +; SSSE3-NEXT: pandn %xmm8, %xmm12 +; SSSE3-NEXT: por %xmm1, %xmm12 +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm9, %xmm1 +; SSSE3-NEXT: movdqa %xmm10, %xmm11 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm11[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm13, %xmm14 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,3,3] +; SSSE3-NEXT: por %xmm14, %xmm1 +; SSSE3-NEXT: pand %xmm1, %xmm0 +; SSSE3-NEXT: pandn %xmm8, %xmm1 +; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: packuswb %xmm12, %xmm0 +; SSSE3-NEXT: movdqa %xmm3, %xmm1 +; SSSE3-NEXT: pxor %xmm9, %xmm1 +; SSSE3-NEXT: movdqa %xmm10, %xmm11 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm12, %xmm13 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,3,3] +; SSSE3-NEXT: por %xmm13, %xmm1 +; SSSE3-NEXT: pand %xmm1, %xmm3 +; SSSE3-NEXT: pandn %xmm8, %xmm1 +; SSSE3-NEXT: por %xmm3, %xmm1 +; SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSSE3-NEXT: pxor %xmm9, %xmm3 +; SSSE3-NEXT: movdqa %xmm10, %xmm11 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm12, %xmm13 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm11[1,1,3,3] +; SSSE3-NEXT: por %xmm13, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm2 +; SSSE3-NEXT: pandn %xmm8, %xmm3 +; SSSE3-NEXT: por %xmm2, %xmm3 +; SSSE3-NEXT: packuswb %xmm1, %xmm3 +; SSSE3-NEXT: packuswb %xmm3, %xmm0 +; SSSE3-NEXT: movdqa %xmm5, %xmm1 +; SSSE3-NEXT: pxor %xmm9, %xmm1 +; SSSE3-NEXT: movdqa %xmm10, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm3, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm5 +; SSSE3-NEXT: pandn %xmm8, %xmm2 +; SSSE3-NEXT: por %xmm5, %xmm2 +; SSSE3-NEXT: movdqa %xmm4, %xmm1 +; SSSE3-NEXT: pxor %xmm9, %xmm1 +; SSSE3-NEXT: movdqa %xmm10, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm11, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm5, %xmm1 +; SSSE3-NEXT: pand %xmm1, %xmm4 +; SSSE3-NEXT: pandn %xmm8, %xmm1 +; SSSE3-NEXT: por %xmm4, %xmm1 +; SSSE3-NEXT: packuswb %xmm2, %xmm1 +; SSSE3-NEXT: movdqa %xmm7, %xmm2 +; SSSE3-NEXT: pxor %xmm9, %xmm2 +; SSSE3-NEXT: movdqa %xmm10, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 +; SSSE3-NEXT: pshufd 
{{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm2, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm7 +; SSSE3-NEXT: pandn %xmm8, %xmm3 +; SSSE3-NEXT: por %xmm7, %xmm3 +; SSSE3-NEXT: pxor %xmm6, %xmm9 +; SSSE3-NEXT: movdqa %xmm10, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: por %xmm5, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm6 +; SSSE3-NEXT: pandn %xmm8, %xmm2 +; SSSE3-NEXT: por %xmm6, %xmm2 +; SSSE3-NEXT: packuswb %xmm3, %xmm2 +; SSSE3-NEXT: packuswb %xmm2, %xmm1 +; SSSE3-NEXT: packuswb %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_usat_v16i64_v16i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm8 +; SSE41-NEXT: movapd {{.*#+}} xmm9 = [255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm11 = [9223372039002259711,9223372039002259711] +; SSE41-NEXT: movdqa %xmm11, %xmm12 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm12 +; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm11, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm13, %xmm14 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,3,3] +; SSE41-NEXT: por %xmm14, %xmm0 +; SSE41-NEXT: movapd %xmm9, %xmm12 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm12 +; SSE41-NEXT: movdqa %xmm8, %xmm0 +; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: movdqa %xmm11, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm1[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm11, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm13, %xmm14 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: por %xmm14, %xmm0 +; SSE41-NEXT: movapd %xmm9, %xmm13 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm13 +; SSE41-NEXT: packusdw %xmm12, %xmm13 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: movdqa %xmm11, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm11, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm8, %xmm12 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: por %xmm12, %xmm0 +; SSE41-NEXT: movapd %xmm9, %xmm12 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm12 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: movdqa %xmm11, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm11, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm8, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: movapd %xmm9, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: packusdw %xmm12, %xmm1 +; SSE41-NEXT: packusdw %xmm1, %xmm13 +; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: movdqa %xmm11, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm11, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = 
xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm2, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: movapd %xmm9, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 +; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: movdqa %xmm11, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm11, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm3, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: movapd %xmm9, %xmm2 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 +; SSE41-NEXT: packusdw %xmm1, %xmm2 +; SSE41-NEXT: movdqa %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: movdqa %xmm11, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm11, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE41-NEXT: pand %xmm3, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: movapd %xmm9, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 +; SSE41-NEXT: pxor %xmm6, %xmm10 +; SSE41-NEXT: movdqa %xmm11, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm11, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,3,3] +; SSE41-NEXT: pand %xmm3, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm9 +; SSE41-NEXT: packusdw %xmm1, %xmm9 +; SSE41-NEXT: packusdw %xmm9, %xmm2 +; SSE41-NEXT: packuswb %xmm2, %xmm13 +; SSE41-NEXT: movdqa %xmm13, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc_usat_v16i64_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovapd {{.*#+}} ymm8 = [255,255,255,255] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vpxor %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [9223372036854776063,9223372036854776063] +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm7, %xmm5 +; AVX1-NEXT: vpxor %xmm6, %xmm0, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm7, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-NEXT: vblendvpd %ymm4, %ymm0, %ymm8, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpxor %xmm6, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm7, %xmm4 +; AVX1-NEXT: vpxor %xmm6, %xmm1, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm7, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 +; AVX1-NEXT: vblendvpd %ymm4, %ymm1, %ymm8, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vpxor %xmm6, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm7, %xmm4 +; AVX1-NEXT: vpxor %xmm6, %xmm2, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm7, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 +; AVX1-NEXT: vblendvpd %ymm4, %ymm2, %ymm8, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vpxor %xmm6, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm7, %xmm4 +; AVX1-NEXT: vpxor %xmm6, %xmm3, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm7, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 +; AVX1-NEXT: vblendvpd %ymm4, %ymm3, %ymm8, %ymm3 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, 
%ymm1, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_usat_v16i64_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %ymm5, %ymm1, %ymm6 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm7 = [9223372036854776063,9223372036854776063,9223372036854776063,9223372036854776063] +; AVX2-NEXT: vpcmpgtq %ymm6, %ymm7, %ymm6 +; AVX2-NEXT: vblendvpd %ymm6, %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vpxor %ymm5, %ymm0, %ymm6 +; AVX2-NEXT: vpcmpgtq %ymm6, %ymm7, %ymm6 +; AVX2-NEXT: vblendvpd %ymm6, %ymm0, %ymm4, %ymm0 +; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm5, %ymm3, %ymm1 +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm7, %ymm1 +; AVX2-NEXT: vblendvpd %ymm1, %ymm3, %ymm4, %ymm1 +; AVX2-NEXT: vpxor %ymm5, %ymm2, %ymm3 +; AVX2-NEXT: vpcmpgtq %ymm3, %ymm7, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm2, %ymm4, %ymm2 +; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_usat_v16i64_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255] +; AVX512-NEXT: vpminuq %zmm2, %zmm1, %zmm1 +; AVX512-NEXT: vpminuq %zmm2, %zmm0, %zmm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = icmp ult <16 x i64> %a0, + %2 = select <16 x i1> %1, <16 x i64> %a0, <16 x i64> + %3 = trunc <16 x i64> %2 to <16 x i8> + ret <16 x i8> %3 +} + +define <8 x i8> @trunc_usat_v8i32_v8i8(<8 x i32> %a0) { +; SSE2-LABEL: trunc_usat_v8i32_v8i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255] +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483903,2147483903,2147483903,2147483903] +; SSE2-NEXT: movdqa %xmm6, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm3, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm1 +; SSE2-NEXT: pandn %xmm3, %xmm6 +; SSE2-NEXT: por %xmm1, %xmm6 +; SSE2-NEXT: pand %xmm3, %xmm6 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: packuswb %xmm6, %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc_usat_v8i32_v8i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255] +; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: pxor %xmm3, %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483903,2147483903,2147483903,2147483903] +; SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSSE3-NEXT: 
pand %xmm6, %xmm0 +; SSSE3-NEXT: pandn %xmm2, %xmm6 +; SSSE3-NEXT: por %xmm6, %xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm1 +; SSSE3-NEXT: pandn %xmm2, %xmm5 +; SSSE3-NEXT: por %xmm1, %xmm5 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; SSSE3-NEXT: pshufb %xmm1, %xmm5 +; SSSE3-NEXT: pshufb %xmm1, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_usat_v8i32_v8i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255] +; SSE41-NEXT: pminud %xmm2, %xmm0 +; SSE41-NEXT: pminud %xmm2, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; SSE41-NEXT: pshufb %xmm2, %xmm1 +; SSE41-NEXT: pshufb %xmm2, %xmm0 +; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc_usat_v8i32_v8i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255] +; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_usat_v8i32_v8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_usat_v8i32_v8i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_usat_v8i32_v8i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpminud {{.*}}(%rip){1to8}, %ymm0, %ymm0 +; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_usat_v8i32_v8i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_usat_v8i32_v8i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpminud {{.*}}(%rip){1to8}, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq + %1 = icmp ult <8 x i32> %a0, + %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> + %3 = trunc <8 x i32> %2 to <8 x i8> + ret <8 x i8> %3 +} + +define void @trunc_usat_v8i32_v8i8_store(<8 x i32> %a0, <8 x i8> *%p1) { +; SSE2-LABEL: trunc_usat_v8i32_v8i8_store: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255] +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pxor %xmm3, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483903,2147483903,2147483903,2147483903] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: 
pcmpgtd %xmm4, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pandn %xmm2, %xmm6 +; SSE2-NEXT: por %xmm0, %xmm6 +; SSE2-NEXT: pxor %xmm1, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pandn %xmm2, %xmm5 +; SSE2-NEXT: por %xmm1, %xmm5 +; SSE2-NEXT: pand %xmm2, %xmm5 +; SSE2-NEXT: pand %xmm2, %xmm6 +; SSE2-NEXT: packuswb %xmm5, %xmm6 +; SSE2-NEXT: packuswb %xmm6, %xmm6 +; SSE2-NEXT: movq %xmm6, (%rdi) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc_usat_v8i32_v8i8_store: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255] +; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: pxor %xmm3, %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483903,2147483903,2147483903,2147483903] +; SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSSE3-NEXT: pand %xmm6, %xmm0 +; SSSE3-NEXT: pandn %xmm2, %xmm6 +; SSSE3-NEXT: por %xmm0, %xmm6 +; SSSE3-NEXT: pxor %xmm1, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm1 +; SSSE3-NEXT: pandn %xmm2, %xmm5 +; SSSE3-NEXT: por %xmm1, %xmm5 +; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; SSSE3-NEXT: pshufb %xmm0, %xmm5 +; SSSE3-NEXT: pshufb %xmm0, %xmm6 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; SSSE3-NEXT: movq %xmm6, (%rdi) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_usat_v8i32_v8i8_store: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255] +; SSE41-NEXT: pminud %xmm2, %xmm0 +; SSE41-NEXT: pminud %xmm2, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; SSE41-NEXT: pshufb %xmm2, %xmm1 +; SSE41-NEXT: pshufb %xmm2, %xmm0 +; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE41-NEXT: movq %xmm0, (%rdi) +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc_usat_v8i32_v8i8_store: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255] +; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-NEXT: vmovq %xmm0, (%rdi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_usat_v8i32_v8i8_store: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-NEXT: vmovq %xmm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_usat_v8i32_v8i8_store: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vmovq %xmm0, (%rdi) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_usat_v8i32_v8i8_store: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovusdb %ymm0, (%rdi) +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_usat_v8i32_v8i8_store: +; AVX512BW: # %bb.0: +; 
AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rdi) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_usat_v8i32_v8i8_store: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovusdb %ymm0, (%rdi) +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq + %1 = icmp ult <8 x i32> %a0, + %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> + %3 = trunc <8 x i32> %2 to <8 x i8> + store <8 x i8> %3, <8 x i8> *%p1 + ret void +} + +define <16 x i8> @trunc_usat_v16i32_v16i8(<16 x i32> %a0) { +; SSE2-LABEL: trunc_usat_v16i32_v16i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255] +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm1, %xmm7 +; SSE2-NEXT: pxor %xmm6, %xmm7 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483903,2147483903,2147483903,2147483903] +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm8, %xmm4 +; SSE2-NEXT: por %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm6, %xmm1 +; SSE2-NEXT: movdqa %xmm5, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm7 +; SSE2-NEXT: pand %xmm7, %xmm0 +; SSE2-NEXT: pandn %xmm8, %xmm7 +; SSE2-NEXT: por %xmm7, %xmm0 +; SSE2-NEXT: packuswb %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pxor %xmm6, %xmm1 +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: pandn %xmm8, %xmm4 +; SSE2-NEXT: por %xmm3, %xmm4 +; SSE2-NEXT: pxor %xmm2, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pandn %xmm8, %xmm5 +; SSE2-NEXT: por %xmm2, %xmm5 +; SSE2-NEXT: packuswb %xmm4, %xmm5 +; SSE2-NEXT: packuswb %xmm5, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc_usat_v16i32_v16i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255] +; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm1, %xmm7 +; SSSE3-NEXT: pxor %xmm6, %xmm7 +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483903,2147483903,2147483903,2147483903] +; SSSE3-NEXT: movdqa %xmm5, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm4 +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pandn %xmm8, %xmm4 +; SSSE3-NEXT: por %xmm1, %xmm4 +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm6, %xmm1 +; SSSE3-NEXT: movdqa %xmm5, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm7 +; SSSE3-NEXT: pand %xmm7, %xmm0 +; SSSE3-NEXT: pandn %xmm8, %xmm7 +; SSSE3-NEXT: por %xmm7, %xmm0 +; SSSE3-NEXT: packuswb %xmm4, %xmm0 +; SSSE3-NEXT: movdqa %xmm3, %xmm1 +; SSSE3-NEXT: pxor %xmm6, %xmm1 +; SSSE3-NEXT: movdqa %xmm5, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 +; SSSE3-NEXT: pand %xmm4, %xmm3 +; SSSE3-NEXT: pandn %xmm8, %xmm4 +; SSSE3-NEXT: por %xmm3, %xmm4 +; SSSE3-NEXT: pxor %xmm2, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm2 +; SSSE3-NEXT: pandn %xmm8, %xmm5 +; SSSE3-NEXT: por %xmm2, %xmm5 +; SSSE3-NEXT: packuswb %xmm4, %xmm5 +; SSSE3-NEXT: packuswb %xmm5, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_usat_v16i32_v16i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255] +; SSE41-NEXT: pminud %xmm4, %xmm1 +; SSE41-NEXT: pminud %xmm4, %xmm0 +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: pminud %xmm4, %xmm3 +; SSE41-NEXT: pminud %xmm4, %xmm2 +; SSE41-NEXT: packusdw %xmm3, 
%xmm2 +; SSE41-NEXT: packuswb %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc_usat_v16i32_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255] +; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpminud %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_usat_v16i32_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpminud %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_usat_v16i32_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovusdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = icmp ult <16 x i32> %a0, + %2 = select <16 x i1> %1, <16 x i32> %a0, <16 x i32> + %3 = trunc <16 x i32> %2 to <16 x i8> + ret <16 x i8> %3 +} + +define <16 x i8> @trunc_usat_v16i16_v16i8(<16 x i16> %a0) { +; SSE2-LABEL: trunc_usat_v16i16_v16i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [33023,33023,33023,33023,33023,33023,33023,33023] +; SSE2-NEXT: pminsw %xmm3, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pminsw %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc_usat_v16i16_v16i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSSE3-NEXT: pxor %xmm2, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [33023,33023,33023,33023,33023,33023,33023,33023] +; SSSE3-NEXT: pminsw %xmm3, %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm0 +; SSSE3-NEXT: pminsw %xmm3, %xmm0 +; SSSE3-NEXT: pxor %xmm2, %xmm0 +; SSSE3-NEXT: packuswb %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_usat_v16i16_v16i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: pminuw %xmm2, %xmm1 +; SSE41-NEXT: pminuw %xmm2, %xmm0 +; SSE41-NEXT: packuswb %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc_usat_v16i16_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpminuw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpminuw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_usat_v16i16_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpminuw {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_usat_v16i16_v16i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpminuw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_usat_v16i16_v16i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpminuw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_usat_v16i16_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpminuw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_usat_v16i16_v16i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovuswb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq + %1 = icmp ult <16 x i16> %a0, + %2 = select <16 x i1> %1, <16 x i16> %a0, <16 x i16> + %3 = trunc <16 x i16> %2 to <16 x i8> + ret <16 x i8> %3 +} + +define <32 x i8> @trunc_usat_v32i16_v32i8(<32 x i16> %a0) { +; SSE2-LABEL: trunc_usat_v32i16_v32i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSE2-NEXT: pxor %xmm4, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [33023,33023,33023,33023,33023,33023,33023,33023] +; SSE2-NEXT: pminsw %xmm5, %xmm3 +; SSE2-NEXT: pxor %xmm4, %xmm3 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: pminsw %xmm5, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: packuswb %xmm3, %xmm2 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: pminsw %xmm5, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: pminsw %xmm5, %xmm0 +; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc_usat_v32i16_v32i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSSE3-NEXT: pxor %xmm4, %xmm3 +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [33023,33023,33023,33023,33023,33023,33023,33023] +; SSSE3-NEXT: pminsw %xmm5, %xmm3 +; SSSE3-NEXT: pxor %xmm4, %xmm3 +; SSSE3-NEXT: pxor %xmm4, %xmm2 +; SSSE3-NEXT: pminsw %xmm5, %xmm2 +; SSSE3-NEXT: pxor %xmm4, %xmm2 +; SSSE3-NEXT: packuswb %xmm3, %xmm2 +; SSSE3-NEXT: pxor %xmm4, %xmm1 +; SSSE3-NEXT: pminsw %xmm5, %xmm1 +; SSSE3-NEXT: pxor %xmm4, %xmm1 +; SSSE3-NEXT: pxor %xmm4, %xmm0 +; SSSE3-NEXT: pminsw %xmm5, %xmm0 +; SSSE3-NEXT: pxor %xmm4, %xmm0 +; SSSE3-NEXT: packuswb %xmm1, %xmm0 +; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc_usat_v32i16_v32i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: pminuw %xmm4, %xmm3 +; SSE41-NEXT: pminuw %xmm4, %xmm2 +; SSE41-NEXT: packuswb %xmm3, %xmm2 +; SSE41-NEXT: pminuw %xmm4, %xmm1 +; SSE41-NEXT: pminuw %xmm4, %xmm0 +; SSE41-NEXT: packuswb %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc_usat_v32i16_v32i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: 
vpminuw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpminuw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpminuw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpminuw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_usat_v32i16_v32i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpminuw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpminuw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc_usat_v32i16_v32i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpminuw %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpminuw %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_usat_v32i16_v32i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VL-NEXT: vpminuw %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vpminuw %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_usat_v32i16_v32i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmovuswb %zmm0, %ymm0 +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_usat_v32i16_v32i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovuswb %zmm0, %ymm0 +; AVX512BWVL-NEXT: retq + %1 = icmp ult <32 x i16> %a0, + %2 = select <32 x i1> %1, <32 x i16> %a0, <32 x i16> + %3 = trunc <32 x i16> %2 to <32 x i8> + ret <32 x i8> %3 +} diff --git a/llvm/test/CodeGen/X86/vector-trunc-widen.ll b/llvm/test/CodeGen/X86/vector-trunc-widen.ll new file mode 100644 index 000000000000..e05bdf753737 --- /dev/null +++ b/llvm/test/CodeGen/X86/vector-trunc-widen.ll @@ -0,0 +1,2223 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s 
-x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW +; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL + +define <8 x i32> @trunc8i64_8i32(<8 x i64> %a) { +; SSE-LABEL: trunc8i64_8i32: +; SSE: # %bb.0: # %entry +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc8i64_8i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc8i64_8i32: +; AVX2-SLOW: # %bb.0: # %entry +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc8i64_8i32: +; AVX2-FAST: # %bb.0: # %entry +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc8i64_8i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: retq +entry: + %0 = trunc <8 x i64> %a to <8 x i32> + ret <8 x i32> %0 +} + +define <8 x i32> @trunc8i64_8i32_ashr(<8 x i64> %a) { +; SSE-LABEL: trunc8i64_8i32_ashr: +; SSE: # %bb.0: # %entry +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3] +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc8i64_8i32_ashr: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3] +; 
AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc8i64_8i32_ashr: +; AVX2-SLOW: # %bb.0: # %entry +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc8i64_8i32_ashr: +; AVX2-FAST: # %bb.0: # %entry +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [1,3,5,7,5,7,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc8i64_8i32_ashr: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpsraq $32, %zmm0, %zmm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: retq +entry: + %0 = ashr <8 x i64> %a, + %1 = trunc <8 x i64> %0 to <8 x i32> + ret <8 x i32> %1 +} + +define <8 x i32> @trunc8i64_8i32_lshr(<8 x i64> %a) { +; SSE-LABEL: trunc8i64_8i32_lshr: +; SSE: # %bb.0: # %entry +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3] +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc8i64_8i32_lshr: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc8i64_8i32_lshr: +; AVX2-SLOW: # %bb.0: # %entry +; AVX2-SLOW-NEXT: vpsrlq $32, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc8i64_8i32_lshr: +; AVX2-FAST: # %bb.0: # %entry +; AVX2-FAST-NEXT: vpsrlq $32, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc8i64_8i32_lshr: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: retq +entry: + %0 = lshr <8 x i64> %a, + %1 = trunc <8 x i64> %0 to <8 x i32> + ret <8 x i32> %1 +} + +define <8 x i16> @trunc8i64_8i16(<8 x i64> %a) { +; SSE2-LABEL: trunc8i64_8i16: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc8i64_8i16: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSSE3-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc8i64_8i16: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7] +; SSE41-NEXT: packusdw %xmm3, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7] +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: packusdw %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc8i64_8i16: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7] +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc8i64_8i16: +; AVX2-SLOW: # %bb.0: # %entry +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc8i64_8i16: +; AVX2-FAST: # %bb.0: # %entry +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc8i64_8i16: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq +entry: + %0 = trunc <8 x i64> %a to <8 x i16> + ret <8 x i16> %0 +} + 
+define void @trunc8i64_8i8(<8 x i64> %a) { +; SSE2-LABEL: trunc8i64_8i8: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: packuswb %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: movq %xmm0, (%rax) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc8i64_8i8: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSSE3-NEXT: pand %xmm4, %xmm3 +; SSSE3-NEXT: pand %xmm4, %xmm2 +; SSSE3-NEXT: packuswb %xmm3, %xmm2 +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pand %xmm4, %xmm0 +; SSSE3-NEXT: packuswb %xmm1, %xmm0 +; SSSE3-NEXT: packuswb %xmm2, %xmm0 +; SSSE3-NEXT: packuswb %xmm0, %xmm0 +; SSSE3-NEXT: movq %xmm0, (%rax) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc8i64_8i8: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSE41-NEXT: pand %xmm4, %xmm3 +; SSE41-NEXT: pand %xmm4, %xmm2 +; SSE41-NEXT: packusdw %xmm3, %xmm2 +; SSE41-NEXT: pand %xmm4, %xmm1 +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: packusdw %xmm2, %xmm0 +; SSE41-NEXT: packuswb %xmm0, %xmm0 +; SSE41-NEXT: movq %xmm0, (%rax) +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc8i64_8i8: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [1.2598673968951787E-321,1.2598673968951787E-321] +; AVX1-NEXT: # xmm3 = mem[0,0] +; AVX1-NEXT: vandpd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandpd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vandpd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandpd %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, (%rax) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc8i64_8i8: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2-NEXT: vmovq %xmm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc8i64_8i8: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovqb %zmm0, (%rax) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq +entry: + %0 = trunc <8 x i64> %a to <8 x i8> + store <8 x i8> %0, <8 x i8>* undef, align 4 + ret void +} + +define <8 x i16> @trunc8i32_8i16(<8 x i32> %a) { +; SSE2-LABEL: trunc8i32_8i16: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: pslld $16, %xmm1 +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: pslld $16, %xmm0 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: packssdw %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc8i32_8i16: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 
= [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSSE3-NEXT: pshufb %xmm2, %xmm1 +; SSSE3-NEXT: pshufb %xmm2, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc8i32_8i16: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSE41-NEXT: pshufb %xmm2, %xmm1 +; SSE41-NEXT: pshufb %xmm2, %xmm0 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc8i32_8i16: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc8i32_8i16: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc8i32_8i16: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc8i32_8i16: +; AVX512VL: # %bb.0: # %entry +; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc8i32_8i16: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc8i32_8i16: +; AVX512BWVL: # %bb.0: # %entry +; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +entry: + %0 = trunc <8 x i32> %a to <8 x i16> + ret <8 x i16> %0 +} + +define <8 x i16> @trunc8i32_8i16_ashr(<8 x i32> %a) { +; SSE-LABEL: trunc8i32_8i16_ashr: +; SSE: # %bb.0: # %entry +; SSE-NEXT: psrad $16, %xmm1 +; SSE-NEXT: psrad $16, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc8i32_8i16_ashr: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1 +; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc8i32_8i16_ashr: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpsrad $16, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc8i32_8i16_ashr: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: vpsrad $16, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc8i32_8i16_ashr: +; AVX512VL: # %bb.0: # %entry +; AVX512VL-NEXT: vpsrad $16, %ymm0, %ymm0 +; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc8i32_8i16_ashr: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: vpsrad $16, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def 
$xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc8i32_8i16_ashr: +; AVX512BWVL: # %bb.0: # %entry +; AVX512BWVL-NEXT: vpsrad $16, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +entry: + %0 = ashr <8 x i32> %a, + %1 = trunc <8 x i32> %0 to <8 x i16> + ret <8 x i16> %1 +} + +define <8 x i16> @trunc8i32_8i16_lshr(<8 x i32> %a) { +; SSE2-LABEL: trunc8i32_8i16_lshr: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: pslld $16, %xmm1 +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: pslld $16, %xmm0 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: packssdw %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc8i32_8i16_lshr: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,10,11,14,15,14,15,255,255] +; SSSE3-NEXT: pshufb %xmm2, %xmm1 +; SSSE3-NEXT: pshufb %xmm2, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc8i32_8i16_lshr: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc8i32_8i16_lshr: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc8i32_8i16_lshr: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc8i32_8i16_lshr: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc8i32_8i16_lshr: +; AVX512VL: # %bb.0: # %entry +; AVX512VL-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc8i32_8i16_lshr: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc8i32_8i16_lshr: +; AVX512BWVL: # %bb.0: # %entry +; AVX512BWVL-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +entry: + %0 = lshr <8 x i32> %a, + %1 = trunc <8 x i32> %0 to <8 x i16> + ret <8 x i16> %1 +} + +define void @trunc8i32_8i8(<8 x i32> %a) { +; SSE2-LABEL: trunc8i32_8i8: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: movq %xmm0, (%rax) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc8i32_8i8: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; SSSE3-NEXT: pshufb %xmm2, %xmm1 +; SSSE3-NEXT: pshufb %xmm2, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: movq %xmm0, (%rax) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: 
trunc8i32_8i8: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; SSE41-NEXT: pshufb %xmm2, %xmm1 +; SSE41-NEXT: pshufb %xmm2, %xmm0 +; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE41-NEXT: movq %xmm0, (%rax) +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc8i32_8i8: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-NEXT: vmovq %xmm0, (%rax) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc8i32_8i8: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-NEXT: vmovq %xmm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc8i32_8i8: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vmovq %xmm0, (%rax) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc8i32_8i8: +; AVX512VL: # %bb.0: # %entry +; AVX512VL-NEXT: vpmovdb %ymm0, (%rax) +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc8i32_8i8: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rax) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc8i32_8i8: +; AVX512BWVL: # %bb.0: # %entry +; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rax) +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +entry: + %0 = trunc <8 x i32> %a to <8 x i8> + store <8 x i8> %0, <8 x i8>* undef, align 4 + ret void +} + +define void @trunc16i32_16i16(<16 x i32> %a) { +; SSE2-LABEL: trunc16i32_16i16: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: pslld $16, %xmm1 +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: pslld $16, %xmm0 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: packssdw %xmm1, %xmm0 +; SSE2-NEXT: pslld $16, %xmm3 +; SSE2-NEXT: psrad $16, %xmm3 +; SSE2-NEXT: pslld $16, %xmm2 +; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: packssdw %xmm3, %xmm2 +; SSE2-NEXT: movdqu %xmm2, (%rax) +; SSE2-NEXT: movdqu %xmm0, (%rax) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc16i32_16i16: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: pslld $16, %xmm1 +; SSSE3-NEXT: psrad $16, %xmm1 +; SSSE3-NEXT: pslld $16, %xmm0 +; SSSE3-NEXT: psrad $16, %xmm0 +; SSSE3-NEXT: packssdw %xmm1, %xmm0 +; SSSE3-NEXT: pslld $16, %xmm3 +; SSSE3-NEXT: psrad $16, %xmm3 +; SSSE3-NEXT: pslld $16, %xmm2 +; SSSE3-NEXT: psrad $16, %xmm2 +; SSSE3-NEXT: packssdw %xmm3, %xmm2 +; SSSE3-NEXT: movdqu %xmm2, (%rax) +; SSSE3-NEXT: movdqu %xmm0, (%rax) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc16i32_16i16: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7] +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4],xmm4[5],xmm3[6],xmm4[7] +; SSE41-NEXT: pblendw 
{{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7] +; SSE41-NEXT: packusdw %xmm3, %xmm2 +; SSE41-NEXT: movdqu %xmm2, (%rax) +; SSE41-NEXT: movdqu %xmm0, (%rax) +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc16i32_16i16: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4],xmm3[5],xmm2[6],xmm3[7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3],xmm1[4],xmm3[5],xmm1[6],xmm3[7] +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4],xmm3[5],xmm2[6],xmm3[7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7] +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vmovups %ymm0, (%rax) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc16i32_16i16: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqu %ymm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc16i32_16i16: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovdw %zmm0, (%rax) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq +entry: + %0 = trunc <16 x i32> %a to <16 x i16> + store <16 x i16> %0, <16 x i16>* undef, align 4 + ret void +} + +define void @trunc16i32_16i16_ashr(<16 x i32> %a) { +; SSE-LABEL: trunc16i32_16i16_ashr: +; SSE: # %bb.0: # %entry +; SSE-NEXT: psrad $16, %xmm3 +; SSE-NEXT: psrad $16, %xmm2 +; SSE-NEXT: packssdw %xmm3, %xmm2 +; SSE-NEXT: psrad $16, %xmm1 +; SSE-NEXT: psrad $16, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: movdqu %xmm2, (%rax) +; SSE-NEXT: movdqu %xmm0, (%rax) +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc16i32_16i16_ashr: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpsrad $16, %xmm2, %xmm2 +; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpsrad $16, %xmm2, %xmm2 +; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1 +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vmovups %ymm0, (%rax) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc16i32_16i16_ashr: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpsrad $16, %ymm1, %ymm1 +; AVX2-NEXT: vpsrad $16, %ymm0, %ymm0 +; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vmovdqu %ymm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc16i32_16i16_ashr: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpsrld $16, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdw %zmm0, (%rax) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq +entry: + %0 = ashr <16 x i32> %a, + %1 = trunc <16 x i32> %0 to <16 x i16> + store <16 x i16> %1, <16 x i16>* undef, align 4 + ret void +} + +define void @trunc16i32_16i16_lshr(<16 x i32> %a) { +; SSE2-LABEL: trunc16i32_16i16_lshr: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: psrld $16, %xmm2 +; SSE2-NEXT: 
psrld $16, %xmm3 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: pslld $16, %xmm1 +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: pslld $16, %xmm0 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: packssdw %xmm1, %xmm0 +; SSE2-NEXT: pslld $16, %xmm3 +; SSE2-NEXT: psrad $16, %xmm3 +; SSE2-NEXT: pslld $16, %xmm2 +; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: packssdw %xmm3, %xmm2 +; SSE2-NEXT: movdqu %xmm2, (%rax) +; SSE2-NEXT: movdqu %xmm0, (%rax) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc16i32_16i16_lshr: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: psrld $16, %xmm2 +; SSSE3-NEXT: psrld $16, %xmm3 +; SSSE3-NEXT: psrld $16, %xmm0 +; SSSE3-NEXT: psrld $16, %xmm1 +; SSSE3-NEXT: pslld $16, %xmm1 +; SSSE3-NEXT: psrad $16, %xmm1 +; SSSE3-NEXT: pslld $16, %xmm0 +; SSSE3-NEXT: psrad $16, %xmm0 +; SSSE3-NEXT: packssdw %xmm1, %xmm0 +; SSSE3-NEXT: pslld $16, %xmm3 +; SSSE3-NEXT: psrad $16, %xmm3 +; SSSE3-NEXT: pslld $16, %xmm2 +; SSSE3-NEXT: psrad $16, %xmm2 +; SSSE3-NEXT: packssdw %xmm3, %xmm2 +; SSSE3-NEXT: movdqu %xmm2, (%rax) +; SSSE3-NEXT: movdqu %xmm0, (%rax) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc16i32_16i16_lshr: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: psrld $16, %xmm3 +; SSE41-NEXT: psrld $16, %xmm2 +; SSE41-NEXT: packusdw %xmm3, %xmm2 +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: movdqu %xmm2, (%rax) +; SSE41-NEXT: movdqu %xmm0, (%rax) +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc16i32_16i16_lshr: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vmovups %ymm0, (%rax) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc16i32_16i16_lshr: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1 +; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vmovdqu %ymm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc16i32_16i16_lshr: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpsrld $16, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdw %zmm0, (%rax) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq +entry: + %0 = lshr <16 x i32> %a, + %1 = trunc <16 x i32> %0 to <16 x i16> + store <16 x i16> %1, <16 x i16>* undef, align 4 + ret void +} + +define void @trunc16i32_16i8(<16 x i32> %a) { +; SSE2-LABEL: trunc16i32_16i8: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: packuswb %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: movdqu %xmm0, (%rax) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc16i32_16i8: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSSE3-NEXT: pand %xmm4, %xmm3 +; SSSE3-NEXT: pand %xmm4, %xmm2 +; SSSE3-NEXT: packuswb %xmm3, %xmm2 +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pand %xmm4, %xmm0 +; SSSE3-NEXT: packuswb %xmm1, %xmm0 +; SSSE3-NEXT: packuswb %xmm2, %xmm0 +; SSSE3-NEXT: movdqu %xmm0, 
(%rax) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc16i32_16i8: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE41-NEXT: pand %xmm4, %xmm3 +; SSE41-NEXT: pand %xmm4, %xmm2 +; SSE41-NEXT: packusdw %xmm3, %xmm2 +; SSE41-NEXT: pand %xmm4, %xmm1 +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: packuswb %xmm2, %xmm0 +; SSE41-NEXT: movdqu %xmm0, (%rax) +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc16i32_16i8: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43] +; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc16i32_16i8: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vmovdqu %xmm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc16i32_16i8: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovdb %zmm0, (%rax) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq +entry: + %0 = trunc <16 x i32> %a to <16 x i8> + store <16 x i8> %0, <16 x i8>* undef, align 4 + ret void +} + +define void @trunc16i32_16i8_ashr(<16 x i32> %a) { +; SSE-LABEL: trunc16i32_16i8_ashr: +; SSE: # %bb.0: # %entry +; SSE-NEXT: psrad $24, %xmm1 +; SSE-NEXT: psrad $24, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: psrad $24, %xmm3 +; SSE-NEXT: psrad $24, %xmm2 +; SSE-NEXT: packssdw %xmm3, %xmm2 +; SSE-NEXT: packsswb %xmm2, %xmm0 +; SSE-NEXT: movdqu %xmm0, (%rax) +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc16i32_16i8_ashr: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpsrad $24, %xmm2, %xmm2 +; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpsrad $24, %xmm2, %xmm2 +; AVX1-NEXT: vpsrad $24, %xmm1, %xmm1 +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc16i32_16i8_ashr: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpsrad $24, %ymm1, %ymm1 +; AVX2-NEXT: vpsrad $24, %ymm0, %ymm0 +; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqu %xmm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc16i32_16i8_ashr: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpsrld $24, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, (%rax) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: 
retq +entry: + %0 = ashr <16 x i32> %a, + %1 = trunc <16 x i32> %0 to <16 x i8> + store <16 x i8> %1, <16 x i8>* undef, align 4 + ret void +} + +define void @trunc16i32_16i8_lshr(<16 x i32> %a) { +; SSE2-LABEL: trunc16i32_16i8_lshr: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: psrld $24, %xmm1 +; SSE2-NEXT: psrld $24, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: psrld $24, %xmm3 +; SSE2-NEXT: psrld $24, %xmm2 +; SSE2-NEXT: packuswb %xmm3, %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: movdqu %xmm0, (%rax) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc16i32_16i8_lshr: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: psrld $24, %xmm1 +; SSSE3-NEXT: psrld $24, %xmm0 +; SSSE3-NEXT: packuswb %xmm1, %xmm0 +; SSSE3-NEXT: psrld $24, %xmm3 +; SSSE3-NEXT: psrld $24, %xmm2 +; SSSE3-NEXT: packuswb %xmm3, %xmm2 +; SSSE3-NEXT: packuswb %xmm2, %xmm0 +; SSSE3-NEXT: movdqu %xmm0, (%rax) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc16i32_16i8_lshr: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: psrld $24, %xmm1 +; SSE41-NEXT: psrld $24, %xmm0 +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: psrld $24, %xmm3 +; SSE41-NEXT: psrld $24, %xmm2 +; SSE41-NEXT: packusdw %xmm3, %xmm2 +; SSE41-NEXT: packuswb %xmm2, %xmm0 +; SSE41-NEXT: movdqu %xmm0, (%rax) +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc16i32_16i8_lshr: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpsrld $24, %xmm2, %xmm2 +; AVX1-NEXT: vpsrld $24, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpsrld $24, %xmm2, %xmm2 +; AVX1-NEXT: vpsrld $24, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc16i32_16i8_lshr: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpsrld $24, %ymm1, %ymm1 +; AVX2-NEXT: vpsrld $24, %ymm0, %ymm0 +; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqu %xmm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc16i32_16i8_lshr: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpsrld $24, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, (%rax) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq +entry: + %0 = lshr <16 x i32> %a, + %1 = trunc <16 x i32> %0 to <16 x i8> + store <16 x i8> %1, <16 x i8>* undef, align 4 + ret void +} + +;PR25684 +define void @trunc16i16_16i8(<16 x i16> %a) { +; SSE2-LABEL: trunc16i16_16i8: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: movdqu %xmm0, (%rax) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc16i16_16i8: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; SSSE3-NEXT: pshufb %xmm2, %xmm1 +; SSSE3-NEXT: pshufb %xmm2, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: movdqu %xmm0, (%rax) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc16i16_16i8: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; SSE41-NEXT: pshufb %xmm2, %xmm1 +; SSE41-NEXT: pshufb %xmm2, %xmm0 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE41-NEXT: movdqu %xmm0, (%rax) +; SSE41-NEXT: retq +; +; AVX1-LABEL: 
trunc16i16_16i8: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc16i16_16i8: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vmovdqu %xmm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc16i16_16i8: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, (%rax) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc16i16_16i8: +; AVX512VL: # %bb.0: # %entry +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VL-NEXT: vpmovdb %zmm0, (%rax) +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc16i16_16i8: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: vmovdqu %xmm0, (%rax) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc16i16_16i8: +; AVX512BWVL: # %bb.0: # %entry +; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rax) +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +entry: + %0 = trunc <16 x i16> %a to <16 x i8> + store <16 x i8> %0, <16 x i8>* undef, align 4 + ret void +} + +define void @trunc16i16_16i8_ashr(<16 x i16> %a) { +; SSE-LABEL: trunc16i16_16i8_ashr: +; SSE: # %bb.0: # %entry +; SSE-NEXT: psraw $8, %xmm1 +; SSE-NEXT: psraw $8, %xmm0 +; SSE-NEXT: packsswb %xmm1, %xmm0 +; SSE-NEXT: movdqu %xmm0, (%rax) +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc16i16_16i8_ashr: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1 +; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc16i16_16i8_ashr: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpsraw $8, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqu %xmm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc16i16_16i8_ashr: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: vpsraw $8, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, (%rax) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc16i16_16i8_ashr: +; AVX512VL: # %bb.0: # %entry +; 
AVX512VL-NEXT: vpsraw $8, %ymm0, %ymm0 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VL-NEXT: vpmovdb %zmm0, (%rax) +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc16i16_16i8_ashr: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: vpsraw $8, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: vmovdqu %xmm0, (%rax) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc16i16_16i8_ashr: +; AVX512BWVL: # %bb.0: # %entry +; AVX512BWVL-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rax) +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +entry: + %0 = ashr <16 x i16> %a, + %1 = trunc <16 x i16> %0 to <16 x i8> + store <16 x i8> %1, <16 x i8>* undef, align 4 + ret void +} + +define void @trunc16i16_16i8_lshr(<16 x i16> %a) { +; SSE-LABEL: trunc16i16_16i8_lshr: +; SSE: # %bb.0: # %entry +; SSE-NEXT: psrlw $8, %xmm1 +; SSE-NEXT: psrlw $8, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: movdqu %xmm0, (%rax) +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc16i16_16i8_lshr: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc16i16_16i8_lshr: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqu %xmm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc16i16_16i8_lshr: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, (%rax) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc16i16_16i8_lshr: +; AVX512VL: # %bb.0: # %entry +; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VL-NEXT: vpmovdb %zmm0, (%rax) +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc16i16_16i8_lshr: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: vmovdqu %xmm0, (%rax) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc16i16_16i8_lshr: +; AVX512BWVL: # %bb.0: # %entry +; AVX512BWVL-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rax) +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +entry: + %0 = lshr <16 x i16> %a, + %1 = trunc <16 x i16> %0 to <16 x i8> + store <16 x i8> %1, <16 x i8>* undef, align 4 + ret void +} + +define void @trunc32i16_32i8(<32 x i16> %a) { +; SSE2-LABEL: trunc32i16_32i8: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; 
SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: packuswb %xmm3, %xmm2 +; SSE2-NEXT: movdqu %xmm2, (%rax) +; SSE2-NEXT: movdqu %xmm0, (%rax) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc32i16_32i8: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; SSSE3-NEXT: pshufb %xmm4, %xmm1 +; SSSE3-NEXT: pshufb %xmm4, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: pshufb %xmm4, %xmm3 +; SSSE3-NEXT: pshufb %xmm4, %xmm2 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSSE3-NEXT: movdqu %xmm2, (%rax) +; SSSE3-NEXT: movdqu %xmm0, (%rax) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc32i16_32i8: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; SSE41-NEXT: pshufb %xmm4, %xmm1 +; SSE41-NEXT: pshufb %xmm4, %xmm0 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE41-NEXT: pshufb %xmm4, %xmm3 +; SSE41-NEXT: pshufb %xmm4, %xmm2 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE41-NEXT: movdqu %xmm2, (%rax) +; SSE41-NEXT: movdqu %xmm0, (%rax) +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc32i16_32i8: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vmovups %ymm0, (%rax) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc32i16_32i8: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqu %ymm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc32i16_32i8: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqu %ymm0, (%rax) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc32i16_32i8: +; AVX512VL: # %bb.0: # %entry +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vmovdqu %ymm0, (%rax) +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc32i16_32i8: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: vpmovwb %zmm0, (%rax) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc32i16_32i8: +; AVX512BWVL: # %bb.0: # %entry +; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rax) +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +entry: + %0 = trunc <32 x i16> %a to <32 x i8> + store <32 x i8> %0, <32 x i8>* undef, align 4 + ret void +} + +define <8 x i32> @trunc2x4i64_8i32(<4 x i64> %a, <4 x i64> %b) { +; SSE-LABEL: trunc2x4i64_8i32: +; SSE: # %bb.0: # %entry +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc2x4i64_8i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc2x4i64_8i32: +; AVX2-SLOW: # %bb.0: # %entry +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc2x4i64_8i32: +; AVX2-FAST: # %bb.0: # %entry +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: retq +; +; AVX512F-LABEL: trunc2x4i64_8i32: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc2x4i64_8i32: +; AVX512VL: # %bb.0: # %entry +; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0 +; AVX512VL-NEXT: vpmovqd %ymm1, %xmm1 +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc2x4i64_8i32: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc2x4i64_8i32: +; AVX512BWVL: # %bb.0: # %entry +; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0 +; AVX512BWVL-NEXT: vpmovqd %ymm1, %xmm1 +; AVX512BWVL-NEXT: 
vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: retq +entry: + %0 = trunc <4 x i64> %a to <4 x i32> + %1 = trunc <4 x i64> %b to <4 x i32> + %2 = shufflevector <4 x i32> %0, <4 x i32> %1, <8 x i32> + ret <8 x i32> %2 +} + +define <8 x i16> @trunc2x4i64_8i16(<4 x i64> %a, <4 x i64> %b) { +; SSE2-LABEL: trunc2x4i64_8i16: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc2x4i64_8i16: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSSE3-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc2x4i64_8i16: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE41-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] +; SSE41-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc2x4i64_8i16: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc2x4i64_8i16: +; AVX2-SLOW: # %bb.0: # %entry +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; 
AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc2x4i64_8i16: +; AVX2-FAST: # %bb.0: # %entry +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: trunc2x4i64_8i16: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vpmovqw %zmm1, %xmm1 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq +entry: + %0 = trunc <4 x i64> %a to <4 x i16> + %1 = trunc <4 x i64> %b to <4 x i16> + %2 = shufflevector <4 x i16> %0, <4 x i16> %1, <8 x i32> + ret <8 x i16> %2 +} + +define <4 x i32> @trunc2x2i64_4i32(<2 x i64> %a, <2 x i64> %b) { +; SSE-LABEL: trunc2x2i64_4i32: +; SSE: # %bb.0: # %entry +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: retq +; +; AVX1-LABEL: trunc2x2i64_4i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc2x2i64_4i32: +; AVX2-SLOW: # %bb.0: # %entry +; AVX2-SLOW-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 +; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc2x2i64_4i32: +; AVX2-FAST: # %bb.0: # %entry +; AVX2-FAST-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 +; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512F-LABEL: trunc2x2i64_4i32: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; 
AVX512F-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc2x2i64_4i32:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
+; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0
+; AVX512VL-NEXT: vpmovqd %ymm1, %xmm1
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc2x2i64_4i32:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc2x2i64_4i32:
+; AVX512BWVL: # %bb.0: # %entry
+; AVX512BWVL-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
+; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0
+; AVX512BWVL-NEXT: vpmovqd %ymm1, %xmm1
+; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+entry:
+ %0 = trunc <2 x i64> %a to <2 x i32>
+ %1 = trunc <2 x i64> %b to <2 x i32>
+ %2 = shufflevector <2 x i32> %0, <2 x i32> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %2
+}
+
+define i64 @trunc2i64_i64(<2 x i64> %inval) {
+; SSE-LABEL: trunc2i64_i64:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT: movq %xmm0, %rax
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc2i64_i64:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: retq
+;
+; AVX2-SLOW-LABEL: trunc2i64_i64:
+; AVX2-SLOW: # %bb.0: # %entry
+; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vmovq %xmm0, %rax
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc2i64_i64:
+; AVX2-FAST: # %bb.0: # %entry
+; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vmovq %xmm0, %rax
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
+;
+; AVX512F-LABEL: trunc2i64_i64:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512F-NEXT: vmovq %xmm0, %rax
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc2i64_i64:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0
+; AVX512VL-NEXT: vmovq %xmm0, %rax
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc2i64_i64:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512BW-NEXT: vmovq %xmm0, %rax
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc2i64_i64:
+; AVX512BWVL: # %bb.0: # %entry
+; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0
+; AVX512BWVL-NEXT: vmovq %xmm0, %rax
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+entry:
+ %0 = trunc <2 x i64> %inval to <2 x i32>
+ %1 = bitcast <2 x i32> %0
to i64 + ret i64 %1 +} + +define <8 x i16> @trunc2x4i32_8i16(<4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: trunc2x4i32_8i16: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc2x4i32_8i16: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSSE3-NEXT: pshufb %xmm2, %xmm1 +; SSSE3-NEXT: pshufb %xmm2, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc2x4i32_8i16: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSE41-NEXT: pshufb %xmm2, %xmm1 +; SSE41-NEXT: pshufb %xmm2, %xmm0 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE41-NEXT: retq +; +; AVX1-LABEL: trunc2x4i32_8i16: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc2x4i32_8i16: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: retq +; +; AVX512F-LABEL: trunc2x4i32_8i16: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc2x4i32_8i16: +; AVX512VL: # %bb.0: # %entry +; AVX512VL-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 +; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512VL-NEXT: vpmovdw %ymm1, %xmm1 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc2x4i32_8i16: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc2x4i32_8i16: +; AVX512BWVL: # %bb.0: # %entry +; AVX512BWVL-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 +; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512BWVL-NEXT: vpmovdw %ymm1, %xmm1 +; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +entry: + %0 = trunc <4 x i32> %a to <4 x i16> + %1 = trunc <4 x i32> %b to <4 x i16> + %2 = shufflevector <4 x i16> %0, <4 x i16> %1, <8 x i32> + ret <8 x i16> %2 +} + +; PR15524 
http://llvm.org/bugs/show_bug.cgi?id=15524 +define i64 @trunc4i32_i64(<4 x i32> %inval) { +; SSE2-LABEL: trunc4i32_i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc4i32_i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSSE3-NEXT: movq %xmm0, %rax +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc4i32_i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: retq +; +; AVX-LABEL: trunc4i32_i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: retq +; +; AVX512F-LABEL: trunc4i32_i64: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc4i32_i64: +; AVX512VL: # %bb.0: # %entry +; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc4i32_i64: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: vmovq %xmm0, %rax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc4i32_i64: +; AVX512BWVL: # %bb.0: # %entry +; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovq %xmm0, %rax +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +entry: + %0 = trunc <4 x i32> %inval to <4 x i16> + %1 = bitcast <4 x i16> %0 to i64 + ret i64 %1 +} + +define <16 x i8> @trunc2x8i16_16i8(<8 x i16> %a, <8 x i16> %b) { +; SSE2-LABEL: trunc2x8i16_16i8: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc2x8i16_16i8: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; SSSE3-NEXT: pshufb %xmm2, %xmm1 +; SSSE3-NEXT: pshufb %xmm2, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc2x8i16_16i8: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; SSE41-NEXT: pshufb %xmm2, %xmm1 +; SSE41-NEXT: pshufb %xmm2, %xmm0 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE41-NEXT: retq +; +; AVX-LABEL: trunc2x8i16_16i8: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq +; +; AVX512F-LABEL: trunc2x8i16_16i8: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc2x8i16_16i8: +; AVX512VL: # %bb.0: # %entry +; AVX512VL-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 +; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc2x8i16_16i8: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc2x8i16_16i8: +; AVX512BWVL: # %bb.0: # %entry +; AVX512BWVL-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 +; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vpmovwb %ymm1, %xmm1 +; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +entry: + %0 = trunc <8 x i16> %a to <8 x i8> + %1 = trunc <8 x i16> %b to <8 x i8> + %2 = shufflevector <8 x i8> %0, <8 x i8> %1, <16 x i32> + ret <16 x i8> %2 +} + +; PR15524 http://llvm.org/bugs/show_bug.cgi?id=15524 +define i64 @trunc8i16_i64(<8 x i16> %inval) { +; SSE2-LABEL: trunc8i16_i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc8i16_i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: movq %xmm0, %rax +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc8i16_i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: retq +; +; AVX-LABEL: trunc8i16_i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: retq +; +; AVX512F-LABEL: trunc8i16_i64: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc8i16_i64: +; AVX512VL: # %bb.0: # %entry +; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc8i16_i64: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: vmovq %xmm0, %rax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc8i16_i64: +; AVX512BWVL: # %bb.0: # %entry +; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovq %xmm0, %rax +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +entry: + %0 = trunc <8 x i16> %inval to <8 x i8> + %1 = bitcast <8 x i8> %0 to i64 + ret i64 %1 +} + +define <16 x i8> @trunc16i64_16i8_const() { +; SSE-LABEL: trunc16i64_16i8_const: +; SSE: # %bb.0: # %entry +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: trunc16i64_16i8_const: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; AVX512-LABEL: trunc16i64_16i8_const: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: retq + +entry: + %0 = trunc <16 x i64> zeroinitializer to <16 x i8> + %1 = shufflevector <16 x i8> %0, <16 x i8> %0, <16 x i32> + ret <16 x i8> %1 +} + +define <8 x i16> @PR32160(<8 x i32> %x) { +; SSE-LABEL: PR32160: +; SSE: # %bb.0: +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: retq +; +; AVX1-LABEL: PR32160: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: PR32160: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: PR32160: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9] +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512F-LABEL: PR32160: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7] +; AVX512F-NEXT: vpbroadcastd %xmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: PR32160: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5] +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: PR32160: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5] +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: PR32160: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5] +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq + %shuf = trunc <8 x i32> %x to <8 x i16> + %trunc = shufflevector <8 x i16> %shuf, <8 x i16> undef, <8 x i32> + ret <8 x i16> %trunc +} + +define void @PR34773(i16* %a0, i8* %a1) { +; SSE-LABEL: PR34773: +; SSE: # %bb.0: +; SSE-NEXT: movdqu (%rdi), %xmm0 +; SSE-NEXT: movdqu 16(%rdi), %xmm1 +; SSE-NEXT: movdqu 32(%rdi), %xmm2 +; SSE-NEXT: movdqu 48(%rdi), %xmm3 +; SSE-NEXT: psrlw $8, %xmm1 +; SSE-NEXT: psrlw $8, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: psrlw $8, %xmm3 +; SSE-NEXT: psrlw $8, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: movdqu %xmm0, (%rsi) +; SSE-NEXT: movdqu %xmm2, 16(%rsi) +; SSE-NEXT: retq +; +; AVX1-LABEL: PR34773: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqu (%rdi), %xmm0 +; AVX1-NEXT: vmovdqu 16(%rdi), %xmm1 +; AVX1-NEXT: vmovdqu 32(%rdi), %xmm2 +; AVX1-NEXT: vmovdqu 48(%rdi), %xmm3 +; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm1 +; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vmovdqu %xmm0, (%rsi) +; AVX1-NEXT: vmovdqu %xmm1, 16(%rsi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: PR34773: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; AVX2-NEXT: vmovdqu 32(%rdi), %ymm1 +; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqu %xmm0, (%rsi) +; AVX2-NEXT: vmovdqu %xmm1, 16(%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: PR34773: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512F-NEXT: vmovdqu 32(%rdi), %ymm1 +; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, (%rsi) +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512F-NEXT: vpmovdb %zmm0, 16(%rsi) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: PR34773: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512VL-NEXT: vmovdqu 32(%rdi), %ymm1 +; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VL-NEXT: vpmovdb %zmm0, (%rsi) +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = 
ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512VL-NEXT: vpmovdb %zmm0, 16(%rsi)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: PR34773:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqu (%rdi), %ymm0
+; AVX512BW-NEXT: vmovdqu 32(%rdi), %ymm1
+; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
+; AVX512BW-NEXT: vmovdqu %xmm0, (%rsi)
+; AVX512BW-NEXT: vmovdqu %xmm1, 16(%rsi)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: PR34773:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpsrlw $8, (%rdi), %ymm0
+; AVX512BWVL-NEXT: vpsrlw $8, 32(%rdi), %ymm1
+; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rsi)
+; AVX512BWVL-NEXT: vpmovwb %ymm1, 16(%rsi)
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+ %1 = getelementptr i16, i16* %a0, i64 16
+ %2 = getelementptr i8, i8* %a1, i64 16
+ %3 = bitcast i16* %a0 to <16 x i16>*
+ %4 = bitcast i16* %1 to <16 x i16>*
+ %5 = bitcast i8* %a1 to <16 x i8>*
+ %6 = bitcast i8* %2 to <16 x i8>*
+ %7 = load <16 x i16>, <16 x i16>* %3, align 2
+ %8 = load <16 x i16>, <16 x i16>* %4, align 2
+ %9 = lshr <16 x i16> %7, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+ %10 = lshr <16 x i16> %8, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+ %11 = trunc <16 x i16> %9 to <16 x i8>
+ %12 = trunc <16 x i16> %10 to <16 x i8>
+ store <16 x i8> %11, <16 x i8>* %5, align 1
+ store <16 x i8> %12, <16 x i8>* %6, align 1
+ ret void
+}
diff --git a/llvm/test/CodeGen/X86/vector-zext-widen.ll b/llvm/test/CodeGen/X86/vector-zext-widen.ll
new file mode 100644
index 000000000000..853be12f4e8c
--- /dev/null
+++ b/llvm/test/CodeGen/X86/vector-zext-widen.ll
@@ -0,0 +1,2294 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
+
+define <8 x i16> @zext_16i8_to_8i16(<16 x i8> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: zext_16i8_to_8i16:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: 
punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: zext_16i8_to_8i16: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: zext_16i8_to_8i16: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: retq +; +; AVX-LABEL: zext_16i8_to_8i16: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX-NEXT: retq +entry: + %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> + %C = zext <8 x i8> %B to <8 x i16> + ret <8 x i16> %C +} + +; PR17654 +define <16 x i16> @zext_16i8_to_16i16(<16 x i8> %A) { +; SSE2-LABEL: zext_16i8_to_16i16: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: zext_16i8_to_16i16: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: zext_16i8_to_16i16: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: zext_16i8_to_16i16: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: zext_16i8_to_16i16: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX2-NEXT: retq +; +; AVX512-LABEL: zext_16i8_to_16i16: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512-NEXT: retq +entry: + %B = zext <16 x i8> %A to <16 x i16> + ret <16 x i16> %B +} + +define <32 x i16> @zext_32i8_to_32i16(<32 x i8> %A) { +; SSE2-LABEL: zext_32i8_to_32i16: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: zext_32i8_to_32i16: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa %xmm1, %xmm3 +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm4, %xmm4 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] +; SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: zext_32i8_to_32i16: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: zext_32i8_to_32i16: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: 
vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX1-NEXT: vmovaps %ymm2, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: zext_32i8_to_32i16: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX2-NEXT: vmovdqa %ymm2, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: zext_32i8_to_32i16: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512F-NEXT: vmovdqa %ymm2, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: zext_32i8_to_32i16: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512BW-NEXT: retq +entry: + %B = zext <32 x i8> %A to <32 x i16> + ret <32 x i16> %B +} + +define <4 x i32> @zext_16i8_to_4i32(<16 x i8> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: zext_16i8_to_4i32: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: zext_16i8_to_4i32: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: zext_16i8_to_4i32: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE41-NEXT: retq +; +; AVX-LABEL: 
zext_16i8_to_4i32:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX-NEXT: retq
+entry:
+ %B = shufflevector <16 x i8> %A, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %C = zext <4 x i8> %B to <4 x i32>
+ ret <4 x i32> %C
+}
+
+define <8 x i32> @zext_16i8_to_8i32(<16 x i8> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: zext_16i8_to_8i32:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: zext_16i8_to_8i32:
+; SSSE3: # %bb.0: # %entry
+; SSSE3-NEXT: movdqa %xmm0, %xmm1
+; SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: zext_16i8_to_8i32:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: zext_16i8_to_8i32:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: zext_16i8_to_8i32:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: zext_16i8_to_8i32:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX512-NEXT: retq
+entry:
+ %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %C = zext <8 x i8> %B to <8 x i32>
+ ret <8 x i32> %C
+}
+
+define <16 x i32> @zext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: zext_16i8_to_16i32:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: zext_16i8_to_16i32: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: pxor %xmm4, %xmm4 +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] +; SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: zext_16i8_to_16i32: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: zext_16i8_to_16i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX1-NEXT: vmovaps %ymm2, %ymm0 +; AVX1-NEXT: retq +; 
+; AVX2-LABEL: zext_16i8_to_16i32: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX2-NEXT: vmovdqa %ymm2, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: zext_16i8_to_16i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512-NEXT: retq +entry: + %B = zext <16 x i8> %A to <16 x i32> + ret <16 x i32> %B +} + +define <2 x i64> @zext_16i8_to_2i64(<16 x i8> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: zext_16i8_to_2i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: zext_16i8_to_2i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: retq +; +; SSE41-LABEL: zext_16i8_to_2i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: retq +; +; AVX-LABEL: zext_16i8_to_2i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: retq +entry: + %B = shufflevector <16 x i8> %A, <16 x i8> undef, <2 x i32> + %C = zext <2 x i8> %B to <2 x i64> + ret <2 x i64> %C +} + +define <4 x i64> @zext_16i8_to_4i64(<16 x i8> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: zext_16i8_to_4i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: zext_16i8_to_4i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: retq +; +; 
SSE41-LABEL: zext_16i8_to_4i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: zext_16i8_to_4i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: zext_16i8_to_4i64: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: retq +; +; AVX512-LABEL: zext_16i8_to_4i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: retq +entry: + %B = shufflevector <16 x i8> %A, <16 x i8> undef, <4 x i32> + %C = zext <4 x i8> %B to <4 x i64> + ret <4 x i64> %C +} + +define <8 x i64> @zext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: zext_16i8_to_8i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: zext_16i8_to_8i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,128,128,128,128,128,128,128,1,128,128,128,128,128,128,128] +; SSSE3-NEXT: pshufb %xmm4, %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2,128,128,128,128,128,128,128,3,128,128,128,128,128,128,128] +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3] +; SSSE3-NEXT: pshufb %xmm5, %xmm1 +; SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSSE3-NEXT: pshufb %xmm4, %xmm2 +; SSSE3-NEXT: pshufb %xmm5, %xmm3 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: zext_16i8_to_8i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: 
pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: psrlq $48, %xmm0 +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: zext_16i8_to_8i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm2 +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX1-NEXT: vmovaps %ymm2, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: zext_16i8_to_8i64: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vmovdqa %ymm2, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: zext_16i8_to_8i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: retq +entry: + %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> + %C = zext <8 x i8> %B to <8 x i64> + ret <8 x i64> %C +} + +define <4 x i32> @zext_8i16_to_4i32(<8 x i16> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: zext_8i16_to_4i32: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: zext_8i16_to_4i32: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: zext_8i16_to_4i32: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: retq +; +; AVX-LABEL: zext_8i16_to_4i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX-NEXT: retq +entry: + %B = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> + %C = zext <4 x i16> %B to <4 x i32> + ret <4 x i32> %C 
+} + +define <8 x i32> @zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: zext_8i16_to_8i32: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: zext_8i16_to_8i32: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: zext_8i16_to_8i32: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: zext_8i16_to_8i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: zext_8i16_to_8i32: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: retq +; +; AVX512-LABEL: zext_8i16_to_8i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512-NEXT: retq +entry: + %B = zext <8 x i16> %A to <8 x i32> + ret <8 x i32>%B +} + +define <16 x i32> @zext_16i16_to_16i32(<16 x i16> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: zext_16i16_to_16i32: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: zext_16i16_to_16i32: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa %xmm1, %xmm3 +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm4, %xmm4 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: zext_16i16_to_16i32: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm5 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: zext_16i16_to_16i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX1-NEXT: vmovaps %ymm2, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: zext_16i16_to_16i32: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vmovdqa %ymm2, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: zext_16i16_to_16i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512-NEXT: retq +entry: + %B = zext <16 x i16> %A to <16 x i32> + ret <16 x i32> %B +} + +define <2 x i64> @zext_8i16_to_2i64(<8 x i16> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: zext_8i16_to_2i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: zext_8i16_to_2i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: zext_8i16_to_2i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; SSE41-NEXT: retq +; +; AVX-LABEL: zext_8i16_to_2i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX-NEXT: retq +entry: + %B = shufflevector <8 x i16> %A, <8 x i16> undef, <2 x i32> + %C = zext <2 x i16> %B to <2 x i64> + ret <2 x i64> %C +} + +define <4 x i64> @zext_8i16_to_4i64(<8 x i16> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: zext_8i16_to_4i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = 
xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: zext_8i16_to_4i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: zext_8i16_to_4i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: zext_8i16_to_4i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: zext_8i16_to_4i64: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-NEXT: retq +; +; AVX512-LABEL: zext_8i16_to_4i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX512-NEXT: retq +entry: + %B = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> + %C = zext <4 x i16> %B to <4 x i64> + ret <4 x i64> %C +} + +define <8 x i64> @zext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: zext_8i16_to_8i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: zext_8i16_to_8i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: pxor %xmm4, %xmm4 +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSSE3-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; 
SSSE3-NEXT: retq +; +; SSE41-LABEL: zext_8i16_to_8i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: zext_8i16_to_8i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX1-NEXT: vmovaps %ymm2, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: zext_8i16_to_8i64: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-NEXT: vmovdqa %ymm2, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: zext_8i16_to_8i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512-NEXT: retq +entry: + %B = zext <8 x i16> %A to <8 x i64> + ret <8 x i64> %B +} + +define <2 x i64> @zext_4i32_to_2i64(<4 x i32> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: zext_4i32_to_2i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: zext_4i32_to_2i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: xorps %xmm1, %xmm1 +; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: zext_4i32_to_2i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: retq +; +; AVX-LABEL: zext_4i32_to_2i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX-NEXT: retq +entry: + %B = shufflevector <4 x i32> %A, <4 x i32> undef, <2 x i32> + %C = zext <2 x i32> %B to <2 x i64> + ret <2 x i64> %C +} + +define <4 x i64> @zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: zext_4i32_to_4i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: zext_4i32_to_4i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movaps %xmm0, %xmm1 +; SSSE3-NEXT: 
xorps %xmm2, %xmm2 +; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: zext_4i32_to_4i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: zext_4i32_to_4i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: zext_4i32_to_4i64: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: retq +; +; AVX512-LABEL: zext_4i32_to_4i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512-NEXT: retq +entry: + %B = zext <4 x i32> %A to <4 x i64> + ret <4 x i64>%B +} + +define <8 x i64> @zext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: zext_8i32_to_8i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: xorps %xmm4, %xmm4 +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE2-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE2-NEXT: movaps %xmm3, %xmm2 +; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE2-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: zext_8i32_to_8i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movaps %xmm1, %xmm3 +; SSSE3-NEXT: movaps %xmm0, %xmm1 +; SSSE3-NEXT: xorps %xmm4, %xmm4 +; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSSE3-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSSE3-NEXT: movaps %xmm3, %xmm2 +; SSSE3-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSSE3-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: zext_8i32_to_8i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: zext_8i32_to_8i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX1-NEXT: vmovaps %ymm2, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: zext_8i32_to_8i64: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: 
vpmovzxdq {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vmovdqa %ymm2, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: zext_8i32_to_8i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero +; AVX512-NEXT: retq +entry: + %B = zext <8 x i32> %A to <8 x i64> + ret <8 x i64>%B +} + +define <2 x i64> @load_zext_2i8_to_2i64(<2 x i8> *%ptr) { +; SSE2-LABEL: load_zext_2i8_to_2i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movzwl (%rdi), %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_zext_2i8_to_2i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movzwl (%rdi), %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_zext_2i8_to_2i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: retq +; +; AVX-LABEL: load_zext_2i8_to_2i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: retq +entry: + %X = load <2 x i8>, <2 x i8>* %ptr + %Y = zext <2 x i8> %X to <2 x i64> + ret <2 x i64> %Y +} + +define <4 x i32> @load_zext_4i8_to_4i32(<4 x i8> *%ptr) { +; SSE2-LABEL: load_zext_4i8_to_4i32: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_zext_4i8_to_4i32: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_zext_4i8_to_4i32: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; SSE41-NEXT: retq +; +; AVX-LABEL: load_zext_4i8_to_4i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX-NEXT: retq +entry: + %X = load <4 x i8>, <4 x i8>* %ptr + %Y = zext <4 x i8> %X to <4 x i32> + ret <4 x i32> %Y +} + +define <4 x i64> @load_zext_4i8_to_4i64(<4 x i8> *%ptr) { +; SSE2-LABEL: load_zext_4i8_to_4i64: 
+; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_zext_4i8_to_4i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_zext_4i8_to_4i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: retq +; +; AVX1-LABEL: load_zext_4i8_to_4i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_zext_4i8_to_4i64: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_zext_4i8_to_4i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxbq {{.*#+}} ymm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: retq +entry: + %X = load <4 x i8>, <4 x i8>* %ptr + %Y = zext <4 x i8> %X to <4 x i64> + ret <4 x i64> %Y +} + +define <8 x i16> @load_zext_8i8_to_8i16(<8 x i8> *%ptr) { +; SSE2-LABEL: load_zext_8i8_to_8i16: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_zext_8i8_to_8i16: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_zext_8i8_to_8i16: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; SSE41-NEXT: retq +; +; AVX-LABEL: load_zext_8i8_to_8i16: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = 
mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX-NEXT: retq +entry: + %X = load <8 x i8>, <8 x i8>* %ptr + %Y = zext <8 x i8> %X to <8 x i16> + ret <8 x i16> %Y +} + +define <8 x i32> @load_zext_8i8_to_8i32(<8 x i8> *%ptr) { +; SSE2-LABEL: load_zext_8i8_to_8i32: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_zext_8i8_to_8i32: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_zext_8i8_to_8i32: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; SSE41-NEXT: retq +; +; AVX1-LABEL: load_zext_8i8_to_8i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_zext_8i8_to_8i32: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_zext_8i8_to_8i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX512-NEXT: retq +entry: + %X = load <8 x i8>, <8 x i8>* %ptr + %Y = zext <8 x i8> %X to <8 x i32> + ret <8 x i32> %Y +} + +define <8 x i32> @load_zext_16i8_to_8i32(<16 x i8> *%ptr) { +; SSE2-LABEL: load_zext_16i8_to_8i32: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa (%rdi), %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: 
load_zext_16i8_to_8i32: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa (%rdi), %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_zext_16i8_to_8i32: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: movdqa (%rdi), %xmm1 +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SSE41-NEXT: retq +; +; AVX1-LABEL: load_zext_16i8_to_8i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_zext_16i8_to_8i32: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_zext_16i8_to_8i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX512-NEXT: retq +entry: + %X = load <16 x i8>, <16 x i8>* %ptr + %Y = shufflevector <16 x i8> %X, <16 x i8> undef, <8 x i32> + %Z = zext <8 x i8> %Y to <8 x i32> + ret <8 x i32> %Z +} + +define <8 x i64> @load_zext_8i8_to_8i64(<8 x i8> *%ptr) { +; SSE2-LABEL: load_zext_8i8_to_8i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_zext_8i8_to_8i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movq {{.*#+}} xmm1 = 
mem[0],zero +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,128,128,128,128,128,128,128,1,128,128,128,128,128,128,128] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: pshufb %xmm4, %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2,128,128,128,128,128,128,128,3,128,128,128,128,128,128,128] +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3] +; SSSE3-NEXT: pshufb %xmm5, %xmm1 +; SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSSE3-NEXT: pshufb %xmm4, %xmm2 +; SSSE3-NEXT: pshufb %xmm5, %xmm3 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_zext_8i8_to_8i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm3 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: retq +; +; AVX1-LABEL: load_zext_8i8_to_8i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm3 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_zext_8i8_to_8i64: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_zext_8i8_to_8i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxbq {{.*#+}} zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: retq +entry: + %X = load <8 x i8>, <8 x i8>* %ptr + %Y = zext <8 x i8> %X to <8 x i64> + ret <8 x i64> %Y +} + +define <16 x i16> @load_zext_16i8_to_16i16(<16 x i8> *%ptr) { +; SSE2-LABEL: load_zext_16i8_to_16i16: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa (%rdi), %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_zext_16i8_to_16i16: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa (%rdi), %xmm1 +; 
SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_zext_16i8_to_16i16: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; SSE41-NEXT: retq +; +; AVX1-LABEL: load_zext_16i8_to_16i16: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_zext_16i8_to_16i16: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_zext_16i8_to_16i16: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512-NEXT: retq +entry: + %X = load <16 x i8>, <16 x i8>* %ptr + %Y = zext <16 x i8> %X to <16 x i16> + ret <16 x i16> %Y +} + +define <2 x i64> @load_zext_2i16_to_2i64(<2 x i16> *%ptr) { +; SSE2-LABEL: load_zext_2i16_to_2i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_zext_2i16_to_2i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_zext_2i16_to_2i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero +; SSE41-NEXT: retq +; +; AVX-LABEL: load_zext_2i16_to_2i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero +; AVX-NEXT: retq +entry: + %X = load <2 x i16>, <2 x i16>* %ptr + %Y = zext <2 x i16> %X to <2 x i64> + ret <2 x i64> %Y +} + +define <4 x i32> @load_zext_4i16_to_4i32(<4 x i16> *%ptr) { +; SSE2-LABEL: load_zext_4i16_to_4i32: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: 
load_zext_4i16_to_4i32: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_zext_4i16_to_4i32: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; SSE41-NEXT: retq +; +; AVX-LABEL: load_zext_4i16_to_4i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX-NEXT: retq +entry: + %X = load <4 x i16>, <4 x i16>* %ptr + %Y = zext <4 x i16> %X to <4 x i32> + ret <4 x i32> %Y +} + +define <4 x i64> @load_zext_4i16_to_4i64(<4 x i16> *%ptr) { +; SSE2-LABEL: load_zext_4i16_to_4i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_zext_4i16_to_4i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_zext_4i16_to_4i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero +; SSE41-NEXT: retq +; +; AVX1-LABEL: load_zext_4i16_to_4i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_zext_4i16_to_4i64: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_zext_4i16_to_4i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX512-NEXT: retq +entry: + %X = load <4 x i16>, <4 x i16>* %ptr + %Y = zext <4 x i16> %X to <4 x i64> + ret <4 x i64> %Y +} + +define <8 x i32> @load_zext_8i16_to_8i32(<8 x i16> *%ptr) { +; SSE2-LABEL: load_zext_8i16_to_8i32: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa (%rdi), %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_zext_8i16_to_8i32: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa (%rdi), %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSSE3-NEXT: 
punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_zext_8i16_to_8i32: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; SSE41-NEXT: retq +; +; AVX1-LABEL: load_zext_8i16_to_8i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_zext_8i16_to_8i32: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_zext_8i16_to_8i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX512-NEXT: retq +entry: + %X = load <8 x i16>, <8 x i16>* %ptr + %Y = zext <8 x i16> %X to <8 x i32> + ret <8 x i32> %Y +} + +define <2 x i64> @load_zext_2i32_to_2i64(<2 x i32> *%ptr) { +; SSE2-LABEL: load_zext_2i32_to_2i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_zext_2i32_to_2i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSSE3-NEXT: xorps %xmm1, %xmm1 +; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_zext_2i32_to_2i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero +; SSE41-NEXT: retq +; +; AVX-LABEL: load_zext_2i32_to_2i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero +; AVX-NEXT: retq +entry: + %X = load <2 x i32>, <2 x i32>* %ptr + %Y = zext <2 x i32> %X to <2 x i64> + ret <2 x i64> %Y +} + +define <4 x i64> @load_zext_4i32_to_4i64(<4 x i32> *%ptr) { +; SSE2-LABEL: load_zext_4i32_to_4i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movaps (%rdi), %xmm1 +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_zext_4i32_to_4i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movaps (%rdi), %xmm1 +; SSSE3-NEXT: xorps %xmm2, %xmm2 +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_zext_4i32_to_4i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero +; SSE41-NEXT: retq +; +; AVX1-LABEL: load_zext_4i32_to_4i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_zext_4i32_to_4i64: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = 
mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_zext_4i32_to_4i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX512-NEXT: retq +entry: + %X = load <4 x i32>, <4 x i32>* %ptr + %Y = zext <4 x i32> %X to <4 x i64> + ret <4 x i64> %Y +} + +define <8 x i32> @zext_8i8_to_8i32(<8 x i8> %z) { +; SSE2-LABEL: zext_8i8_to_8i32: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: zext_8i8_to_8i32: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: zext_8i8_to_8i32: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: zext_8i8_to_8i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: zext_8i8_to_8i32: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX2-NEXT: retq +; +; AVX512-LABEL: zext_8i8_to_8i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512-NEXT: retq +entry: + %t = zext <8 x i8> %z to <8 x i32> + ret <8 x i32> %t +} + +define <8 x i32> @shuf_zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: shuf_zext_8i16_to_8i32: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: retq 
+; +; SSSE3-LABEL: shuf_zext_8i16_to_8i32: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuf_zext_8i16_to_8i32: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: shuf_zext_8i16_to_8i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuf_zext_8i16_to_8i32: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: retq +; +; AVX512-LABEL: shuf_zext_8i16_to_8i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512-NEXT: retq +entry: + %B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <16 x i32> + %Z = bitcast <16 x i16> %B to <8 x i32> + ret <8 x i32> %Z +} + +define <4 x i64> @shuf_zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: shuf_zext_4i32_to_4i64: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuf_zext_4i32_to_4i64: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movaps %xmm0, %xmm1 +; SSSE3-NEXT: xorps %xmm2, %xmm2 +; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuf_zext_4i32_to_4i64: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE41-NEXT: retq +; +; AVX1-LABEL: shuf_zext_4i32_to_4i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuf_zext_4i32_to_4i64: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: retq +; +; AVX512-LABEL: shuf_zext_4i32_to_4i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512-NEXT: retq +entry: + %B = shufflevector <4 x i32> %A, <4 x i32> zeroinitializer, <8 x i32> + %Z = bitcast <8 x i32> %B to <4 x i64> + ret <4 x i64> %Z +} + +define <8 x i32> 
@shuf_zext_8i8_to_8i32(<8 x i8> %A) { +; SSE2-LABEL: shuf_zext_8i8_to_8i32: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuf_zext_8i8_to_8i32: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuf_zext_8i8_to_8i32: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: shuf_zext_8i8_to_8i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuf_zext_8i8_to_8i32: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX2-NEXT: retq +; +; AVX512-LABEL: shuf_zext_8i8_to_8i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512-NEXT: retq +entry: + %B = shufflevector <8 x i8> %A, <8 x i8> zeroinitializer, <32 x i32> + %Z = bitcast <32 x i8> %B to <8 x i32> + ret <8 x i32> %Z +} + +define <2 x i64> @shuf_zext_16i8_to_2i64_offset6(<16 x i8> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: shuf_zext_16i8_to_2i64_offset6: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuf_zext_16i8_to_2i64_offset6: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = 
xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuf_zext_16i8_to_2i64_offset6: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: psrlq $48, %xmm0 +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: retq +; +; AVX-LABEL: shuf_zext_16i8_to_2i64_offset6: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: retq +entry: + %B = shufflevector <16 x i8> %A, <16 x i8> zeroinitializer, <16 x i32> + %Z = bitcast <16 x i8> %B to <2 x i64> + ret <2 x i64> %Z +} + +define <4 x i64> @shuf_zext_16i8_to_4i64_offset11(<16 x i8> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: shuf_zext_16i8_to_4i64_offset11: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuf_zext_16i8_to_4i64_offset11: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[11],zero,zero,zero,zero,zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[13],zero,zero,zero,zero,zero,zero,zero,xmm1[14],zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuf_zext_16i8_to_4i64_offset11: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrldq {{.*#+}} xmm1 = xmm1[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: psrldq {{.*#+}} xmm0 = xmm0[13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: shuf_zext_16i8_to_4i64_offset11: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpsrldq {{.*#+}} xmm1 = xmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuf_zext_16i8_to_4i64_offset11: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: 
vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: retq +; +; AVX512-LABEL: shuf_zext_16i8_to_4i64_offset11: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: retq +entry: + %B = shufflevector <16 x i8> %A, <16 x i8> zeroinitializer, <32 x i32> + %Z = bitcast <32 x i8> %B to <4 x i64> + ret <4 x i64> %Z +} + +define <2 x i64> @shuf_zext_8i16_to_2i64_offset6(<8 x i16> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: shuf_zext_8i16_to_2i64_offset6: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuf_zext_8i16_to_2i64_offset6: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,7],zero,zero,zero,zero,zero,zero,xmm0[8,9],zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuf_zext_8i16_to_2i64_offset6: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; SSE41-NEXT: retq +; +; AVX-LABEL: shuf_zext_8i16_to_2i64_offset6: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX-NEXT: retq +entry: + %B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <8 x i32> + %Z = bitcast <8 x i16> %B to <2 x i64> + ret <2 x i64> %Z +} + +define <4 x i64> @shuf_zext_8i16_to_4i64_offset2(<8 x i16> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: shuf_zext_8i16_to_4i64_offset2: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuf_zext_8i16_to_4i64_offset2: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSSE3-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuf_zext_8i16_to_4i64_offset2: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; 
SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: shuf_zext_8i16_to_4i64_offset2: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuf_zext_8i16_to_4i64_offset2: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-NEXT: retq +; +; AVX512-LABEL: shuf_zext_8i16_to_4i64_offset2: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,3] +; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX512-NEXT: retq +entry: + %B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <16 x i32> + %Z = bitcast <16 x i16> %B to <4 x i64> + ret <4 x i64> %Z +} + +define <4 x i32> @shuf_zext_8i16_to_4i32_offset1(<8 x i16> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: shuf_zext_8i16_to_4i32_offset1: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuf_zext_8i16_to_4i32_offset1: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuf_zext_8i16_to_4i32_offset1: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: retq +; +; AVX1-LABEL: shuf_zext_8i16_to_4i32_offset1: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: shuf_zext_8i16_to_4i32_offset1: +; AVX2-SLOW: # %bb.0: # %entry +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX2-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuf_zext_8i16_to_4i32_offset1: +; AVX2-FAST: # %bb.0: # %entry +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3],zero,zero,xmm0[4,5],zero,zero,xmm0[6,7],zero,zero,xmm0[8,9],zero,zero +; AVX2-FAST-NEXT: retq +; +; AVX512F-LABEL: shuf_zext_8i16_to_4i32_offset1: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: shuf_zext_8i16_to_4i32_offset1: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: 
vpshufb {{.*#+}} xmm0 = xmm0[2,3],zero,zero,xmm0[4,5],zero,zero,xmm0[6,7],zero,zero,xmm0[8,9],zero,zero +; AVX512BW-NEXT: retq +entry: + %B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <8 x i32> + %Z = bitcast <8 x i16> %B to <4 x i32> + ret <4 x i32> %Z +} + +define <8 x i32> @shuf_zext_8i16_to_8i32_offset3(<8 x i16> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: shuf_zext_8i16_to_8i32_offset3: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuf_zext_8i16_to_8i32_offset3: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSSE3-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuf_zext_8i16_to_8i32_offset3: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE41-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: retq +; +; AVX1-LABEL: shuf_zext_8i16_to_8i32_offset3: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuf_zext_8i16_to_8i32_offset3: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: retq +; +; AVX512-LABEL: shuf_zext_8i16_to_8i32_offset3: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512-NEXT: retq +entry: + %B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <16 x i32> + %Z = bitcast <16 x i16> %B to <8 x i32> + ret <8 x i32> %Z +} + +define <8 x i32> @shuf_zext_16i16_to_8i32_offset8(<16 x i16> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: shuf_zext_16i16_to_8i32_offset8: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = 
xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuf_zext_16i16_to_8i32_offset8: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuf_zext_16i16_to_8i32_offset8: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3] +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: shuf_zext_16i16_to_8i32_offset8: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuf_zext_16i16_to_8i32_offset8: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: retq +; +; AVX512-LABEL: shuf_zext_16i16_to_8i32_offset8: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512-NEXT: retq +entry: + %B = shufflevector <16 x i16> %A, <16 x i16> zeroinitializer, <16 x i32> + %Z = bitcast <16 x i16> %B to <8 x i32> + ret <8 x i32> %Z +} + +define <2 x i64> @shuf_zext_4i32_to_2i64_offset2(<4 x i32> %A) nounwind uwtable readnone ssp { +; SSE-LABEL: shuf_zext_4i32_to_2i64_offset2: +; SSE: # %bb.0: # %entry +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: retq +; +; AVX-LABEL: shuf_zext_4i32_to_2i64_offset2: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: retq +entry: + %B = shufflevector <4 x i32> %A, <4 x i32> zeroinitializer, <4 x i32> + %Z = bitcast <4 x i32> %B to <2 x i64> + ret <2 x i64> %Z +} + +define <4 x i64> @shuf_zext_4i32_to_4i64_offset1(<4 x i32> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: shuf_zext_4i32_to_4i64_offset1: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,0,4294967295,0] +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuf_zext_4i32_to_4i64_offset1: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [0,0,4294967295,0] +; SSSE3-NEXT: pand %xmm1, %xmm0 +; SSSE3-NEXT: psrldq {{.*#+}} xmm1 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuf_zext_4i32_to_4i64_offset1: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; 
SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7] +; SSE41-NEXT: psrldq {{.*#+}} xmm1 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: retq +; +; AVX1-LABEL: shuf_zext_4i32_to_4i64_offset1: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuf_zext_4i32_to_4i64_offset1: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,3,3] +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: retq +; +; AVX512-LABEL: shuf_zext_4i32_to_4i64_offset1: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,3,3] +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512-NEXT: retq +entry: + %B = shufflevector <4 x i32> %A, <4 x i32> zeroinitializer, <8 x i32> + %Z = bitcast <8 x i32> %B to <4 x i64> + ret <4 x i64> %Z +} + +define <32 x i32> @zext_32i8_to_32i32(<32 x i8> %x) { +; SSE2-LABEL: zext_32i8_to_32i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movq %rdi, %rax +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-NEXT: movdqa %xmm3, %xmm8 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; SSE2-NEXT: movdqa %xmm6, %xmm7 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: movdqa %xmm1, 112(%rdi) +; SSE2-NEXT: movdqa %xmm4, 96(%rdi) +; SSE2-NEXT: movdqa %xmm6, 80(%rdi) +; SSE2-NEXT: movdqa %xmm7, 64(%rdi) +; SSE2-NEXT: movdqa %xmm0, 48(%rdi) +; SSE2-NEXT: movdqa %xmm5, 32(%rdi) +; SSE2-NEXT: movdqa %xmm3, 16(%rdi) +; SSE2-NEXT: movdqa %xmm8, (%rdi) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: zext_32i8_to_32i32: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movq %rdi, %rax 
+; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSSE3-NEXT: movdqa %xmm3, %xmm8 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSSE3-NEXT: movdqa %xmm0, %xmm5 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSSE3-NEXT: movdqa %xmm1, %xmm6 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; SSSE3-NEXT: movdqa %xmm6, %xmm7 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; SSSE3-NEXT: movdqa %xmm1, %xmm4 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSSE3-NEXT: movdqa %xmm1, 112(%rdi) +; SSSE3-NEXT: movdqa %xmm4, 96(%rdi) +; SSSE3-NEXT: movdqa %xmm6, 80(%rdi) +; SSSE3-NEXT: movdqa %xmm7, 64(%rdi) +; SSSE3-NEXT: movdqa %xmm0, 48(%rdi) +; SSSE3-NEXT: movdqa %xmm5, 32(%rdi) +; SSSE3-NEXT: movdqa %xmm3, 16(%rdi) +; SSSE3-NEXT: movdqa %xmm8, (%rdi) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: zext_32i8_to_32i32: +; SSE41: # %bb.0: +; SSE41-NEXT: movq %rdi, %rax +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,2,3] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,3,0,1] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = 
xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SSE41-NEXT: movdqa %xmm1, 112(%rdi) +; SSE41-NEXT: movdqa %xmm7, 96(%rdi) +; SSE41-NEXT: movdqa %xmm6, 80(%rdi) +; SSE41-NEXT: movdqa %xmm5, 64(%rdi) +; SSE41-NEXT: movdqa %xmm0, 48(%rdi) +; SSE41-NEXT: movdqa %xmm4, 32(%rdi) +; SSE41-NEXT: movdqa %xmm3, 16(%rdi) +; SSE41-NEXT: movdqa %xmm2, (%rdi) +; SSE41-NEXT: retq +; +; AVX1-LABEL: zext_32i8_to_32i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,3,0,1] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,0,1] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-NEXT: vmovaps %ymm4, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: zext_32i8_to_32i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,1,2,3] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX2-NEXT: vmovdqa %ymm4, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: zext_32i8_to_32i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm2 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512-NEXT: retq + %res = zext <32 x i8>%x to <32 x i32> + ret <32 x i32> %res +} + +define <2 x i32> @zext_2i8_to_2i32(<2 x i8>* %addr) { +; SSE2-LABEL: zext_2i8_to_2i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movzwl (%rdi), %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: paddd %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: zext_2i8_to_2i32: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movzwl (%rdi), %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: paddd %xmm0, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: zext_2i8_to_2i32: +; SSE41: # %bb.0: +; SSE41-NEXT: movzbl 1(%rdi), %eax +; SSE41-NEXT: movzbl (%rdi), %ecx +; SSE41-NEXT: movd %ecx, %xmm0 +; SSE41-NEXT: pinsrd $1, %eax, %xmm0 +; SSE41-NEXT: paddd %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: zext_2i8_to_2i32: +; AVX: # %bb.0: +; AVX-NEXT: movzbl 1(%rdi), %eax +; AVX-NEXT: movzbl (%rdi), %ecx +; AVX-NEXT: vmovd %ecx, %xmm0 +; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: retq + %x = load <2 x i8>, <2 x i8>* %addr, align 1 + %y = zext <2 x i8> %x to <2 x i32> + %z = add <2 x i32>%y, %y + ret <2 x i32>%z +} -- 2.34.1