From c823517ef5f2ac0fafede18782d089a8aa91ed7b Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Fri, 30 Dec 2022 01:43:20 +0300 Subject: [PATCH] [NFC][Codegen][X86] zero_extend_vector_inreg.ll: add SSE4.2 runline --- llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll | 2719 ++++++++++++++------- 1 file changed, 1799 insertions(+), 920 deletions(-) diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll index e7ea375..850eb01 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll @@ -1,26 +1,36 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,FALLBACK0 -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1-ONLY,FALLBACK1 -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2,AVX2-SLOW,FALLBACK2 -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2,AVX2-FAST-PERLANE,FALLBACK3 -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2,AVX2-FAST,FALLBACK4 -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512F,AVX512F-SLOW,FALLBACK5 -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512F,AVX512F-FAST,FALLBACK6 -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-SLOW,FALLBACK7 -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-FAST,FALLBACK8 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2,FALLBACK0 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42,FALLBACK1 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1-ONLY,FALLBACK2 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2,AVX2-SLOW,FALLBACK3 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2,AVX2-FAST-PERLANE,FALLBACK4 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2,AVX2-FAST,FALLBACK5 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512F,AVX512F-SLOW,FALLBACK6 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512F,AVX512F-FAST,FALLBACK7 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-SLOW,FALLBACK8 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-FAST,FALLBACK9 define void @vec16_v2i8_to_v1i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec16_v2i8_to_v1i16_factor2: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: paddb (%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec16_v2i8_to_v1i16_factor2: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: movdqa %xmm0, (%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec16_v2i8_to_v1i16_factor2: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: movdqa %xmm0, (%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec16_v2i8_to_v1i16_factor2: ; AVX: # %bb.0: @@ -73,15 +83,24 @@ define void @vec16_v2i8_to_v1i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias. } define void @vec32_v4i8_to_v2i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec32_v4i8_to_v2i16_factor2: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: paddb (%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec32_v4i8_to_v2i16_factor2: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: movdqa %xmm0, (%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec32_v4i8_to_v2i16_factor2: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: movdqa %xmm0, (%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec32_v4i8_to_v2i16_factor2: ; AVX: # %bb.0: @@ -134,17 +153,26 @@ define void @vec32_v4i8_to_v2i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias. } define void @vec32_v4i8_to_v1i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec32_v4i8_to_v1i32_factor4: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: paddb (%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec32_v4i8_to_v1i32_factor4: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: movdqa %xmm0, (%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec32_v4i8_to_v1i32_factor4: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: movdqa %xmm0, (%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec32_v4i8_to_v1i32_factor4: ; AVX: # %bb.0: @@ -197,16 +225,25 @@ define void @vec32_v4i8_to_v1i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias. } define void @vec32_v2i16_to_v1i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec32_v2i16_to_v1i32_factor2: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: paddb (%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec32_v2i16_to_v1i32_factor2: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: movdqa %xmm0, (%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec32_v2i16_to_v1i32_factor2: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: movdqa %xmm0, (%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec32_v2i16_to_v1i32_factor2: ; AVX: # %bb.0: @@ -261,15 +298,24 @@ define void @vec32_v2i16_to_v1i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias } define void @vec64_v8i8_to_v4i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec64_v8i8_to_v4i16_factor2: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: paddb (%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec64_v8i8_to_v4i16_factor2: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: movdqa %xmm0, (%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec64_v8i8_to_v4i16_factor2: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: movdqa %xmm0, (%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec64_v8i8_to_v4i16_factor2: ; AVX: # %bb.0: @@ -322,16 +368,25 @@ define void @vec64_v8i8_to_v4i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias. } define void @vec64_v8i8_to_v2i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec64_v8i8_to_v2i32_factor4: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: paddb (%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec64_v8i8_to_v2i32_factor4: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: movdqa %xmm0, (%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec64_v8i8_to_v2i32_factor4: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: movdqa %xmm0, (%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec64_v8i8_to_v2i32_factor4: ; AVX: # %bb.0: @@ -384,17 +439,26 @@ define void @vec64_v8i8_to_v2i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias. } define void @vec64_v8i8_to_v1i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec64_v8i8_to_v1i64_factor8: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: paddb (%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec64_v8i8_to_v1i64_factor8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: movdqa %xmm0, (%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec64_v8i8_to_v1i64_factor8: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: movdqa %xmm0, (%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec64_v8i8_to_v1i64_factor8: ; AVX: # %bb.0: @@ -447,15 +511,24 @@ define void @vec64_v8i8_to_v1i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias. } define void @vec64_v4i16_to_v2i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec64_v4i16_to_v2i32_factor2: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: paddb (%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec64_v4i16_to_v2i32_factor2: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: movdqa %xmm0, (%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec64_v4i16_to_v2i32_factor2: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: movdqa %xmm0, (%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec64_v4i16_to_v2i32_factor2: ; AVX: # %bb.0: @@ -510,16 +583,25 @@ define void @vec64_v4i16_to_v2i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias } define void @vec64_v4i16_to_v1i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec64_v4i16_to_v1i64_factor4: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: paddb (%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec64_v4i16_to_v1i64_factor4: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: movdqa %xmm0, (%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec64_v4i16_to_v1i64_factor4: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: movdqa %xmm0, (%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec64_v4i16_to_v1i64_factor4: ; AVX: # %bb.0: @@ -574,15 +656,24 @@ define void @vec64_v4i16_to_v1i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias } define void @vec64_v2i32_to_v1i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec64_v2i32_to_v1i64_factor2: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: paddb (%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec64_v2i32_to_v1i64_factor2: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: movdqa %xmm0, (%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec64_v2i32_to_v1i64_factor2: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: movdqa %xmm0, (%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec64_v2i32_to_v1i64_factor2: ; AVX: # %bb.0: @@ -637,15 +728,24 @@ define void @vec64_v2i32_to_v1i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias } define void @vec128_v16i8_to_v8i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec128_v16i8_to_v8i16_factor2: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: paddb (%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec128_v16i8_to_v8i16_factor2: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: movdqa %xmm0, (%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec128_v16i8_to_v8i16_factor2: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: movdqa %xmm0, (%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec128_v16i8_to_v8i16_factor2: ; AVX: # %bb.0: @@ -698,16 +798,25 @@ define void @vec128_v16i8_to_v8i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bia } define void @vec128_v16i8_to_v4i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec128_v16i8_to_v4i32_factor4: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: paddb (%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec128_v16i8_to_v4i32_factor4: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: movdqa %xmm0, (%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec128_v16i8_to_v4i32_factor4: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: movdqa %xmm0, (%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec128_v16i8_to_v4i32_factor4: ; AVX: # %bb.0: @@ -760,17 +869,26 @@ define void @vec128_v16i8_to_v4i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bia } define void @vec128_v16i8_to_v2i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec128_v16i8_to_v2i64_factor8: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: paddb (%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec128_v16i8_to_v2i64_factor8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: movdqa %xmm0, (%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec128_v16i8_to_v2i64_factor8: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: movdqa %xmm0, (%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec128_v16i8_to_v2i64_factor8: ; AVX: # %bb.0: @@ -883,15 +1001,24 @@ define void @vec128_v16i8_to_v1i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.b } define void @vec128_v8i16_to_v4i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec128_v8i16_to_v4i32_factor2: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: paddb (%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec128_v8i16_to_v4i32_factor2: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: movdqa %xmm0, (%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec128_v8i16_to_v4i32_factor2: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: movdqa %xmm0, (%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec128_v8i16_to_v4i32_factor2: ; AVX: # %bb.0: @@ -946,16 +1073,25 @@ define void @vec128_v8i16_to_v4i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bia } define void @vec128_v8i16_to_v2i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec128_v8i16_to_v2i64_factor4: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: paddb (%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec128_v8i16_to_v2i64_factor4: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: movdqa %xmm0, (%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec128_v8i16_to_v2i64_factor4: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: movdqa %xmm0, (%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec128_v8i16_to_v2i64_factor4: ; AVX: # %bb.0: @@ -1010,14 +1146,24 @@ define void @vec128_v8i16_to_v2i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bia } define void @vec128_v8i16_to_v1i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec128_v8i16_to_v1i128_factor8: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE-NEXT: paddb (%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec128_v8i16_to_v1i128_factor8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: movdqa %xmm0, (%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec128_v8i16_to_v1i128_factor8: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: pxor %xmm1, %xmm1 +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; SSE42-NEXT: paddb (%rdx), %xmm1 +; SSE42-NEXT: movdqa %xmm1, (%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec128_v8i16_to_v1i128_factor8: ; AVX: # %bb.0: @@ -1076,15 +1222,24 @@ define void @vec128_v8i16_to_v1i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bi } define void @vec128_v4i32_to_v2i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec128_v4i32_to_v2i64_factor2: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: paddb (%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec128_v4i32_to_v2i64_factor2: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: movdqa %xmm0, (%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec128_v4i32_to_v2i64_factor2: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: movdqa %xmm0, (%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec128_v4i32_to_v2i64_factor2: ; AVX: # %bb.0: @@ -1139,15 +1294,25 @@ define void @vec128_v4i32_to_v2i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bia } define void @vec128_v4i32_to_v1i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec128_v4i32_to_v1i128_factor4: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: xorps %xmm1, %xmm1 -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: paddb (%rdx), %xmm1 -; SSE-NEXT: movdqa %xmm1, (%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec128_v4i32_to_v1i128_factor4: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; SSE2-NEXT: paddb (%rdx), %xmm1 +; SSE2-NEXT: movdqa %xmm1, (%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec128_v4i32_to_v1i128_factor4: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: pxor %xmm1, %xmm1 +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; SSE42-NEXT: paddb (%rdx), %xmm1 +; SSE42-NEXT: movdqa %xmm1, (%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec128_v4i32_to_v1i128_factor4: ; AVX: # %bb.0: @@ -1268,19 +1433,32 @@ define void @vec128_v2i64_to_v1i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bi } define void @vec256_v32i8_to_v16i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec256_v32i8_to_v16i16_factor2: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; SSE-NEXT: paddb 16(%rdx), %xmm0 -; SSE-NEXT: paddb (%rdx), %xmm2 -; SSE-NEXT: movdqa %xmm2, (%rcx) -; SSE-NEXT: movdqa %xmm0, 16(%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec256_v32i8_to_v16i16_factor2: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE2-NEXT: paddb 16(%rdx), %xmm0 +; SSE2-NEXT: paddb (%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm2, (%rcx) +; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec256_v32i8_to_v16i16_factor2: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE42-NEXT: pxor %xmm2, %xmm2 +; SSE42-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE42-NEXT: paddb 16(%rdx), %xmm0 +; SSE42-NEXT: paddb (%rdx), %xmm1 +; SSE42-NEXT: movdqa %xmm1, (%rcx) +; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_v32i8_to_v16i16_factor2: ; AVX: # %bb.0: @@ -1337,20 +1515,33 @@ define void @vec256_v32i8_to_v16i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bi } define void @vec256_v32i8_to_v8i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec256_v32i8_to_v8i32_factor4: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: paddb 16(%rdx), %xmm0 -; SSE-NEXT: paddb (%rdx), %xmm2 -; SSE-NEXT: movdqa %xmm2, (%rcx) -; SSE-NEXT: movdqa %xmm0, 16(%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec256_v32i8_to_v8i32_factor4: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: paddb 16(%rdx), %xmm0 +; SSE2-NEXT: paddb (%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm2, (%rcx) +; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec256_v32i8_to_v8i32_factor4: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE42-NEXT: paddb 16(%rdx), %xmm0 +; SSE42-NEXT: paddb (%rdx), %xmm1 +; SSE42-NEXT: movdqa %xmm1, (%rcx) +; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_v32i8_to_v8i32_factor4: ; AVX: # %bb.0: @@ -1407,21 +1598,34 @@ define void @vec256_v32i8_to_v8i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bia } define void @vec256_v32i8_to_v4i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec256_v32i8_to_v4i64_factor8: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: paddb 16(%rdx), %xmm0 -; SSE-NEXT: paddb (%rdx), %xmm2 -; SSE-NEXT: movdqa %xmm2, (%rcx) -; SSE-NEXT: movdqa %xmm0, 16(%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec256_v32i8_to_v4i64_factor8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: paddb 16(%rdx), %xmm0 +; SSE2-NEXT: paddb (%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm2, (%rcx) +; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec256_v32i8_to_v4i64_factor8: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: psrld $16, %xmm0 +; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: paddb 16(%rdx), %xmm0 +; SSE42-NEXT: paddb (%rdx), %xmm1 +; SSE42-NEXT: movdqa %xmm1, (%rcx) +; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_v32i8_to_v4i64_factor8: ; AVX: # %bb.0: @@ -1620,19 +1824,32 @@ define void @vec256_v32i8_to_v1i256_factor32(ptr %in.vec.base.ptr, ptr %in.vec.b } define void @vec256_v16i16_to_v8i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec256_v16i16_to_v8i32_factor2: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: paddb 16(%rdx), %xmm0 -; SSE-NEXT: paddb (%rdx), %xmm2 -; SSE-NEXT: movdqa %xmm2, (%rcx) -; SSE-NEXT: movdqa %xmm0, 16(%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec256_v16i16_to_v8i32_factor2: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: paddb 16(%rdx), %xmm0 +; SSE2-NEXT: paddb (%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm2, (%rcx) +; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec256_v16i16_to_v8i32_factor2: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: pxor %xmm1, %xmm1 +; SSE42-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE42-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE42-NEXT: paddb 16(%rdx), %xmm0 +; SSE42-NEXT: paddb (%rdx), %xmm2 +; SSE42-NEXT: movdqa %xmm2, (%rcx) +; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_v16i16_to_v8i32_factor2: ; AVX: # %bb.0: @@ -1691,20 +1908,33 @@ define void @vec256_v16i16_to_v8i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bi } define void @vec256_v16i16_to_v4i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec256_v16i16_to_v4i64_factor4: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: paddb 16(%rdx), %xmm0 -; SSE-NEXT: paddb (%rdx), %xmm2 -; SSE-NEXT: movdqa %xmm2, (%rcx) -; SSE-NEXT: movdqa %xmm0, 16(%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec256_v16i16_to_v4i64_factor4: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: paddb 16(%rdx), %xmm0 +; SSE2-NEXT: paddb (%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm2, (%rcx) +; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec256_v16i16_to_v4i64_factor4: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE42-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; SSE42-NEXT: paddb 16(%rdx), %xmm0 +; SSE42-NEXT: paddb (%rdx), %xmm1 +; SSE42-NEXT: movdqa %xmm1, (%rcx) +; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_v16i16_to_v4i64_factor4: ; AVX: # %bb.0: @@ -1763,19 +1993,33 @@ define void @vec256_v16i16_to_v4i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bi } define void @vec256_v16i16_to_v2i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec256_v16i16_to_v2i128_factor8: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: paddb 16(%rdx), %xmm0 -; SSE-NEXT: paddb (%rdx), %xmm1 -; SSE-NEXT: movdqa %xmm1, (%rcx) -; SSE-NEXT: movdqa %xmm0, 16(%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec256_v16i16_to_v2i128_factor8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: paddb 16(%rdx), %xmm0 +; SSE2-NEXT: paddb (%rdx), %xmm1 +; SSE2-NEXT: movdqa %xmm1, (%rcx) +; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec256_v16i16_to_v2i128_factor8: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: pxor %xmm1, %xmm1 +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE42-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: paddb 16(%rdx), %xmm0 +; SSE42-NEXT: paddb (%rdx), %xmm1 +; SSE42-NEXT: movdqa %xmm1, (%rcx) +; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_v16i16_to_v2i128_factor8: ; AVX: # %bb.0: @@ -1843,16 +2087,28 @@ define void @vec256_v16i16_to_v2i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.b } define void @vec256_v16i16_to_v1i256_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec256_v16i16_to_v1i256_factor16: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE-NEXT: movaps 16(%rdx), %xmm1 -; SSE-NEXT: paddb (%rdx), %xmm0 -; SSE-NEXT: movaps %xmm1, 16(%rcx) -; SSE-NEXT: movdqa %xmm0, (%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec256_v16i16_to_v1i256_factor16: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: movaps 16(%rdx), %xmm1 +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: movaps %xmm1, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec256_v16i16_to_v1i256_factor16: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: pxor %xmm1, %xmm1 +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; SSE42-NEXT: movaps 16(%rdx), %xmm0 +; SSE42-NEXT: paddb (%rdx), %xmm1 +; SSE42-NEXT: movaps %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_v16i16_to_v1i256_factor16: ; AVX: # %bb.0: @@ -1913,19 +2169,32 @@ define void @vec256_v16i16_to_v1i256_factor16(ptr %in.vec.base.ptr, ptr %in.vec. } define void @vec256_v8i32_to_v4i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec256_v8i32_to_v4i64_factor2: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: paddb 16(%rdx), %xmm0 -; SSE-NEXT: paddb (%rdx), %xmm2 -; SSE-NEXT: movdqa %xmm2, (%rcx) -; SSE-NEXT: movdqa %xmm0, 16(%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec256_v8i32_to_v4i64_factor2: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: paddb 16(%rdx), %xmm0 +; SSE2-NEXT: paddb (%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm2, (%rcx) +; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec256_v8i32_to_v4i64_factor2: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: pxor %xmm1, %xmm1 +; SSE42-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero +; SSE42-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE42-NEXT: paddb 16(%rdx), %xmm0 +; SSE42-NEXT: paddb (%rdx), %xmm2 +; SSE42-NEXT: movdqa %xmm2, (%rcx) +; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_v8i32_to_v4i64_factor2: ; AVX: # %bb.0: @@ -1984,20 +2253,35 @@ define void @vec256_v8i32_to_v4i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bia } define void @vec256_v8i32_to_v2i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec256_v8i32_to_v2i128_factor4: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: xorps %xmm1, %xmm1 -; SSE-NEXT: xorps %xmm2, %xmm2 -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[1,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] -; SSE-NEXT: paddb 16(%rdx), %xmm0 -; SSE-NEXT: paddb (%rdx), %xmm2 -; SSE-NEXT: movdqa %xmm2, (%rcx) -; SSE-NEXT: movdqa %xmm0, 16(%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec256_v8i32_to_v2i128_factor4: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[1,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] +; SSE2-NEXT: paddb 16(%rdx), %xmm0 +; SSE2-NEXT: paddb (%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm2, (%rcx) +; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec256_v8i32_to_v2i128_factor4: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: pxor %xmm1, %xmm1 +; SSE42-NEXT: pxor %xmm2, %xmm2 +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; SSE42-NEXT: paddb 16(%rdx), %xmm0 +; SSE42-NEXT: paddb (%rdx), %xmm2 +; SSE42-NEXT: movdqa %xmm2, (%rcx) +; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_v8i32_to_v2i128_factor4: ; AVX: # %bb.0: @@ -2092,17 +2376,29 @@ define void @vec256_v8i32_to_v2i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bi } define void @vec256_v8i32_to_v1i256_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec256_v8i32_to_v1i256_factor8: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: xorps %xmm1, %xmm1 -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 16(%rdx), %xmm0 -; SSE-NEXT: paddb (%rdx), %xmm1 -; SSE-NEXT: movaps %xmm0, 16(%rcx) -; SSE-NEXT: movdqa %xmm1, (%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec256_v8i32_to_v1i256_factor8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; SSE2-NEXT: movaps 16(%rdx), %xmm0 +; SSE2-NEXT: paddb (%rdx), %xmm1 +; SSE2-NEXT: movaps %xmm0, 16(%rcx) +; SSE2-NEXT: movdqa %xmm1, (%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec256_v8i32_to_v1i256_factor8: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: pxor %xmm1, %xmm1 +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; SSE42-NEXT: movaps 16(%rdx), %xmm0 +; SSE42-NEXT: paddb (%rdx), %xmm1 +; SSE42-NEXT: movaps %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_v8i32_to_v1i256_factor8: ; AVX: # %bb.0: @@ -2367,24 +2663,42 @@ define void @vec256_v2i128_to_v1i256_factor2(ptr %in.vec.base.ptr, ptr %in.vec.b } define void @vec384_v48i8_to_v24i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec384_v48i8_to_v24i16_factor2: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa 16(%rdi), %xmm1 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: paddb 16(%rsi), %xmm1 -; SSE-NEXT: pxor %xmm2, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; SSE-NEXT: paddb 16(%rdx), %xmm0 -; SSE-NEXT: paddb (%rdx), %xmm3 -; SSE-NEXT: paddb 32(%rdx), %xmm1 -; SSE-NEXT: movdqa %xmm1, 32(%rcx) -; SSE-NEXT: movdqa %xmm3, (%rcx) -; SSE-NEXT: movdqa %xmm0, 16(%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec384_v48i8_to_v24i16_factor2: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa 16(%rdi), %xmm1 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 16(%rsi), %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE2-NEXT: paddb 16(%rdx), %xmm0 +; SSE2-NEXT: paddb (%rdx), %xmm3 +; SSE2-NEXT: paddb 32(%rdx), %xmm1 +; SSE2-NEXT: movdqa %xmm1, 32(%rcx) +; SSE2-NEXT: movdqa %xmm3, (%rcx) +; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec384_v48i8_to_v24i16_factor2: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: movdqa 16(%rdi), %xmm1 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm1 +; SSE42-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; SSE42-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE42-NEXT: pxor %xmm3, %xmm3 +; SSE42-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] +; SSE42-NEXT: paddb 16(%rdx), %xmm0 +; SSE42-NEXT: paddb (%rdx), %xmm2 +; SSE42-NEXT: paddb 32(%rdx), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 32(%rcx) +; SSE42-NEXT: movdqa %xmm2, (%rcx) +; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_v48i8_to_v24i16_factor2: ; AVX: # %bb.0: @@ -2454,28 +2768,45 @@ define void @vec384_v48i8_to_v24i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bi } define void @vec384_v48i8_to_v16i24_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec384_v48i8_to_v16i24_factor3: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,5] -; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] -; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,6] -; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE-NEXT: paddb (%rdx), %xmm0 -; SSE-NEXT: paddb 32(%rdx), %xmm2 -; SSE-NEXT: paddb 16(%rdx), %xmm1 -; SSE-NEXT: movdqa %xmm1, 16(%rcx) -; SSE-NEXT: movdqa %xmm2, 32(%rcx) -; SSE-NEXT: movdqa %xmm0, (%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec384_v48i8_to_v16i24_factor3: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[3,3,3,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,5] +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,1,2,2,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,6] +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 32(%rdx), %xmm2 +; SSE2-NEXT: paddb 16(%rdx), %xmm1 +; SSE2-NEXT: movdqa %xmm1, 16(%rcx) +; SSE2-NEXT: movdqa %xmm2, 32(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec384_v48i8_to_v16i24_factor3: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: movdqa %xmm0, %xmm1 +; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,xmm1[11],zero,zero,xmm1[12],zero,zero,xmm1[13],zero,zero,xmm1[14],zero,zero,xmm1[15],zero,zero +; SSE42-NEXT: movdqa %xmm0, %xmm2 +; SSE42-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0],zero,zero,xmm2[1],zero,zero,xmm2[2],zero,zero,xmm2[3],zero,zero,xmm2[4],zero,zero,xmm2[5] +; SSE42-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,xmm0[6],zero,zero,xmm0[7],zero,zero,xmm0[8],zero,zero,xmm0[9],zero,zero,xmm0[10],zero +; SSE42-NEXT: paddb 16(%rdx), %xmm0 +; SSE42-NEXT: paddb (%rdx), %xmm2 +; SSE42-NEXT: paddb 32(%rdx), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 32(%rcx) +; SSE42-NEXT: movdqa %xmm2, (%rcx) +; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_v48i8_to_v16i24_factor3: ; AVX: # %bb.0: @@ -2546,25 +2877,42 @@ define void @vec384_v48i8_to_v16i24_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bi } define void @vec384_v48i8_to_v12i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec384_v48i8_to_v12i32_factor4: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: paddb 16(%rdx), %xmm0 -; SSE-NEXT: paddb (%rdx), %xmm3 -; SSE-NEXT: paddb 32(%rdx), %xmm2 -; SSE-NEXT: movdqa %xmm2, 32(%rcx) -; SSE-NEXT: movdqa %xmm3, (%rcx) -; SSE-NEXT: movdqa %xmm0, 16(%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec384_v48i8_to_v12i32_factor4: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: paddb 16(%rdx), %xmm0 +; SSE2-NEXT: paddb (%rdx), %xmm3 +; SSE2-NEXT: paddb 32(%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 32(%rcx) +; SSE2-NEXT: movdqa %xmm3, (%rcx) +; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec384_v48i8_to_v12i32_factor4: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE42-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE42-NEXT: paddb 16(%rdx), %xmm0 +; SSE42-NEXT: paddb 32(%rdx), %xmm2 +; SSE42-NEXT: paddb (%rdx), %xmm1 +; SSE42-NEXT: movdqa %xmm1, (%rcx) +; SSE42-NEXT: movdqa %xmm2, 32(%rcx) +; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_v48i8_to_v12i32_factor4: ; AVX: # %bb.0: @@ -2658,24 +3006,41 @@ define void @vec384_v48i8_to_v12i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bi } define void @vec384_v48i8_to_v8i48_factor6(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec384_v48i8_to_v8i48_factor6: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] -; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] -; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] -; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE-NEXT: paddb 16(%rdx), %xmm0 -; SSE-NEXT: paddb (%rdx), %xmm2 -; SSE-NEXT: paddb 32(%rdx), %xmm1 -; SSE-NEXT: movdqa %xmm1, 32(%rcx) -; SSE-NEXT: movdqa %xmm2, (%rcx) -; SSE-NEXT: movdqa %xmm0, 16(%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec384_v48i8_to_v8i48_factor6: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm0 +; SSE2-NEXT: paddb (%rdx), %xmm2 +; SSE2-NEXT: paddb 32(%rdx), %xmm1 +; SSE2-NEXT: movdqa %xmm1, 32(%rcx) +; SSE2-NEXT: movdqa %xmm2, (%rcx) +; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec384_v48i8_to_v8i48_factor6: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: movdqa %xmm0, %xmm1 +; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[6],zero,zero,zero,zero,zero,xmm1[7],zero,zero,zero,zero,zero +; SSE42-NEXT: movdqa %xmm0, %xmm2 +; SSE42-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,xmm2[2],zero,zero,zero +; SSE42-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,xmm0[5],zero +; SSE42-NEXT: paddb 16(%rdx), %xmm0 +; SSE42-NEXT: paddb (%rdx), %xmm2 +; SSE42-NEXT: paddb 32(%rdx), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 32(%rcx) +; SSE42-NEXT: movdqa %xmm2, (%rcx) +; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_v48i8_to_v8i48_factor6: ; AVX: # %bb.0: @@ -2749,26 +3114,43 @@ define void @vec384_v48i8_to_v8i48_factor6(ptr %in.vec.base.ptr, ptr %in.vec.bia } define void @vec384_v48i8_to_v6i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec384_v48i8_to_v6i64_factor8: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: paddb 16(%rdx), %xmm0 -; SSE-NEXT: paddb (%rdx), %xmm3 -; SSE-NEXT: paddb 32(%rdx), %xmm2 -; SSE-NEXT: movdqa %xmm2, 32(%rcx) -; SSE-NEXT: movdqa %xmm3, (%rcx) -; SSE-NEXT: movdqa %xmm0, 16(%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec384_v48i8_to_v6i64_factor8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: paddb 16(%rdx), %xmm0 +; SSE2-NEXT: paddb (%rdx), %xmm3 +; SSE2-NEXT: paddb 32(%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 32(%rcx) +; SSE2-NEXT: movdqa %xmm3, (%rcx) +; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec384_v48i8_to_v6i64_factor8: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; SSE42-NEXT: pmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: psrld $16, %xmm0 +; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: paddb 16(%rdx), %xmm0 +; SSE42-NEXT: paddb 32(%rdx), %xmm2 +; SSE42-NEXT: paddb (%rdx), %xmm1 +; SSE42-NEXT: movdqa %xmm1, (%rcx) +; SSE42-NEXT: movdqa %xmm2, 32(%rcx) +; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_v48i8_to_v6i64_factor8: ; AVX: # %bb.0: @@ -2862,28 +3244,45 @@ define void @vec384_v48i8_to_v6i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bia } define void @vec384_v48i8_to_v4i96_factor12(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec384_v48i8_to_v4i96_factor12: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] -; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2] -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE-NEXT: paddb (%rdx), %xmm0 -; SSE-NEXT: paddb 16(%rdx), %xmm2 -; SSE-NEXT: paddb 32(%rdx), %xmm1 -; SSE-NEXT: movdqa %xmm1, 32(%rcx) -; SSE-NEXT: movdqa %xmm2, 16(%rcx) -; SSE-NEXT: movdqa %xmm0, (%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec384_v48i8_to_v4i96_factor12: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2] +; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm2 +; SSE2-NEXT: paddb 32(%rdx), %xmm1 +; SSE2-NEXT: movdqa %xmm1, 32(%rcx) +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec384_v48i8_to_v4i96_factor12: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: movdqa %xmm0, %xmm1 +; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: movdqa %xmm0, %xmm2 +; SSE42-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero +; SSE42-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: paddb 16(%rdx), %xmm0 +; SSE42-NEXT: paddb (%rdx), %xmm2 +; SSE42-NEXT: paddb 32(%rdx), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 32(%rcx) +; SSE42-NEXT: movdqa %xmm2, (%rcx) +; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_v48i8_to_v4i96_factor12: ; AVX: # %bb.0: @@ -3069,32 +3468,47 @@ define void @vec384_v48i8_to_v3i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.b %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias - %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> - %zextd.vec = shufflevector <48 x i8> %in.vec.trunc, <48 x i8> zeroinitializer, <48 x i32> - %out.bytevec.padded = shufflevector <48 x i8> %zextd.vec, <48 x i8> poison, <64 x i32> - %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 - %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias - store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 - ret void -} - -define void @vec384_v48i8_to_v2i192_factor24(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec384_v48i8_to_v2i192_factor24: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] -; SSE-NEXT: movaps 32(%rdx), %xmm2 -; SSE-NEXT: paddb 16(%rdx), %xmm0 -; SSE-NEXT: paddb (%rdx), %xmm1 -; SSE-NEXT: movaps %xmm2, 32(%rcx) -; SSE-NEXT: movdqa %xmm1, (%rcx) -; SSE-NEXT: movdqa %xmm0, 16(%rcx) -; SSE-NEXT: retq + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> + %zextd.vec = shufflevector <48 x i8> %in.vec.trunc, <48 x i8> zeroinitializer, <48 x i32> + %out.bytevec.padded = shufflevector <48 x i8> %zextd.vec, <48 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec384_v48i8_to_v2i192_factor24(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE2-LABEL: vec384_v48i8_to_v2i192_factor24: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] +; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] +; SSE2-NEXT: movaps 32(%rdx), %xmm2 +; SSE2-NEXT: paddb 16(%rdx), %xmm0 +; SSE2-NEXT: paddb (%rdx), %xmm1 +; SSE2-NEXT: movaps %xmm2, 32(%rcx) +; SSE2-NEXT: movdqa %xmm1, (%rcx) +; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec384_v48i8_to_v2i192_factor24: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: movdqa %xmm0, %xmm1 +; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE42-NEXT: movaps 32(%rdx), %xmm2 +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: paddb 16(%rdx), %xmm1 +; SSE42-NEXT: movaps %xmm2, 32(%rcx) +; SSE42-NEXT: movdqa %xmm1, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_v48i8_to_v2i192_factor24: ; AVX: # %bb.0: @@ -3239,24 +3653,42 @@ define void @vec384_v48i8_to_v1i384_factor48(ptr %in.vec.base.ptr, ptr %in.vec.b } define void @vec384_v24i16_to_v12i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec384_v24i16_to_v12i32_factor2: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa 16(%rdi), %xmm1 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: paddb 16(%rsi), %xmm1 -; SSE-NEXT: pxor %xmm2, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE-NEXT: paddb 16(%rdx), %xmm0 -; SSE-NEXT: paddb (%rdx), %xmm3 -; SSE-NEXT: paddb 32(%rdx), %xmm1 -; SSE-NEXT: movdqa %xmm1, 32(%rcx) -; SSE-NEXT: movdqa %xmm3, (%rcx) -; SSE-NEXT: movdqa %xmm0, 16(%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec384_v24i16_to_v12i32_factor2: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa 16(%rdi), %xmm1 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 16(%rsi), %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-NEXT: paddb 16(%rdx), %xmm0 +; SSE2-NEXT: paddb (%rdx), %xmm3 +; SSE2-NEXT: paddb 32(%rdx), %xmm1 +; SSE2-NEXT: movdqa %xmm1, 32(%rcx) +; SSE2-NEXT: movdqa %xmm3, (%rcx) +; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec384_v24i16_to_v12i32_factor2: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: movdqa 16(%rdi), %xmm1 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm1 +; SSE42-NEXT: pxor %xmm2, %xmm2 +; SSE42-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; SSE42-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE42-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE42-NEXT: paddb 16(%rdx), %xmm0 +; SSE42-NEXT: paddb (%rdx), %xmm3 +; SSE42-NEXT: paddb 32(%rdx), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 32(%rcx) +; SSE42-NEXT: movdqa %xmm3, (%rcx) +; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_v24i16_to_v12i32_factor2: ; AVX: # %bb.0: @@ -3331,23 +3763,42 @@ define void @vec384_v24i16_to_v12i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.b } define void @vec384_v24i16_to_v8i48_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec384_v24i16_to_v8i48_factor3: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] -; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] -; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] -; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE-NEXT: paddb 16(%rdx), %xmm0 -; SSE-NEXT: paddb (%rdx), %xmm2 -; SSE-NEXT: paddb 32(%rdx), %xmm1 -; SSE-NEXT: movdqa %xmm1, 32(%rcx) -; SSE-NEXT: movdqa %xmm2, (%rcx) -; SSE-NEXT: movdqa %xmm0, 16(%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec384_v24i16_to_v8i48_factor3: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm0 +; SSE2-NEXT: paddb (%rdx), %xmm2 +; SSE2-NEXT: paddb 32(%rdx), %xmm1 +; SSE2-NEXT: movdqa %xmm1, 32(%rcx) +; SSE2-NEXT: movdqa %xmm2, (%rcx) +; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec384_v24i16_to_v8i48_factor3: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: pxor %xmm1, %xmm1 +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,1,1] +; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm1[1,2],xmm3[3],xmm1[4,5],xmm3[6],xmm1[7] +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7] +; SSE42-NEXT: paddb 16(%rdx), %xmm0 +; SSE42-NEXT: paddb (%rdx), %xmm3 +; SSE42-NEXT: paddb 32(%rdx), %xmm2 +; SSE42-NEXT: movdqa %xmm2, 32(%rcx) +; SSE42-NEXT: movdqa %xmm3, (%rcx) +; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_v24i16_to_v8i48_factor3: ; AVX: # %bb.0: @@ -3491,25 +3942,42 @@ define void @vec384_v24i16_to_v8i48_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bi } define void @vec384_v24i16_to_v6i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec384_v24i16_to_v6i64_factor4: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: paddb 16(%rdx), %xmm0 -; SSE-NEXT: paddb (%rdx), %xmm3 -; SSE-NEXT: paddb 32(%rdx), %xmm2 -; SSE-NEXT: movdqa %xmm2, 32(%rcx) -; SSE-NEXT: movdqa %xmm3, (%rcx) -; SSE-NEXT: movdqa %xmm0, 16(%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec384_v24i16_to_v6i64_factor4: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: paddb 16(%rdx), %xmm0 +; SSE2-NEXT: paddb (%rdx), %xmm3 +; SSE2-NEXT: paddb 32(%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 32(%rcx) +; SSE2-NEXT: movdqa %xmm3, (%rcx) +; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec384_v24i16_to_v6i64_factor4: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE42-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE42-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; SSE42-NEXT: paddb 16(%rdx), %xmm0 +; SSE42-NEXT: paddb 32(%rdx), %xmm2 +; SSE42-NEXT: paddb (%rdx), %xmm1 +; SSE42-NEXT: movdqa %xmm1, (%rcx) +; SSE42-NEXT: movdqa %xmm2, 32(%rcx) +; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_v24i16_to_v6i64_factor4: ; AVX: # %bb.0: @@ -3607,26 +4075,45 @@ define void @vec384_v24i16_to_v6i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bi } define void @vec384_v24i16_to_v4i96_factor6(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec384_v24i16_to_v4i96_factor6: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] -; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,1,1,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] -; SSE-NEXT: paddb 16(%rdx), %xmm0 -; SSE-NEXT: paddb (%rdx), %xmm2 -; SSE-NEXT: paddb 32(%rdx), %xmm1 -; SSE-NEXT: movdqa %xmm1, 32(%rcx) -; SSE-NEXT: movdqa %xmm2, (%rcx) -; SSE-NEXT: movdqa %xmm0, 16(%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec384_v24i16_to_v4i96_factor6: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,1,1,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] +; SSE2-NEXT: paddb 16(%rdx), %xmm0 +; SSE2-NEXT: paddb (%rdx), %xmm2 +; SSE2-NEXT: paddb 32(%rdx), %xmm1 +; SSE2-NEXT: movdqa %xmm1, 32(%rcx) +; SSE2-NEXT: movdqa %xmm2, (%rcx) +; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec384_v24i16_to_v4i96_factor6: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: pxor %xmm1, %xmm1 +; SSE42-NEXT: movdqa %xmm0, %xmm2 +; SSE42-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[2,3],zero,zero +; SSE42-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero +; SSE42-NEXT: psrld $16, %xmm0 +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4,5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm3[4],xmm1[5,6,7] +; SSE42-NEXT: paddb 16(%rdx), %xmm3 +; SSE42-NEXT: paddb 32(%rdx), %xmm0 +; SSE42-NEXT: paddb (%rdx), %xmm2 +; SSE42-NEXT: movdqa %xmm2, (%rcx) +; SSE42-NEXT: movdqa %xmm0, 32(%rcx) +; SSE42-NEXT: movdqa %xmm3, 16(%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_v24i16_to_v4i96_factor6: ; AVX: # %bb.0: @@ -3769,23 +4256,41 @@ define void @vec384_v24i16_to_v4i96_factor6(ptr %in.vec.base.ptr, ptr %in.vec.bi } define void @vec384_v24i16_to_v3i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec384_v24i16_to_v3i128_factor8: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: paddb 16(%rdx), %xmm2 -; SSE-NEXT: paddb 32(%rdx), %xmm0 -; SSE-NEXT: paddb (%rdx), %xmm1 -; SSE-NEXT: movdqa %xmm1, (%rcx) -; SSE-NEXT: movdqa %xmm0, 32(%rcx) -; SSE-NEXT: movdqa %xmm2, 16(%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec384_v24i16_to_v3i128_factor8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: paddb 16(%rdx), %xmm2 +; SSE2-NEXT: paddb 32(%rdx), %xmm0 +; SSE2-NEXT: paddb (%rdx), %xmm1 +; SSE2-NEXT: movdqa %xmm1, (%rcx) +; SSE2-NEXT: movdqa %xmm0, 32(%rcx) +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec384_v24i16_to_v3i128_factor8: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: pxor %xmm1, %xmm1 +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] +; SSE42-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; SSE42-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: paddb 16(%rdx), %xmm2 +; SSE42-NEXT: paddb 32(%rdx), %xmm0 +; SSE42-NEXT: paddb (%rdx), %xmm1 +; SSE42-NEXT: movdqa %xmm1, (%rcx) +; SSE42-NEXT: movdqa %xmm0, 32(%rcx) +; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_v24i16_to_v3i128_factor8: ; AVX: # %bb.0: @@ -3932,22 +4437,39 @@ define void @vec384_v24i16_to_v3i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.b } define void @vec384_v24i16_to_v2i192_factor12(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec384_v24i16_to_v2i192_factor12: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] -; SSE-NEXT: movaps 32(%rdx), %xmm2 -; SSE-NEXT: paddb 16(%rdx), %xmm0 -; SSE-NEXT: paddb (%rdx), %xmm1 -; SSE-NEXT: movaps %xmm2, 32(%rcx) -; SSE-NEXT: movdqa %xmm1, (%rcx) -; SSE-NEXT: movdqa %xmm0, 16(%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec384_v24i16_to_v2i192_factor12: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] +; SSE2-NEXT: movaps 32(%rdx), %xmm2 +; SSE2-NEXT: paddb 16(%rdx), %xmm0 +; SSE2-NEXT: paddb (%rdx), %xmm1 +; SSE2-NEXT: movaps %xmm2, 32(%rcx) +; SSE2-NEXT: movdqa %xmm1, (%rcx) +; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec384_v24i16_to_v2i192_factor12: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: pxor %xmm1, %xmm1 +; SSE42-NEXT: pxor %xmm2, %xmm2 +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7] +; SSE42-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7] +; SSE42-NEXT: movaps 32(%rdx), %xmm1 +; SSE42-NEXT: paddb 16(%rdx), %xmm0 +; SSE42-NEXT: paddb (%rdx), %xmm2 +; SSE42-NEXT: movaps %xmm1, 32(%rcx) +; SSE42-NEXT: movdqa %xmm2, (%rcx) +; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_v24i16_to_v2i192_factor12: ; AVX: # %bb.0: @@ -4020,18 +4542,32 @@ define void @vec384_v24i16_to_v2i192_factor12(ptr %in.vec.base.ptr, ptr %in.vec. } define void @vec384_v24i16_to_v1i384_factor24(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec384_v24i16_to_v1i384_factor24: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE-NEXT: movaps 16(%rdx), %xmm1 -; SSE-NEXT: movaps 32(%rdx), %xmm2 -; SSE-NEXT: paddb (%rdx), %xmm0 -; SSE-NEXT: movaps %xmm1, 16(%rcx) -; SSE-NEXT: movaps %xmm2, 32(%rcx) -; SSE-NEXT: movdqa %xmm0, (%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec384_v24i16_to_v1i384_factor24: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: movaps 16(%rdx), %xmm1 +; SSE2-NEXT: movaps 32(%rdx), %xmm2 +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: movaps %xmm1, 16(%rcx) +; SSE2-NEXT: movaps %xmm2, 32(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec384_v24i16_to_v1i384_factor24: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: pxor %xmm1, %xmm1 +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; SSE42-NEXT: movaps 16(%rdx), %xmm0 +; SSE42-NEXT: movaps 32(%rdx), %xmm2 +; SSE42-NEXT: paddb (%rdx), %xmm1 +; SSE42-NEXT: movaps %xmm0, 16(%rcx) +; SSE42-NEXT: movaps %xmm2, 32(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_v24i16_to_v1i384_factor24: ; AVX: # %bb.0: @@ -4098,24 +4634,42 @@ define void @vec384_v24i16_to_v1i384_factor24(ptr %in.vec.base.ptr, ptr %in.vec. } define void @vec384_v12i32_to_v6i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec384_v12i32_to_v6i64_factor2: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa 16(%rdi), %xmm1 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: paddb 16(%rsi), %xmm1 -; SSE-NEXT: pxor %xmm2, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: paddb 16(%rdx), %xmm0 -; SSE-NEXT: paddb (%rdx), %xmm3 -; SSE-NEXT: paddb 32(%rdx), %xmm1 -; SSE-NEXT: movdqa %xmm1, 32(%rcx) -; SSE-NEXT: movdqa %xmm3, (%rcx) -; SSE-NEXT: movdqa %xmm0, 16(%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec384_v12i32_to_v6i64_factor2: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa 16(%rdi), %xmm1 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 16(%rsi), %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: paddb 16(%rdx), %xmm0 +; SSE2-NEXT: paddb (%rdx), %xmm3 +; SSE2-NEXT: paddb 32(%rdx), %xmm1 +; SSE2-NEXT: movdqa %xmm1, 32(%rcx) +; SSE2-NEXT: movdqa %xmm3, (%rcx) +; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec384_v12i32_to_v6i64_factor2: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: movdqa 16(%rdi), %xmm1 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm1 +; SSE42-NEXT: pxor %xmm2, %xmm2 +; SSE42-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; SSE42-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero +; SSE42-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE42-NEXT: paddb 16(%rdx), %xmm0 +; SSE42-NEXT: paddb (%rdx), %xmm3 +; SSE42-NEXT: paddb 32(%rdx), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 32(%rcx) +; SSE42-NEXT: movdqa %xmm3, (%rcx) +; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_v12i32_to_v6i64_factor2: ; AVX: # %bb.0: @@ -4193,25 +4747,44 @@ define void @vec384_v12i32_to_v6i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bi } define void @vec384_v12i32_to_v4i96_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec384_v12i32_to_v4i96_factor3: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: xorps %xmm1, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,0,4294967295,0] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3] -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,1,3] -; SSE-NEXT: paddb (%rdx), %xmm0 -; SSE-NEXT: paddb 32(%rdx), %xmm3 -; SSE-NEXT: paddb 16(%rdx), %xmm2 -; SSE-NEXT: movdqa %xmm2, 16(%rcx) -; SSE-NEXT: movdqa %xmm3, 32(%rcx) -; SSE-NEXT: movdqa %xmm0, (%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec384_v12i32_to_v4i96_factor3: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,0,4294967295,0] +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3] +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,1,3] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 32(%rdx), %xmm3 +; SSE2-NEXT: paddb 16(%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm3, 32(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec384_v12i32_to_v4i96_factor3: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: pxor %xmm1, %xmm1 +; SSE42-NEXT: pxor %xmm2, %xmm2 +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7] +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: paddb 32(%rdx), %xmm3 +; SSE42-NEXT: paddb 16(%rdx), %xmm2 +; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm3, 32(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_v12i32_to_v4i96_factor3: ; AVX: # %bb.0: @@ -4340,25 +4913,44 @@ define void @vec384_v12i32_to_v4i96_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bi } define void @vec384_v12i32_to_v3i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec384_v12i32_to_v3i128_factor4: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: xorps %xmm1, %xmm1 -; SSE-NEXT: xorps %xmm2, %xmm2 -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm1[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[1,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] -; SSE-NEXT: paddb 16(%rdx), %xmm0 -; SSE-NEXT: paddb 32(%rdx), %xmm3 -; SSE-NEXT: paddb (%rdx), %xmm2 -; SSE-NEXT: movdqa %xmm2, (%rcx) -; SSE-NEXT: movdqa %xmm3, 32(%rcx) -; SSE-NEXT: movdqa %xmm0, 16(%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec384_v12i32_to_v3i128_factor4: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm1[2,3] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[1,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] +; SSE2-NEXT: paddb 16(%rdx), %xmm0 +; SSE2-NEXT: paddb 32(%rdx), %xmm3 +; SSE2-NEXT: paddb (%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm2, (%rcx) +; SSE2-NEXT: movdqa %xmm3, 32(%rcx) +; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec384_v12i32_to_v3i128_factor4: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: pxor %xmm1, %xmm1 +; SSE42-NEXT: pxor %xmm2, %xmm2 +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3,4,5,6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; SSE42-NEXT: paddb 16(%rdx), %xmm0 +; SSE42-NEXT: paddb 32(%rdx), %xmm3 +; SSE42-NEXT: paddb (%rdx), %xmm2 +; SSE42-NEXT: movdqa %xmm2, (%rcx) +; SSE42-NEXT: movdqa %xmm3, 32(%rcx) +; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_v12i32_to_v3i128_factor4: ; AVX: # %bb.0: @@ -4490,22 +5082,39 @@ define void @vec384_v12i32_to_v3i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.b } define void @vec384_v12i32_to_v2i192_factor6(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec384_v12i32_to_v2i192_factor6: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: xorps %xmm1, %xmm1 -; SSE-NEXT: xorps %xmm2, %xmm2 -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] -; SSE-NEXT: movaps 32(%rdx), %xmm0 -; SSE-NEXT: paddb 16(%rdx), %xmm1 -; SSE-NEXT: paddb (%rdx), %xmm2 -; SSE-NEXT: movaps %xmm0, 32(%rcx) -; SSE-NEXT: movdqa %xmm2, (%rcx) -; SSE-NEXT: movdqa %xmm1, 16(%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec384_v12i32_to_v2i192_factor6: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[3,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE2-NEXT: movaps 32(%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm1 +; SSE2-NEXT: paddb (%rdx), %xmm2 +; SSE2-NEXT: movaps %xmm0, 32(%rcx) +; SSE2-NEXT: movdqa %xmm2, (%rcx) +; SSE2-NEXT: movdqa %xmm1, 16(%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec384_v12i32_to_v2i192_factor6: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: pxor %xmm1, %xmm1 +; SSE42-NEXT: pxor %xmm2, %xmm2 +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7] +; SSE42-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; SSE42-NEXT: movaps 32(%rdx), %xmm1 +; SSE42-NEXT: paddb 16(%rdx), %xmm0 +; SSE42-NEXT: paddb (%rdx), %xmm2 +; SSE42-NEXT: movaps %xmm1, 32(%rcx) +; SSE42-NEXT: movdqa %xmm2, (%rcx) +; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_v12i32_to_v2i192_factor6: ; AVX: # %bb.0: @@ -4611,19 +5220,33 @@ define void @vec384_v12i32_to_v2i192_factor6(ptr %in.vec.base.ptr, ptr %in.vec.b } define void @vec384_v12i32_to_v1i384_factor12(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec384_v12i32_to_v1i384_factor12: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: xorps %xmm1, %xmm1 -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 16(%rdx), %xmm0 -; SSE-NEXT: movaps 32(%rdx), %xmm2 -; SSE-NEXT: paddb (%rdx), %xmm1 -; SSE-NEXT: movaps %xmm0, 16(%rcx) -; SSE-NEXT: movaps %xmm2, 32(%rcx) -; SSE-NEXT: movdqa %xmm1, (%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec384_v12i32_to_v1i384_factor12: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; SSE2-NEXT: movaps 16(%rdx), %xmm0 +; SSE2-NEXT: movaps 32(%rdx), %xmm2 +; SSE2-NEXT: paddb (%rdx), %xmm1 +; SSE2-NEXT: movaps %xmm0, 16(%rcx) +; SSE2-NEXT: movaps %xmm2, 32(%rcx) +; SSE2-NEXT: movdqa %xmm1, (%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec384_v12i32_to_v1i384_factor12: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: pxor %xmm1, %xmm1 +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; SSE42-NEXT: movaps 16(%rdx), %xmm0 +; SSE42-NEXT: movaps 32(%rdx), %xmm2 +; SSE42-NEXT: paddb (%rdx), %xmm1 +; SSE42-NEXT: movaps %xmm0, 16(%rcx) +; SSE42-NEXT: movaps %xmm2, 32(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_v12i32_to_v1i384_factor12: ; AVX: # %bb.0: @@ -4788,20 +5411,35 @@ define void @vec384_v6i64_to_v3i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bi } define void @vec384_v6i64_to_v2i192_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec384_v6i64_to_v2i192_factor3: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero -; SSE-NEXT: movaps 32(%rdx), %xmm2 -; SSE-NEXT: paddb (%rdx), %xmm0 -; SSE-NEXT: paddb 16(%rdx), %xmm1 -; SSE-NEXT: movaps %xmm2, 32(%rcx) -; SSE-NEXT: movdqa %xmm1, 16(%rcx) -; SSE-NEXT: movdqa %xmm0, (%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec384_v6i64_to_v2i192_factor3: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero +; SSE2-NEXT: movaps 32(%rdx), %xmm2 +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm1 +; SSE2-NEXT: movaps %xmm2, 32(%rcx) +; SSE2-NEXT: movdqa %xmm1, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec384_v6i64_to_v2i192_factor3: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: pxor %xmm1, %xmm1 +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; SSE42-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero +; SSE42-NEXT: movaps 32(%rdx), %xmm2 +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: paddb 16(%rdx), %xmm1 +; SSE42-NEXT: movaps %xmm2, 32(%rcx) +; SSE42-NEXT: movdqa %xmm1, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_v6i64_to_v2i192_factor3: ; AVX: # %bb.0: @@ -5018,28 +5656,49 @@ define void @vec384_v3i128_to_v1i384_factor3(ptr %in.vec.base.ptr, ptr %in.vec.b } define void @vec512_v64i8_to_v32i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec512_v64i8_to_v32i16_factor2: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa 16(%rdi), %xmm1 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: paddb 16(%rsi), %xmm1 -; SSE-NEXT: pxor %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; SSE-NEXT: paddb 16(%rdx), %xmm0 -; SSE-NEXT: paddb (%rdx), %xmm4 -; SSE-NEXT: paddb 48(%rdx), %xmm1 -; SSE-NEXT: paddb 32(%rdx), %xmm3 -; SSE-NEXT: movdqa %xmm3, 32(%rcx) -; SSE-NEXT: movdqa %xmm1, 48(%rcx) -; SSE-NEXT: movdqa %xmm4, (%rcx) -; SSE-NEXT: movdqa %xmm0, 16(%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec512_v64i8_to_v32i16_factor2: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa 16(%rdi), %xmm1 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 16(%rsi), %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE2-NEXT: paddb 16(%rdx), %xmm0 +; SSE2-NEXT: paddb (%rdx), %xmm4 +; SSE2-NEXT: paddb 48(%rdx), %xmm1 +; SSE2-NEXT: paddb 32(%rdx), %xmm3 +; SSE2-NEXT: movdqa %xmm3, 32(%rcx) +; SSE2-NEXT: movdqa %xmm1, 48(%rcx) +; SSE2-NEXT: movdqa %xmm4, (%rcx) +; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec512_v64i8_to_v32i16_factor2: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: movdqa 16(%rdi), %xmm1 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm1 +; SSE42-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; SSE42-NEXT: pxor %xmm3, %xmm3 +; SSE42-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] +; SSE42-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE42-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] +; SSE42-NEXT: paddb 16(%rdx), %xmm0 +; SSE42-NEXT: paddb (%rdx), %xmm4 +; SSE42-NEXT: paddb 48(%rdx), %xmm1 +; SSE42-NEXT: paddb 32(%rdx), %xmm2 +; SSE42-NEXT: movdqa %xmm2, 32(%rcx) +; SSE42-NEXT: movdqa %xmm1, 48(%rcx) +; SSE42-NEXT: movdqa %xmm4, (%rcx) +; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_v64i8_to_v32i16_factor2: ; AVX: # %bb.0: @@ -5110,29 +5769,50 @@ define void @vec512_v64i8_to_v32i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bi } define void @vec512_v64i8_to_v16i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec512_v64i8_to_v16i32_factor4: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: paddb 16(%rdx), %xmm0 -; SSE-NEXT: paddb (%rdx), %xmm4 -; SSE-NEXT: paddb 48(%rdx), %xmm2 -; SSE-NEXT: paddb 32(%rdx), %xmm3 -; SSE-NEXT: movdqa %xmm3, 32(%rcx) -; SSE-NEXT: movdqa %xmm2, 48(%rcx) -; SSE-NEXT: movdqa %xmm4, (%rcx) -; SSE-NEXT: movdqa %xmm0, 16(%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec512_v64i8_to_v16i32_factor4: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: paddb 16(%rdx), %xmm0 +; SSE2-NEXT: paddb (%rdx), %xmm4 +; SSE2-NEXT: paddb 48(%rdx), %xmm2 +; SSE2-NEXT: paddb 32(%rdx), %xmm3 +; SSE2-NEXT: movdqa %xmm3, 32(%rcx) +; SSE2-NEXT: movdqa %xmm2, 48(%rcx) +; SSE2-NEXT: movdqa %xmm4, (%rcx) +; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec512_v64i8_to_v16i32_factor4: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE42-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; SSE42-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE42-NEXT: paddb 16(%rdx), %xmm0 +; SSE42-NEXT: paddb 48(%rdx), %xmm3 +; SSE42-NEXT: paddb 32(%rdx), %xmm2 +; SSE42-NEXT: paddb (%rdx), %xmm1 +; SSE42-NEXT: movdqa %xmm1, (%rcx) +; SSE42-NEXT: movdqa %xmm2, 32(%rcx) +; SSE42-NEXT: movdqa %xmm3, 48(%rcx) +; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_v64i8_to_v16i32_factor4: ; AVX: # %bb.0: @@ -5202,30 +5882,52 @@ define void @vec512_v64i8_to_v16i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bi } define void @vec512_v64i8_to_v8i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec512_v64i8_to_v8i64_factor8: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: paddb 16(%rdx), %xmm0 -; SSE-NEXT: paddb (%rdx), %xmm4 -; SSE-NEXT: paddb 48(%rdx), %xmm2 -; SSE-NEXT: paddb 32(%rdx), %xmm3 -; SSE-NEXT: movdqa %xmm3, 32(%rcx) -; SSE-NEXT: movdqa %xmm2, 48(%rcx) -; SSE-NEXT: movdqa %xmm4, (%rcx) -; SSE-NEXT: movdqa %xmm0, 16(%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec512_v64i8_to_v8i64_factor8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: paddb 16(%rdx), %xmm0 +; SSE2-NEXT: paddb (%rdx), %xmm4 +; SSE2-NEXT: paddb 48(%rdx), %xmm2 +; SSE2-NEXT: paddb 32(%rdx), %xmm3 +; SSE2-NEXT: movdqa %xmm3, 32(%rcx) +; SSE2-NEXT: movdqa %xmm2, 48(%rcx) +; SSE2-NEXT: movdqa %xmm4, (%rcx) +; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec512_v64i8_to_v8i64_factor8: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; SSE42-NEXT: pmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: movdqa %xmm0, %xmm3 +; SSE42-NEXT: psrlq $48, %xmm3 +; SSE42-NEXT: pmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: psrld $16, %xmm0 +; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: paddb 16(%rdx), %xmm0 +; SSE42-NEXT: paddb 48(%rdx), %xmm3 +; SSE42-NEXT: paddb 32(%rdx), %xmm2 +; SSE42-NEXT: paddb (%rdx), %xmm1 +; SSE42-NEXT: movdqa %xmm1, (%rcx) +; SSE42-NEXT: movdqa %xmm2, 32(%rcx) +; SSE42-NEXT: movdqa %xmm3, 48(%rcx) +; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_v64i8_to_v8i64_factor8: ; AVX: # %bb.0: @@ -5641,28 +6343,49 @@ define void @vec512_v64i8_to_v1i512_factor64(ptr %in.vec.base.ptr, ptr %in.vec.b } define void @vec512_v32i16_to_v16i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec512_v32i16_to_v16i32_factor2: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa 16(%rdi), %xmm1 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: paddb 16(%rsi), %xmm1 -; SSE-NEXT: pxor %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE-NEXT: paddb 16(%rdx), %xmm0 -; SSE-NEXT: paddb (%rdx), %xmm4 -; SSE-NEXT: paddb 48(%rdx), %xmm1 -; SSE-NEXT: paddb 32(%rdx), %xmm3 -; SSE-NEXT: movdqa %xmm3, 32(%rcx) -; SSE-NEXT: movdqa %xmm1, 48(%rcx) -; SSE-NEXT: movdqa %xmm4, (%rcx) -; SSE-NEXT: movdqa %xmm0, 16(%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec512_v32i16_to_v16i32_factor2: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa 16(%rdi), %xmm1 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 16(%rsi), %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-NEXT: paddb 16(%rdx), %xmm0 +; SSE2-NEXT: paddb (%rdx), %xmm4 +; SSE2-NEXT: paddb 48(%rdx), %xmm1 +; SSE2-NEXT: paddb 32(%rdx), %xmm3 +; SSE2-NEXT: movdqa %xmm3, 32(%rcx) +; SSE2-NEXT: movdqa %xmm1, 48(%rcx) +; SSE2-NEXT: movdqa %xmm4, (%rcx) +; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec512_v32i16_to_v16i32_factor2: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: movdqa 16(%rdi), %xmm1 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm1 +; SSE42-NEXT: pxor %xmm2, %xmm2 +; SSE42-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; SSE42-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE42-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE42-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE42-NEXT: paddb 16(%rdx), %xmm0 +; SSE42-NEXT: paddb (%rdx), %xmm4 +; SSE42-NEXT: paddb 48(%rdx), %xmm1 +; SSE42-NEXT: paddb 32(%rdx), %xmm3 +; SSE42-NEXT: movdqa %xmm3, 32(%rcx) +; SSE42-NEXT: movdqa %xmm1, 48(%rcx) +; SSE42-NEXT: movdqa %xmm4, (%rcx) +; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_v32i16_to_v16i32_factor2: ; AVX: # %bb.0: @@ -5734,29 +6457,50 @@ define void @vec512_v32i16_to_v16i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.b } define void @vec512_v32i16_to_v8i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec512_v32i16_to_v8i64_factor4: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: paddb 16(%rdx), %xmm0 -; SSE-NEXT: paddb (%rdx), %xmm4 -; SSE-NEXT: paddb 48(%rdx), %xmm2 -; SSE-NEXT: paddb 32(%rdx), %xmm3 -; SSE-NEXT: movdqa %xmm3, 32(%rcx) -; SSE-NEXT: movdqa %xmm2, 48(%rcx) -; SSE-NEXT: movdqa %xmm4, (%rcx) -; SSE-NEXT: movdqa %xmm0, 16(%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec512_v32i16_to_v8i64_factor4: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: paddb 16(%rdx), %xmm0 +; SSE2-NEXT: paddb (%rdx), %xmm4 +; SSE2-NEXT: paddb 48(%rdx), %xmm2 +; SSE2-NEXT: paddb 32(%rdx), %xmm3 +; SSE2-NEXT: movdqa %xmm3, 32(%rcx) +; SSE2-NEXT: movdqa %xmm2, 48(%rcx) +; SSE2-NEXT: movdqa %xmm4, (%rcx) +; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec512_v32i16_to_v8i64_factor4: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE42-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; SSE42-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE42-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; SSE42-NEXT: paddb 16(%rdx), %xmm0 +; SSE42-NEXT: paddb 48(%rdx), %xmm3 +; SSE42-NEXT: paddb 32(%rdx), %xmm2 +; SSE42-NEXT: paddb (%rdx), %xmm1 +; SSE42-NEXT: movdqa %xmm1, (%rcx) +; SSE42-NEXT: movdqa %xmm2, 32(%rcx) +; SSE42-NEXT: movdqa %xmm3, 48(%rcx) +; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_v32i16_to_v8i64_factor4: ; AVX: # %bb.0: @@ -5828,27 +6572,49 @@ define void @vec512_v32i16_to_v8i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bi } define void @vec512_v32i16_to_v4i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec512_v32i16_to_v4i128_factor8: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: paddb 16(%rdx), %xmm3 -; SSE-NEXT: paddb 48(%rdx), %xmm2 -; SSE-NEXT: paddb 32(%rdx), %xmm0 -; SSE-NEXT: paddb (%rdx), %xmm1 -; SSE-NEXT: movdqa %xmm1, (%rcx) -; SSE-NEXT: movdqa %xmm0, 32(%rcx) -; SSE-NEXT: movdqa %xmm2, 48(%rcx) -; SSE-NEXT: movdqa %xmm3, 16(%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec512_v32i16_to_v4i128_factor8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: paddb 16(%rdx), %xmm3 +; SSE2-NEXT: paddb 48(%rdx), %xmm2 +; SSE2-NEXT: paddb 32(%rdx), %xmm0 +; SSE2-NEXT: paddb (%rdx), %xmm1 +; SSE2-NEXT: movdqa %xmm1, (%rcx) +; SSE2-NEXT: movdqa %xmm0, 32(%rcx) +; SSE2-NEXT: movdqa %xmm2, 48(%rcx) +; SSE2-NEXT: movdqa %xmm3, 16(%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec512_v32i16_to_v4i128_factor8: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: pxor %xmm1, %xmm1 +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] +; SSE42-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; SSE42-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: paddb 16(%rdx), %xmm3 +; SSE42-NEXT: paddb 48(%rdx), %xmm2 +; SSE42-NEXT: paddb 32(%rdx), %xmm0 +; SSE42-NEXT: paddb (%rdx), %xmm1 +; SSE42-NEXT: movdqa %xmm1, (%rcx) +; SSE42-NEXT: movdqa %xmm0, 32(%rcx) +; SSE42-NEXT: movdqa %xmm2, 48(%rcx) +; SSE42-NEXT: movdqa %xmm3, 16(%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_v32i16_to_v4i128_factor8: ; AVX: # %bb.0: @@ -5988,23 +6754,41 @@ define void @vec512_v32i16_to_v4i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.b } define void @vec512_v32i16_to_v2i256_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec512_v32i16_to_v2i256_factor16: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: movaps 16(%rdx), %xmm2 -; SSE-NEXT: movaps 48(%rdx), %xmm3 -; SSE-NEXT: paddb 32(%rdx), %xmm0 -; SSE-NEXT: paddb (%rdx), %xmm1 -; SSE-NEXT: movaps %xmm3, 48(%rcx) -; SSE-NEXT: movaps %xmm2, 16(%rcx) -; SSE-NEXT: movdqa %xmm1, (%rcx) -; SSE-NEXT: movdqa %xmm0, 32(%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec512_v32i16_to_v2i256_factor16: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: movaps 16(%rdx), %xmm2 +; SSE2-NEXT: movaps 48(%rdx), %xmm3 +; SSE2-NEXT: paddb 32(%rdx), %xmm0 +; SSE2-NEXT: paddb (%rdx), %xmm1 +; SSE2-NEXT: movaps %xmm3, 48(%rcx) +; SSE2-NEXT: movaps %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm1, (%rcx) +; SSE2-NEXT: movdqa %xmm0, 32(%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec512_v32i16_to_v2i256_factor16: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: pxor %xmm1, %xmm1 +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE42-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: movaps 16(%rdx), %xmm2 +; SSE42-NEXT: movaps 48(%rdx), %xmm3 +; SSE42-NEXT: paddb 32(%rdx), %xmm0 +; SSE42-NEXT: paddb (%rdx), %xmm1 +; SSE42-NEXT: movaps %xmm3, 48(%rcx) +; SSE42-NEXT: movaps %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) +; SSE42-NEXT: movdqa %xmm0, 32(%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_v32i16_to_v2i256_factor16: ; AVX: # %bb.0: @@ -6077,20 +6861,36 @@ define void @vec512_v32i16_to_v2i256_factor16(ptr %in.vec.base.ptr, ptr %in.vec. } define void @vec512_v32i16_to_v1i512_factor32(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec512_v32i16_to_v1i512_factor32: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE-NEXT: movaps 16(%rdx), %xmm1 -; SSE-NEXT: movaps 32(%rdx), %xmm2 -; SSE-NEXT: movaps 48(%rdx), %xmm3 -; SSE-NEXT: paddb (%rdx), %xmm0 -; SSE-NEXT: movaps %xmm2, 32(%rcx) -; SSE-NEXT: movaps %xmm3, 48(%rcx) -; SSE-NEXT: movaps %xmm1, 16(%rcx) -; SSE-NEXT: movdqa %xmm0, (%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec512_v32i16_to_v1i512_factor32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: movaps 16(%rdx), %xmm1 +; SSE2-NEXT: movaps 32(%rdx), %xmm2 +; SSE2-NEXT: movaps 48(%rdx), %xmm3 +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: movaps %xmm2, 32(%rcx) +; SSE2-NEXT: movaps %xmm3, 48(%rcx) +; SSE2-NEXT: movaps %xmm1, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec512_v32i16_to_v1i512_factor32: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: pxor %xmm1, %xmm1 +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; SSE42-NEXT: movaps 16(%rdx), %xmm0 +; SSE42-NEXT: movaps 32(%rdx), %xmm2 +; SSE42-NEXT: movaps 48(%rdx), %xmm3 +; SSE42-NEXT: paddb (%rdx), %xmm1 +; SSE42-NEXT: movaps %xmm2, 32(%rcx) +; SSE42-NEXT: movaps %xmm3, 48(%rcx) +; SSE42-NEXT: movaps %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_v32i16_to_v1i512_factor32: ; AVX: # %bb.0: @@ -6155,28 +6955,49 @@ define void @vec512_v32i16_to_v1i512_factor32(ptr %in.vec.base.ptr, ptr %in.vec. } define void @vec512_v16i32_to_v8i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec512_v16i32_to_v8i64_factor2: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa 16(%rdi), %xmm1 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: paddb 16(%rsi), %xmm1 -; SSE-NEXT: pxor %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: paddb 16(%rdx), %xmm0 -; SSE-NEXT: paddb (%rdx), %xmm4 -; SSE-NEXT: paddb 48(%rdx), %xmm1 -; SSE-NEXT: paddb 32(%rdx), %xmm3 -; SSE-NEXT: movdqa %xmm3, 32(%rcx) -; SSE-NEXT: movdqa %xmm1, 48(%rcx) -; SSE-NEXT: movdqa %xmm4, (%rcx) -; SSE-NEXT: movdqa %xmm0, 16(%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec512_v16i32_to_v8i64_factor2: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa 16(%rdi), %xmm1 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 16(%rsi), %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: paddb 16(%rdx), %xmm0 +; SSE2-NEXT: paddb (%rdx), %xmm4 +; SSE2-NEXT: paddb 48(%rdx), %xmm1 +; SSE2-NEXT: paddb 32(%rdx), %xmm3 +; SSE2-NEXT: movdqa %xmm3, 32(%rcx) +; SSE2-NEXT: movdqa %xmm1, 48(%rcx) +; SSE2-NEXT: movdqa %xmm4, (%rcx) +; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec512_v16i32_to_v8i64_factor2: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: movdqa 16(%rdi), %xmm1 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm1 +; SSE42-NEXT: pxor %xmm2, %xmm2 +; SSE42-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero +; SSE42-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE42-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero +; SSE42-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE42-NEXT: paddb 16(%rdx), %xmm0 +; SSE42-NEXT: paddb (%rdx), %xmm4 +; SSE42-NEXT: paddb 48(%rdx), %xmm1 +; SSE42-NEXT: paddb 32(%rdx), %xmm3 +; SSE42-NEXT: movdqa %xmm3, 32(%rcx) +; SSE42-NEXT: movdqa %xmm1, 48(%rcx) +; SSE42-NEXT: movdqa %xmm4, (%rcx) +; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_v16i32_to_v8i64_factor2: ; AVX: # %bb.0: @@ -6255,29 +7076,51 @@ define void @vec512_v16i32_to_v8i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bi } define void @vec512_v16i32_to_v4i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec512_v16i32_to_v4i128_factor4: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: xorps %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: xorps %xmm3, %xmm3 -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[1,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] -; SSE-NEXT: paddb 16(%rdx), %xmm0 -; SSE-NEXT: paddb 32(%rdx), %xmm4 -; SSE-NEXT: paddb (%rdx), %xmm3 -; SSE-NEXT: paddb 48(%rdx), %xmm2 -; SSE-NEXT: movdqa %xmm2, 48(%rcx) -; SSE-NEXT: movdqa %xmm3, (%rcx) -; SSE-NEXT: movdqa %xmm4, 32(%rcx) -; SSE-NEXT: movdqa %xmm0, 16(%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec512_v16i32_to_v4i128_factor4: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: xorps %xmm3, %xmm3 +; SSE2-NEXT: movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3] +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: psrldq {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[2,3] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[1,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] +; SSE2-NEXT: paddb 16(%rdx), %xmm0 +; SSE2-NEXT: paddb 32(%rdx), %xmm4 +; SSE2-NEXT: paddb (%rdx), %xmm3 +; SSE2-NEXT: paddb 48(%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 48(%rcx) +; SSE2-NEXT: movdqa %xmm3, (%rcx) +; SSE2-NEXT: movdqa %xmm4, 32(%rcx) +; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec512_v16i32_to_v4i128_factor4: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: pxor %xmm1, %xmm1 +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] +; SSE42-NEXT: psrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: pxor %xmm4, %xmm4 +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3,4,5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3,4,5,6,7] +; SSE42-NEXT: paddb 16(%rdx), %xmm3 +; SSE42-NEXT: paddb 32(%rdx), %xmm2 +; SSE42-NEXT: paddb (%rdx), %xmm1 +; SSE42-NEXT: paddb 48(%rdx), %xmm0 +; SSE42-NEXT: movdqa %xmm0, 48(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) +; SSE42-NEXT: movdqa %xmm2, 32(%rcx) +; SSE42-NEXT: movdqa %xmm3, 16(%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_v16i32_to_v4i128_factor4: ; AVX: # %bb.0: @@ -6401,24 +7244,43 @@ define void @vec512_v16i32_to_v4i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.b } define void @vec512_v16i32_to_v2i256_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec512_v16i32_to_v2i256_factor8: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: xorps %xmm1, %xmm1 -; SSE-NEXT: xorps %xmm2, %xmm2 -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[1,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] -; SSE-NEXT: movaps 16(%rdx), %xmm1 -; SSE-NEXT: movaps 48(%rdx), %xmm3 -; SSE-NEXT: paddb 32(%rdx), %xmm0 -; SSE-NEXT: paddb (%rdx), %xmm2 -; SSE-NEXT: movaps %xmm3, 48(%rcx) -; SSE-NEXT: movaps %xmm1, 16(%rcx) -; SSE-NEXT: movdqa %xmm2, (%rcx) -; SSE-NEXT: movdqa %xmm0, 32(%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec512_v16i32_to_v2i256_factor8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[1,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] +; SSE2-NEXT: movaps 16(%rdx), %xmm1 +; SSE2-NEXT: movaps 48(%rdx), %xmm3 +; SSE2-NEXT: paddb 32(%rdx), %xmm0 +; SSE2-NEXT: paddb (%rdx), %xmm2 +; SSE2-NEXT: movaps %xmm3, 48(%rcx) +; SSE2-NEXT: movaps %xmm1, 16(%rcx) +; SSE2-NEXT: movdqa %xmm2, (%rcx) +; SSE2-NEXT: movdqa %xmm0, 32(%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec512_v16i32_to_v2i256_factor8: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: pxor %xmm1, %xmm1 +; SSE42-NEXT: pxor %xmm2, %xmm2 +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; SSE42-NEXT: movaps 16(%rdx), %xmm1 +; SSE42-NEXT: movaps 48(%rdx), %xmm3 +; SSE42-NEXT: paddb 32(%rdx), %xmm0 +; SSE42-NEXT: paddb (%rdx), %xmm2 +; SSE42-NEXT: movaps %xmm3, 48(%rcx) +; SSE42-NEXT: movaps %xmm1, 16(%rcx) +; SSE42-NEXT: movdqa %xmm2, (%rcx) +; SSE42-NEXT: movdqa %xmm0, 32(%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_v16i32_to_v2i256_factor8: ; AVX: # %bb.0: @@ -6535,21 +7397,37 @@ define void @vec512_v16i32_to_v2i256_factor8(ptr %in.vec.base.ptr, ptr %in.vec.b } define void @vec512_v16i32_to_v1i512_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec512_v16i32_to_v1i512_factor16: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: xorps %xmm1, %xmm1 -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 16(%rdx), %xmm0 -; SSE-NEXT: movaps 32(%rdx), %xmm2 -; SSE-NEXT: movaps 48(%rdx), %xmm3 -; SSE-NEXT: paddb (%rdx), %xmm1 -; SSE-NEXT: movaps %xmm2, 32(%rcx) -; SSE-NEXT: movaps %xmm3, 48(%rcx) -; SSE-NEXT: movaps %xmm0, 16(%rcx) -; SSE-NEXT: movdqa %xmm1, (%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec512_v16i32_to_v1i512_factor16: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; SSE2-NEXT: movaps 16(%rdx), %xmm0 +; SSE2-NEXT: movaps 32(%rdx), %xmm2 +; SSE2-NEXT: movaps 48(%rdx), %xmm3 +; SSE2-NEXT: paddb (%rdx), %xmm1 +; SSE2-NEXT: movaps %xmm2, 32(%rcx) +; SSE2-NEXT: movaps %xmm3, 48(%rcx) +; SSE2-NEXT: movaps %xmm0, 16(%rcx) +; SSE2-NEXT: movdqa %xmm1, (%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec512_v16i32_to_v1i512_factor16: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: pxor %xmm1, %xmm1 +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; SSE42-NEXT: movaps 16(%rdx), %xmm0 +; SSE42-NEXT: movaps 32(%rdx), %xmm2 +; SSE42-NEXT: movaps 48(%rdx), %xmm3 +; SSE42-NEXT: paddb (%rdx), %xmm1 +; SSE42-NEXT: movaps %xmm2, 32(%rcx) +; SSE42-NEXT: movaps %xmm3, 48(%rcx) +; SSE42-NEXT: movaps %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_v16i32_to_v1i512_factor16: ; AVX: # %bb.0: @@ -7130,3 +8008,4 @@ define void @vec512_v2i256_to_v1i512_factor2(ptr %in.vec.base.ptr, ptr %in.vec.b ; FALLBACK6: {{.*}} ; FALLBACK7: {{.*}} ; FALLBACK8: {{.*}} +; FALLBACK9: {{.*}} -- 2.7.4