From 778a7df50e416a4b198d4835cff71d4074a75534 Mon Sep 17 00:00:00 2001
From: Roman Lebedev
Date: Thu, 29 Dec 2022 02:09:57 +0300
Subject: [PATCH] [NFC][Codegen][X86] Add exhaustive-ish test coverage for ZERO_EXTEND_VECTOR_INREG

It should be possible to deduplicate AVX2 and AVX512F checklines,
but I'm not sure which combination of check prefixes would do that.

https://godbolt.org/z/sndT9n1nz
---
 llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll | 7571 +++++++++++++++++++++
 1 file changed, 7571 insertions(+)
 create mode 100644 llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll

diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
new file mode 100644
index 0000000..ac5f943
--- /dev/null
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
@@ -0,0 +1,7571 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,FALLBACK0
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1-ONLY,FALLBACK1
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2,AVX2-SLOW,FALLBACK2
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2,AVX2-FAST-PERLANE,FALLBACK3
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2,AVX2-FAST,FALLBACK4
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512F,AVX512F-SLOW,FALLBACK5
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512F,AVX512F-FAST,FALLBACK6
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-SLOW,FALLBACK7
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-FAST,FALLBACK8
+
+define void @vec16_v2i8_to_v1i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
+; SSE-LABEL: vec16_v2i8_to_v1i16_factor2:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqa (%rdi), %xmm0
+; SSE-NEXT: paddb (%rsi), %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT: paddb (%rdx), %xmm0
+; SSE-NEXT: movdqa %xmm0, (%rcx)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: vec16_v2i8_to_v1i16_factor2:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
+; AVX-NEXT: vmovdqa %xmm0, (%rcx)
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: vec16_v2i8_to_v1i16_factor2:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa
%ymm0, (%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec16_v2i8_to_v1i16_factor2: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec16_v2i8_to_v1i16_factor2: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <2 x i32> + %zextd.vec = shufflevector <2 x i8> %in.vec.trunc, <2 x i8> zeroinitializer, <2 x i32> + %out.bytevec.padded = shufflevector <2 x i8> %zextd.vec, <2 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec32_v4i8_to_v2i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec32_v4i8_to_v2i16_factor2: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: paddb (%rdx), %xmm0 +; SSE-NEXT: movdqa %xmm0, (%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec32_v4i8_to_v2i16_factor2: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: retq +; +; AVX2-LABEL: vec32_v4i8_to_v2i16_factor2: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec32_v4i8_to_v2i16_factor2: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec32_v4i8_to_v2i16_factor2: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <4 x i32> + %zextd.vec = shufflevector <4 x i8> %in.vec.trunc, <4 x i8> zeroinitializer, <4 x i32> + %out.bytevec.padded = shufflevector <4 x i8> %zextd.vec, <4 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec32_v4i8_to_v1i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec32_v4i8_to_v1i32_factor4: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: paddb (%rdx), %xmm0 +; SSE-NEXT: movdqa %xmm0, (%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec32_v4i8_to_v1i32_factor4: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: retq +; +; AVX2-LABEL: vec32_v4i8_to_v1i32_factor4: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec32_v4i8_to_v1i32_factor4: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec32_v4i8_to_v1i32_factor4: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <4 x i32> + %zextd.vec = shufflevector <4 x i8> %in.vec.trunc, <4 x i8> zeroinitializer, <4 x i32> + 
%out.bytevec.padded = shufflevector <4 x i8> %zextd.vec, <4 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec32_v2i16_to_v1i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec32_v2i16_to_v1i32_factor2: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: paddb (%rdx), %xmm0 +; SSE-NEXT: movdqa %xmm0, (%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec32_v2i16_to_v1i32_factor2: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: retq +; +; AVX2-LABEL: vec32_v2i16_to_v1i32_factor2: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec32_v2i16_to_v1i32_factor2: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec32_v2i16_to_v1i32_factor2: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <4 x i32> + %in.vec.cast = bitcast <4 x i8> %in.vec.trunc to <2 x i16> + %zextd.vec = shufflevector <2 x i16> %in.vec.cast, <2 x i16> zeroinitializer, <2 x i32> + %out.bytevec = bitcast <2 x i16> %zextd.vec to <4 x i8> + %out.bytevec.padded = shufflevector <4 x i8> %out.bytevec, <4 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec64_v8i8_to_v4i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec64_v8i8_to_v4i16_factor2: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: paddb (%rdx), %xmm0 +; SSE-NEXT: movdqa 
%xmm0, (%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec64_v8i8_to_v4i16_factor2: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: retq +; +; AVX2-LABEL: vec64_v8i8_to_v4i16_factor2: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec64_v8i8_to_v4i16_factor2: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec64_v8i8_to_v4i16_factor2: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <8 x i32> + %zextd.vec = shufflevector <8 x i8> %in.vec.trunc, <8 x i8> zeroinitializer, <8 x i32> + %out.bytevec.padded = shufflevector <8 x i8> %zextd.vec, <8 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec64_v8i8_to_v2i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec64_v8i8_to_v2i32_factor4: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: paddb (%rdx), %xmm0 +; SSE-NEXT: movdqa %xmm0, (%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec64_v8i8_to_v2i32_factor4: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: retq +; +; AVX2-LABEL: vec64_v8i8_to_v2i32_factor4: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec64_v8i8_to_v2i32_factor4: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec64_v8i8_to_v2i32_factor4: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <8 x i32> + %zextd.vec = shufflevector <8 x i8> %in.vec.trunc, <8 x i8> zeroinitializer, <8 x i32> + %out.bytevec.padded = shufflevector <8 x i8> %zextd.vec, <8 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec64_v8i8_to_v1i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec64_v8i8_to_v1i64_factor8: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: paddb (%rdx), %xmm0 +; SSE-NEXT: movdqa %xmm0, (%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec64_v8i8_to_v1i64_factor8: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: retq +; +; AVX2-LABEL: vec64_v8i8_to_v1i64_factor8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec64_v8i8_to_v1i64_factor8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; 
AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec64_v8i8_to_v1i64_factor8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <8 x i32> + %zextd.vec = shufflevector <8 x i8> %in.vec.trunc, <8 x i8> zeroinitializer, <8 x i32> + %out.bytevec.padded = shufflevector <8 x i8> %zextd.vec, <8 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec64_v4i16_to_v2i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec64_v4i16_to_v2i32_factor2: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: paddb (%rdx), %xmm0 +; SSE-NEXT: movdqa %xmm0, (%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec64_v4i16_to_v2i32_factor2: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: retq +; +; AVX2-LABEL: vec64_v4i16_to_v2i32_factor2: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec64_v4i16_to_v2i32_factor2: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec64_v4i16_to_v2i32_factor2: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <8 x i32> + %in.vec.cast = bitcast <8 x i8> %in.vec.trunc to <4 x i16> + %zextd.vec = shufflevector <4 x i16> %in.vec.cast, <4 x i16> zeroinitializer, <4 x i32> + %out.bytevec = bitcast <4 x i16> %zextd.vec to <8 x i8> + %out.bytevec.padded = 
shufflevector <8 x i8> %out.bytevec, <8 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec64_v4i16_to_v1i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec64_v4i16_to_v1i64_factor4: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: paddb (%rdx), %xmm0 +; SSE-NEXT: movdqa %xmm0, (%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec64_v4i16_to_v1i64_factor4: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: retq +; +; AVX2-LABEL: vec64_v4i16_to_v1i64_factor4: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec64_v4i16_to_v1i64_factor4: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec64_v4i16_to_v1i64_factor4: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <8 x i32> + %in.vec.cast = bitcast <8 x i8> %in.vec.trunc to <4 x i16> + %zextd.vec = shufflevector <4 x i16> %in.vec.cast, <4 x i16> zeroinitializer, <4 x i32> + %out.bytevec = bitcast <4 x i16> %zextd.vec to <8 x i8> + %out.bytevec.padded = shufflevector <8 x i8> %out.bytevec, <8 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec64_v2i32_to_v1i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec64_v2i32_to_v1i64_factor2: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: paddb (%rdx), %xmm0 +; SSE-NEXT: movdqa %xmm0, (%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec64_v2i32_to_v1i64_factor2: +; AVX: # %bb.0: +; AVX-NEXT: 
vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: retq +; +; AVX2-LABEL: vec64_v2i32_to_v1i64_factor2: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec64_v2i32_to_v1i64_factor2: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec64_v2i32_to_v1i64_factor2: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <8 x i32> + %in.vec.cast = bitcast <8 x i8> %in.vec.trunc to <2 x i32> + %zextd.vec = shufflevector <2 x i32> %in.vec.cast, <2 x i32> zeroinitializer, <2 x i32> + %out.bytevec = bitcast <2 x i32> %zextd.vec to <8 x i8> + %out.bytevec.padded = shufflevector <8 x i8> %out.bytevec, <8 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec128_v16i8_to_v8i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec128_v16i8_to_v8i16_factor2: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: paddb (%rdx), %xmm0 +; SSE-NEXT: movdqa %xmm0, (%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec128_v16i8_to_v8i16_factor2: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: retq +; +; AVX2-LABEL: vec128_v16i8_to_v8i16_factor2: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec128_v16i8_to_v8i16_factor2: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: 
vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec128_v16i8_to_v8i16_factor2: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> + %zextd.vec = shufflevector <16 x i8> %in.vec.trunc, <16 x i8> zeroinitializer, <16 x i32> + %out.bytevec.padded = shufflevector <16 x i8> %zextd.vec, <16 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec128_v16i8_to_v4i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec128_v16i8_to_v4i32_factor4: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: paddb (%rdx), %xmm0 +; SSE-NEXT: movdqa %xmm0, (%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec128_v16i8_to_v4i32_factor4: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: retq +; +; AVX2-LABEL: vec128_v16i8_to_v4i32_factor4: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec128_v16i8_to_v4i32_factor4: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec128_v16i8_to_v4i32_factor4: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; 
AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> + %zextd.vec = shufflevector <16 x i8> %in.vec.trunc, <16 x i8> zeroinitializer, <16 x i32> + %out.bytevec.padded = shufflevector <16 x i8> %zextd.vec, <16 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec128_v16i8_to_v2i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec128_v16i8_to_v2i64_factor8: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: paddb (%rdx), %xmm0 +; SSE-NEXT: movdqa %xmm0, (%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec128_v16i8_to_v2i64_factor8: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: retq +; +; AVX2-LABEL: vec128_v16i8_to_v2i64_factor8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec128_v16i8_to_v2i64_factor8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec128_v16i8_to_v2i64_factor8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> + %zextd.vec = shufflevector <16 x i8> %in.vec.trunc, <16 x i8> zeroinitializer, <16 x i32> + %out.bytevec.padded = shufflevector <16 x i8> %zextd.vec, <16 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr 
%out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec128_v16i8_to_v1i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec128_v16i8_to_v1i128_factor16: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: paddb (%rdx), %xmm0 +; SSE-NEXT: movdqa %xmm0, (%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec128_v16i8_to_v1i128_factor16: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: retq +; +; AVX2-LABEL: vec128_v16i8_to_v1i128_factor16: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec128_v16i8_to_v1i128_factor16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec128_v16i8_to_v1i128_factor16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> + %zextd.vec = shufflevector <16 x i8> %in.vec.trunc, <16 x i8> zeroinitializer, <16 x i32> + %out.bytevec.padded = shufflevector <16 x i8> %zextd.vec, <16 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec128_v8i16_to_v4i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec128_v8i16_to_v4i32_factor2: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: paddb (%rdx), %xmm0 +; SSE-NEXT: movdqa %xmm0, (%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec128_v8i16_to_v4i32_factor2: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: retq +; +; AVX2-LABEL: vec128_v8i16_to_v4i32_factor2: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; 
AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec128_v8i16_to_v4i32_factor2: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec128_v8i16_to_v4i32_factor2: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> + %in.vec.cast = bitcast <16 x i8> %in.vec.trunc to <8 x i16> + %zextd.vec = shufflevector <8 x i16> %in.vec.cast, <8 x i16> zeroinitializer, <8 x i32> + %out.bytevec = bitcast <8 x i16> %zextd.vec to <16 x i8> + %out.bytevec.padded = shufflevector <16 x i8> %out.bytevec, <16 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec128_v8i16_to_v2i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec128_v8i16_to_v2i64_factor4: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: paddb (%rdx), %xmm0 +; SSE-NEXT: movdqa %xmm0, (%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec128_v8i16_to_v2i64_factor4: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: retq +; +; AVX2-LABEL: vec128_v8i16_to_v2i64_factor4: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec128_v8i16_to_v2i64_factor4: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec128_v8i16_to_v2i64_factor4: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: 
vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> + %in.vec.cast = bitcast <16 x i8> %in.vec.trunc to <8 x i16> + %zextd.vec = shufflevector <8 x i16> %in.vec.cast, <8 x i16> zeroinitializer, <8 x i32> + %out.bytevec = bitcast <8 x i16> %zextd.vec to <16 x i8> + %out.bytevec.padded = shufflevector <16 x i8> %out.bytevec, <16 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec128_v8i16_to_v1i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec128_v8i16_to_v1i128_factor8: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: paddb (%rdx), %xmm0 +; SSE-NEXT: movdqa %xmm0, (%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec128_v8i16_to_v1i128_factor8: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: retq +; +; AVX2-LABEL: vec128_v8i16_to_v1i128_factor8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec128_v8i16_to_v1i128_factor8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec128_v8i16_to_v1i128_factor8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> + %in.vec.cast = bitcast <16 x i8> %in.vec.trunc to <8 x i16> + %zextd.vec = shufflevector <8 x i16> %in.vec.cast, <8 x i16> zeroinitializer, <8 x i32> + %out.bytevec = bitcast <8 x i16> %zextd.vec to <16 x i8> + %out.bytevec.padded = shufflevector <16 x i8> %out.bytevec, <16 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + 
%out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec128_v4i32_to_v2i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec128_v4i32_to_v2i64_factor2: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: paddb (%rdx), %xmm0 +; SSE-NEXT: movdqa %xmm0, (%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec128_v4i32_to_v2i64_factor2: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: retq +; +; AVX2-LABEL: vec128_v4i32_to_v2i64_factor2: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec128_v4i32_to_v2i64_factor2: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec128_v4i32_to_v2i64_factor2: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> + %in.vec.cast = bitcast <16 x i8> %in.vec.trunc to <4 x i32> + %zextd.vec = shufflevector <4 x i32> %in.vec.cast, <4 x i32> zeroinitializer, <4 x i32> + %out.bytevec = bitcast <4 x i32> %zextd.vec to <16 x i8> + %out.bytevec.padded = shufflevector <16 x i8> %out.bytevec, <16 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec128_v4i32_to_v1i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec128_v4i32_to_v1i128_factor4: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; SSE-NEXT: paddb (%rdx), %xmm1 +; SSE-NEXT: movdqa %xmm1, (%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec128_v4i32_to_v1i128_factor4: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: retq +; +; AVX2-LABEL: vec128_v4i32_to_v1i128_factor4: +; 
AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec128_v4i32_to_v1i128_factor4: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec128_v4i32_to_v1i128_factor4: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> + %in.vec.cast = bitcast <16 x i8> %in.vec.trunc to <4 x i32> + %zextd.vec = shufflevector <4 x i32> %in.vec.cast, <4 x i32> zeroinitializer, <4 x i32> + %out.bytevec = bitcast <4 x i32> %zextd.vec to <16 x i8> + %out.bytevec.padded = shufflevector <16 x i8> %out.bytevec, <16 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec128_v2i64_to_v1i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec128_v2i64_to_v1i128_factor2: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero +; SSE-NEXT: paddb (%rdx), %xmm0 +; SSE-NEXT: movdqa %xmm0, (%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec128_v2i64_to_v1i128_factor2: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: retq +; +; AVX2-LABEL: vec128_v2i64_to_v1i128_factor2: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec128_v2i64_to_v1i128_factor2: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec128_v2i64_to_v1i128_factor2: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, 
(%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <16 x i32> + %in.vec.cast = bitcast <16 x i8> %in.vec.trunc to <2 x i64> + %zextd.vec = shufflevector <2 x i64> %in.vec.cast, <2 x i64> zeroinitializer, <2 x i32> + %out.bytevec = bitcast <2 x i64> %zextd.vec to <16 x i8> + %out.bytevec.padded = shufflevector <16 x i8> %out.bytevec, <16 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec256_v32i8_to_v16i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec256_v32i8_to_v16i16_factor2: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE-NEXT: paddb 16(%rdx), %xmm0 +; SSE-NEXT: paddb (%rdx), %xmm2 +; SSE-NEXT: movdqa %xmm2, (%rcx) +; SSE-NEXT: movdqa %xmm0, 16(%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec256_v32i8_to_v16i16_factor2: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, (%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: retq +; +; AVX2-LABEL: vec256_v32i8_to_v16i16_factor2: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec256_v32i8_to_v16i16_factor2: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec256_v32i8_to_v16i16_factor2: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpaddb 
(%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> + %zextd.vec = shufflevector <32 x i8> %in.vec.trunc, <32 x i8> zeroinitializer, <32 x i32> + %out.bytevec.padded = shufflevector <32 x i8> %zextd.vec, <32 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec256_v32i8_to_v8i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec256_v32i8_to_v8i32_factor4: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: paddb 16(%rdx), %xmm0 +; SSE-NEXT: paddb (%rdx), %xmm2 +; SSE-NEXT: movdqa %xmm2, (%rcx) +; SSE-NEXT: movdqa %xmm0, 16(%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec256_v32i8_to_v8i32_factor4: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, (%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: retq +; +; AVX2-LABEL: vec256_v32i8_to_v8i32_factor4: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec256_v32i8_to_v8i32_factor4: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: 
retq +; +; AVX512BW-LABEL: vec256_v32i8_to_v8i32_factor4: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> + %zextd.vec = shufflevector <32 x i8> %in.vec.trunc, <32 x i8> zeroinitializer, <32 x i32> + %out.bytevec.padded = shufflevector <32 x i8> %zextd.vec, <32 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec256_v32i8_to_v4i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec256_v32i8_to_v4i64_factor8: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: paddb 16(%rdx), %xmm0 +; SSE-NEXT: paddb (%rdx), %xmm2 +; SSE-NEXT: movdqa %xmm2, (%rcx) +; SSE-NEXT: movdqa %xmm0, 16(%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec256_v32i8_to_v4i64_factor8: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, (%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: retq +; +; AVX2-LABEL: vec256_v32i8_to_v4i64_factor8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec256_v32i8_to_v4i64_factor8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpaddb 
(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec256_v32i8_to_v4i64_factor8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> + %zextd.vec = shufflevector <32 x i8> %in.vec.trunc, <32 x i8> zeroinitializer, <32 x i32> + %out.bytevec.padded = shufflevector <32 x i8> %zextd.vec, <32 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec256_v32i8_to_v2i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec256_v32i8_to_v2i128_factor16: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: paddb 16(%rdx), %xmm0 +; SSE-NEXT: paddb (%rdx), %xmm1 +; SSE-NEXT: movdqa %xmm1, (%rcx) +; SSE-NEXT: movdqa %xmm0, 16(%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec256_v32i8_to_v2i128_factor16: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] +; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, (%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: retq +; +; AVX2-LABEL: vec256_v32i8_to_v2i128_factor16: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec256_v32i8_to_v2i128_factor16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; 
AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec256_v32i8_to_v2i128_factor16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> + %zextd.vec = shufflevector <32 x i8> %in.vec.trunc, <32 x i8> zeroinitializer, <32 x i32> + %out.bytevec.padded = shufflevector <32 x i8> %zextd.vec, <32 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec256_v32i8_to_v1i256_factor32(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec256_v32i8_to_v1i256_factor32: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: movaps 16(%rdx), %xmm1 +; SSE-NEXT: paddb (%rdx), %xmm0 +; SSE-NEXT: movaps %xmm1, 16(%rcx) +; SSE-NEXT: movdqa %xmm0, (%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec256_v32i8_to_v1i256_factor32: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovaps 16(%rdx), %xmm1 +; AVX-NEXT: vmovaps %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: retq +; +; AVX2-LABEL: vec256_v32i8_to_v1i256_factor32: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec256_v32i8_to_v1i256_factor32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] +; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec256_v32i8_to_v1i256_factor32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] +; AVX512BW-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector 
<64 x i8> %in.vec, <64 x i8> poison, <32 x i32> + %zextd.vec = shufflevector <32 x i8> %in.vec.trunc, <32 x i8> zeroinitializer, <32 x i32> + %out.bytevec.padded = shufflevector <32 x i8> %zextd.vec, <32 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec256_v16i16_to_v8i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec256_v16i16_to_v8i32_factor2: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: paddb 16(%rdx), %xmm0 +; SSE-NEXT: paddb (%rdx), %xmm2 +; SSE-NEXT: movdqa %xmm2, (%rcx) +; SSE-NEXT: movdqa %xmm0, 16(%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec256_v16i16_to_v8i32_factor2: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, (%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: retq +; +; AVX2-LABEL: vec256_v16i16_to_v8i32_factor2: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec256_v16i16_to_v8i32_factor2: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec256_v16i16_to_v8i32_factor2: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> + %in.vec.cast = bitcast <32 x i8> %in.vec.trunc to <16 x i16> + %zextd.vec = shufflevector <16 x i16> %in.vec.cast, <16 x i16> zeroinitializer, <16 x i32> + %out.bytevec = bitcast <16 x i16> %zextd.vec to <32 x i8> + %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> + 
%out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec256_v16i16_to_v4i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec256_v16i16_to_v4i64_factor4: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: paddb 16(%rdx), %xmm0 +; SSE-NEXT: paddb (%rdx), %xmm2 +; SSE-NEXT: movdqa %xmm2, (%rcx) +; SSE-NEXT: movdqa %xmm0, 16(%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec256_v16i16_to_v4i64_factor4: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, (%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: retq +; +; AVX2-LABEL: vec256_v16i16_to_v4i64_factor4: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec256_v16i16_to_v4i64_factor4: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec256_v16i16_to_v4i64_factor4: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> + %in.vec.cast = bitcast <32 x i8> %in.vec.trunc to <16 x i16> + %zextd.vec = shufflevector <16 x i16> %in.vec.cast, <16 x i16> zeroinitializer, <16 x i32> + %out.bytevec = bitcast <16 x i16> %zextd.vec to <32 x i8> + %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec256_v16i16_to_v2i128_factor8(ptr 
%in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec256_v16i16_to_v2i128_factor8: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: paddb 16(%rdx), %xmm0 +; SSE-NEXT: paddb (%rdx), %xmm1 +; SSE-NEXT: movdqa %xmm1, (%rcx) +; SSE-NEXT: movdqa %xmm0, 16(%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec256_v16i16_to_v2i128_factor8: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, (%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: retq +; +; AVX2-LABEL: vec256_v16i16_to_v2i128_factor8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec256_v16i16_to_v2i128_factor8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec256_v16i16_to_v2i128_factor8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,17,9,10,11,12,13,14,15] +; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512BW-NEXT: vpermt2w %ymm0, %ymm1, %ymm2 +; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> + %in.vec.cast = bitcast <32 x i8> %in.vec.trunc to <16 x i16> + %zextd.vec = shufflevector <16 x i16> %in.vec.cast, <16 x i16> zeroinitializer, <16 x i32> + %out.bytevec = bitcast <16 x i16> %zextd.vec to <32 x i8> + %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, 
ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec256_v16i16_to_v1i256_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec256_v16i16_to_v1i256_factor16: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: movaps 16(%rdx), %xmm1 +; SSE-NEXT: paddb (%rdx), %xmm0 +; SSE-NEXT: movaps %xmm1, 16(%rcx) +; SSE-NEXT: movdqa %xmm0, (%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec256_v16i16_to_v1i256_factor16: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovaps 16(%rdx), %xmm1 +; AVX-NEXT: vmovaps %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: retq +; +; AVX2-LABEL: vec256_v16i16_to_v1i256_factor16: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec256_v16i16_to_v1i256_factor16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec256_v16i16_to_v1i256_factor16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX512BW-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> + %in.vec.cast = bitcast <32 x i8> %in.vec.trunc to <16 x i16> + %zextd.vec = shufflevector <16 x i16> %in.vec.cast, <16 x i16> zeroinitializer, <16 x i32> + %out.bytevec = bitcast <16 x i16> %zextd.vec to <32 x i8> + %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec256_v8i32_to_v4i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec256_v8i32_to_v4i64_factor2: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: paddb 16(%rdx), %xmm0 +; SSE-NEXT: paddb (%rdx), %xmm2 +; SSE-NEXT: movdqa %xmm2, (%rcx) +; SSE-NEXT: movdqa %xmm0, 16(%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec256_v8i32_to_v4i64_factor2: +; 
AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, (%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: retq +; +; AVX2-LABEL: vec256_v8i32_to_v4i64_factor2: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec256_v8i32_to_v4i64_factor2: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec256_v8i32_to_v4i64_factor2: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> + %in.vec.cast = bitcast <32 x i8> %in.vec.trunc to <8 x i32> + %zextd.vec = shufflevector <8 x i32> %in.vec.cast, <8 x i32> zeroinitializer, <8 x i32> + %out.bytevec = bitcast <8 x i32> %zextd.vec to <32 x i8> + %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec256_v8i32_to_v2i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec256_v8i32_to_v2i128_factor4: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: xorps %xmm2, %xmm2 +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[1,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] +; SSE-NEXT: paddb 16(%rdx), %xmm0 +; SSE-NEXT: paddb (%rdx), %xmm2 +; SSE-NEXT: movdqa %xmm2, (%rcx) +; SSE-NEXT: movdqa %xmm0, 16(%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec256_v8i32_to_v2i128_factor4: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: vmovdqa %xmm1, 
16(%rcx) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-SLOW-LABEL: vec256_v8i32_to_v2i128_factor4: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: vec256_v8i32_to_v2i128_factor4: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq +; +; AVX2-FAST-LABEL: vec256_v8i32_to_v2i128_factor4: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,u,u,u,1,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512F-LABEL: vec256_v8i32_to_v2i128_factor4: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: movb $17, %al +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vpexpandd %ymm0, %ymm0 {%k1} {z} +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec256_v8i32_to_v2i128_factor4: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-NEXT: movb $17, %al +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vpexpandd %ymm0, %ymm0 {%k1} {z} +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> + %in.vec.cast = bitcast <32 x i8> %in.vec.trunc to <8 x i32> + %zextd.vec = shufflevector <8 x i32> %in.vec.cast, <8 x i32> zeroinitializer, <8 x i32> + %out.bytevec = bitcast <8 x i32> %zextd.vec to <32 x i8> + %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec256_v8i32_to_v1i256_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; 
SSE-LABEL: vec256_v8i32_to_v1i256_factor8: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; SSE-NEXT: movaps 16(%rdx), %xmm0 +; SSE-NEXT: paddb (%rdx), %xmm1 +; SSE-NEXT: movaps %xmm0, 16(%rcx) +; SSE-NEXT: movdqa %xmm1, (%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec256_v8i32_to_v1i256_factor8: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovaps 16(%rdx), %xmm1 +; AVX-NEXT: vmovaps %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: retq +; +; AVX2-LABEL: vec256_v8i32_to_v1i256_factor8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec256_v8i32_to_v1i256_factor8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec256_v8i32_to_v1i256_factor8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> + %in.vec.cast = bitcast <32 x i8> %in.vec.trunc to <8 x i32> + %zextd.vec = shufflevector <8 x i32> %in.vec.cast, <8 x i32> zeroinitializer, <8 x i32> + %out.bytevec = bitcast <8 x i32> %zextd.vec to <32 x i8> + %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec256_v4i64_to_v2i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec256_v4i64_to_v2i128_factor2: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: movq {{.*#+}} xmm1 = xmm0[0],zero +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: paddb 16(%rdx), %xmm0 +; SSE-NEXT: paddb (%rdx), %xmm1 +; SSE-NEXT: movdqa %xmm1, (%rcx) +; SSE-NEXT: movdqa %xmm0, 16(%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec256_v4i64_to_v2i128_factor2: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 
+; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[3],ymm1[3] +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: vec256_v4i64_to_v2i128_factor2: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec256_v4i64_to_v2i128_factor2: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: movb $5, %al +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vpexpandq %ymm0, %ymm0 {%k1} {z} +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec256_v4i64_to_v2i128_factor2: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-NEXT: movb $5, %al +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vpexpandq %ymm0, %ymm0 {%k1} {z} +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> + %in.vec.cast = bitcast <32 x i8> %in.vec.trunc to <4 x i64> + %zextd.vec = shufflevector <4 x i64> %in.vec.cast, <4 x i64> zeroinitializer, <4 x i32> + %out.bytevec = bitcast <4 x i64> %zextd.vec to <32 x i8> + %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec256_v4i64_to_v1i256_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec256_v4i64_to_v1i256_factor4: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero +; SSE-NEXT: movaps 16(%rdx), %xmm1 +; SSE-NEXT: paddb (%rdx), %xmm0 +; SSE-NEXT: movaps %xmm1, 16(%rcx) +; SSE-NEXT: movdqa %xmm0, (%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec256_v4i64_to_v1i256_factor4: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovaps 16(%rdx), %xmm1 +; AVX-NEXT: vmovaps %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: retq +; +; AVX2-LABEL: vec256_v4i64_to_v1i256_factor4: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec256_v4i64_to_v1i256_factor4: +; AVX512F: # 
%bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec256_v4i64_to_v1i256_factor4: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> + %in.vec.cast = bitcast <32 x i8> %in.vec.trunc to <4 x i64> + %zextd.vec = shufflevector <4 x i64> %in.vec.cast, <4 x i64> zeroinitializer, <4 x i32> + %out.bytevec = bitcast <4 x i64> %zextd.vec to <32 x i8> + %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec256_v2i128_to_v1i256_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec256_v2i128_to_v1i256_factor2: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: movaps 16(%rdx), %xmm1 +; SSE-NEXT: paddb (%rdx), %xmm0 +; SSE-NEXT: movaps %xmm1, 16(%rcx) +; SSE-NEXT: movdqa %xmm0, (%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec256_v2i128_to_v1i256_factor2: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovaps 16(%rdx), %xmm1 +; AVX-NEXT: vmovaps %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: retq +; +; AVX2-LABEL: vec256_v2i128_to_v1i256_factor2: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec256_v2i128_to_v1i256_factor2: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec256_v2i128_to_v1i256_factor2: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <32 x i32> + %in.vec.cast = bitcast <32 x i8> %in.vec.trunc to <2 x i128> + %zextd.vec = shufflevector <2 x i128> %in.vec.cast, <2 x i128> zeroinitializer, <2 x i32> + %out.bytevec = bitcast <2 x i128> %zextd.vec to <32 x i8> + %out.bytevec.padded = shufflevector <32 x i8> %out.bytevec, <32 x i8> 
poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec384_v48i8_to_v24i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec384_v48i8_to_v24i16_factor2: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa 16(%rdi), %xmm1 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: paddb 16(%rsi), %xmm1 +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: paddb 16(%rdx), %xmm0 +; SSE-NEXT: paddb (%rdx), %xmm3 +; SSE-NEXT: paddb 32(%rdx), %xmm1 +; SSE-NEXT: movdqa %xmm1, 32(%rcx) +; SSE-NEXT: movdqa %xmm3, (%rcx) +; SSE-NEXT: movdqa %xmm0, 16(%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec384_v48i8_to_v24i16_factor2: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, (%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) +; AVX-NEXT: retq +; +; AVX2-LABEL: vec384_v48i8_to_v24i16_factor2: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec384_v48i8_to_v24i16_factor2: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; 
AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec384_v48i8_to_v24i16_factor2: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> + %zextd.vec = shufflevector <48 x i8> %in.vec.trunc, <48 x i8> zeroinitializer, <48 x i32> + %out.bytevec.padded = shufflevector <48 x i8> %zextd.vec, <48 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec384_v48i8_to_v16i24_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec384_v48i8_to_v16i24_factor3: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,5] +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,1,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,6] +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: paddb (%rdx), %xmm0 +; SSE-NEXT: paddb 32(%rdx), %xmm2 +; SSE-NEXT: paddb 16(%rdx), %xmm1 +; SSE-NEXT: movdqa %xmm1, 16(%rcx) +; SSE-NEXT: movdqa %xmm2, 32(%rcx) +; SSE-NEXT: movdqa %xmm0, (%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec384_v48i8_to_v16i24_factor3: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0],zero,zero,xmm0[1],zero,zero,xmm0[2],zero,zero,xmm0[3],zero,zero,xmm0[4],zero,zero,xmm0[5] +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[6],zero,zero,xmm0[7],zero,zero,xmm0[8],zero,zero,xmm0[9],zero,zero,xmm0[10],zero +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[11],zero,zero,xmm0[12],zero,zero,xmm0[13],zero,zero,xmm0[14],zero,zero,xmm0[15],zero,zero +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; 
AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, (%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: retq +; +; AVX2-LABEL: vec384_v48i8_to_v16i24_factor3: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0],zero,zero,ymm1[1],zero,zero,ymm1[2],zero,zero,ymm1[3],zero,zero,ymm1[4],zero,zero,ymm1[5],zero,zero,ymm1[22],zero,zero,ymm1[23],zero,zero,ymm1[24],zero,zero,ymm1[25],zero,zero,ymm1[26],zero +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[11],zero,zero,xmm0[12],zero,zero,xmm0[13],zero,zero,xmm0[14],zero,zero,xmm0[15],zero,zero +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec384_v48i8_to_v16i24_factor3: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,1] +; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0],zero,zero,ymm1[1],zero,zero,ymm1[2],zero,zero,ymm1[3],zero,zero,ymm1[4],zero,zero,ymm1[5],zero,zero,ymm1[22],zero,zero,ymm1[23],zero,zero,ymm1[24],zero,zero,ymm1[25],zero,zero,ymm1[26],zero +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[11],zero,zero,xmm0[12],zero,zero,xmm0[13],zero,zero,xmm0[14],zero,zero,xmm0[15],zero,zero +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec384_v48i8_to_v16i24_factor3: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm0[11],zero,zero,xmm0[12],zero,zero,xmm0[13],zero,zero,xmm0[14],zero,zero,xmm0[15],zero,zero +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,u,1,1,u,2,2,u,3,3,u,4,4,u,5> +; AVX512BW-NEXT: vpermw %ymm0, %ymm2, %ymm0 +; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> + %zextd.vec = shufflevector <48 x i8> %in.vec.trunc, <48 x i8> zeroinitializer, <48 x i32> + %out.bytevec.padded = shufflevector <48 x i8> %zextd.vec, <48 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec384_v48i8_to_v12i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec384_v48i8_to_v12i32_factor4: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = 
xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: paddb 16(%rdx), %xmm0 +; SSE-NEXT: paddb (%rdx), %xmm3 +; SSE-NEXT: paddb 32(%rdx), %xmm2 +; SSE-NEXT: movdqa %xmm2, 32(%rcx) +; SSE-NEXT: movdqa %xmm3, (%rcx) +; SSE-NEXT: movdqa %xmm0, 16(%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec384_v48i8_to_v12i32_factor4: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, (%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: retq +; +; AVX2-SLOW-LABEL: vec384_v48i8_to_v12i32_factor4: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: vec384_v48i8_to_v12i32_factor4: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq +; +; AVX2-FAST-LABEL: vec384_v48i8_to_v12i32_factor4: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: 
vmovdqa (%rdi), %xmm0 +; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero +; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512F-LABEL: vec384_v48i8_to_v12i32_factor4: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec384_v48i8_to_v12i32_factor4: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> + %zextd.vec = shufflevector <48 x i8> %in.vec.trunc, <48 x i8> zeroinitializer, <48 x i32> + %out.bytevec.padded = shufflevector <48 x i8> %zextd.vec, <48 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec384_v48i8_to_v8i48_factor6(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec384_v48i8_to_v8i48_factor6: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: paddb 16(%rdx), %xmm0 +; SSE-NEXT: paddb (%rdx), %xmm2 +; SSE-NEXT: paddb 32(%rdx), %xmm1 +; SSE-NEXT: 
movdqa %xmm1, 32(%rcx) +; SSE-NEXT: movdqa %xmm2, (%rcx) +; SSE-NEXT: movdqa %xmm0, 16(%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec384_v48i8_to_v8i48_factor6: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,xmm0[5],zero +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, (%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: retq +; +; AVX2-LABEL: vec384_v48i8_to_v8i48_factor6: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,u,3,u,2,u,1,u,4,u,5,u,6,u,5,u] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec384_v48i8_to_v8i48_factor6: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,u,3,u,2,u,1,u,4,u,5,u,6,u,5,u] +; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec384_v48i8_to_v8i48_factor6: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <0,u,u,1,u,u,2,u,u,3,u,u,4,u,u,5> +; AVX512BW-NEXT: vpermw %ymm0, %ymm2, %ymm0 +; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> + %zextd.vec = shufflevector <48 x i8> %in.vec.trunc, <48 x i8> zeroinitializer, <48 x i32> + %out.bytevec.padded = shufflevector <48 x i8> %zextd.vec, <48 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr 
%out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec384_v48i8_to_v6i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec384_v48i8_to_v6i64_factor8: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: paddb 16(%rdx), %xmm0 +; SSE-NEXT: paddb (%rdx), %xmm3 +; SSE-NEXT: paddb 32(%rdx), %xmm2 +; SSE-NEXT: movdqa %xmm2, 32(%rcx) +; SSE-NEXT: movdqa %xmm3, (%rcx) +; SSE-NEXT: movdqa %xmm0, 16(%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec384_v48i8_to_v6i64_factor8: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpsrld $16, %xmm0, %xmm2 +; AVX-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, (%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: retq +; +; AVX2-SLOW-LABEL: vec384_v48i8_to_v6i64_factor8: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX2-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: vec384_v48i8_to_v6i64_factor8: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq +; +; AVX2-FAST-LABEL: vec384_v48i8_to_v6i64_factor8: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512F-LABEL: vec384_v48i8_to_v6i64_factor8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec384_v48i8_to_v6i64_factor8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> + %zextd.vec = shufflevector <48 x i8> %in.vec.trunc, <48 x i8> zeroinitializer, <48 x i32> + %out.bytevec.padded = shufflevector <48 x i8> %zextd.vec, <48 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec384_v48i8_to_v4i96_factor12(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec384_v48i8_to_v4i96_factor12: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] +; SSE-NEXT: psrldq {{.*#+}} xmm1 = 
xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2] +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: paddb (%rdx), %xmm0 +; SSE-NEXT: paddb 16(%rdx), %xmm2 +; SSE-NEXT: paddb 32(%rdx), %xmm1 +; SSE-NEXT: movdqa %xmm1, 32(%rcx) +; SSE-NEXT: movdqa %xmm2, 16(%rcx) +; SSE-NEXT: movdqa %xmm0, (%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec384_v48i8_to_v4i96_factor12: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, (%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: retq +; +; AVX2-LABEL: vec384_v48i8_to_v4i96_factor12: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec384_v48i8_to_v4i96_factor12: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec384_v48i8_to_v4i96_factor12: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: 
vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> + %zextd.vec = shufflevector <48 x i8> %in.vec.trunc, <48 x i8> zeroinitializer, <48 x i32> + %out.bytevec.padded = shufflevector <48 x i8> %zextd.vec, <48 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec384_v48i8_to_v3i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec384_v48i8_to_v3i128_factor16: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2] +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: paddb 16(%rdx), %xmm0 +; SSE-NEXT: paddb 32(%rdx), %xmm2 +; SSE-NEXT: paddb (%rdx), %xmm1 +; SSE-NEXT: movdqa %xmm1, (%rcx) +; SSE-NEXT: movdqa %xmm2, 32(%rcx) +; SSE-NEXT: movdqa %xmm0, 16(%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec384_v48i8_to_v3i128_factor16: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] +; AVX-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2] +; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, (%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: retq +; +; AVX2-SLOW-LABEL: vec384_v48i8_to_v3i128_factor16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2] +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm0 = 
xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: vec384_v48i8_to_v3i128_factor16: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq +; +; AVX2-FAST-LABEL: vec384_v48i8_to_v3i128_factor16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512F-LABEL: vec384_v48i8_to_v3i128_factor16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec384_v48i8_to_v3i128_factor16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 
%zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> + %zextd.vec = shufflevector <48 x i8> %in.vec.trunc, <48 x i8> zeroinitializer, <48 x i32> + %out.bytevec.padded = shufflevector <48 x i8> %zextd.vec, <48 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec384_v48i8_to_v2i192_factor24(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec384_v48i8_to_v2i192_factor24: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] +; SSE-NEXT: movaps 32(%rdx), %xmm2 +; SSE-NEXT: paddb 16(%rdx), %xmm0 +; SSE-NEXT: paddb (%rdx), %xmm1 +; SSE-NEXT: movaps %xmm2, 32(%rcx) +; SSE-NEXT: movdqa %xmm1, (%rcx) +; SSE-NEXT: movdqa %xmm0, 16(%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec384_v48i8_to_v2i192_factor24: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vmovaps 32(%rdx), %ymm2 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: vec384_v48i8_to_v2i192_factor24: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec384_v48i8_to_v2i192_factor24: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] +; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec384_v48i8_to_v2i192_factor24: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; 
AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] +; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> + %zextd.vec = shufflevector <48 x i8> %in.vec.trunc, <48 x i8> zeroinitializer, <48 x i32> + %out.bytevec.padded = shufflevector <48 x i8> %zextd.vec, <48 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec384_v48i8_to_v1i384_factor48(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec384_v48i8_to_v1i384_factor48: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: movaps 16(%rdx), %xmm1 +; SSE-NEXT: movaps 32(%rdx), %xmm2 +; SSE-NEXT: paddb (%rdx), %xmm0 +; SSE-NEXT: movaps %xmm1, 16(%rcx) +; SSE-NEXT: movaps %xmm2, 32(%rcx) +; SSE-NEXT: movdqa %xmm0, (%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec384_v48i8_to_v1i384_factor48: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovaps 16(%rdx), %xmm2 +; AVX-NEXT: vmovaps %xmm2, 16(%rcx) +; AVX-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: vec384_v48i8_to_v1i384_factor48: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec384_v48i8_to_v1i384_factor48: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] +; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec384_v48i8_to_v1i384_factor48: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] +; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + 
%in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> + %zextd.vec = shufflevector <48 x i8> %in.vec.trunc, <48 x i8> zeroinitializer, <48 x i32> + %out.bytevec.padded = shufflevector <48 x i8> %zextd.vec, <48 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec384_v24i16_to_v12i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec384_v24i16_to_v12i32_factor2: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa 16(%rdi), %xmm1 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: paddb 16(%rsi), %xmm1 +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: paddb 16(%rdx), %xmm0 +; SSE-NEXT: paddb (%rdx), %xmm3 +; SSE-NEXT: paddb 32(%rdx), %xmm1 +; SSE-NEXT: movdqa %xmm1, 32(%rcx) +; SSE-NEXT: movdqa %xmm3, (%rcx) +; SSE-NEXT: movdqa %xmm0, 16(%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec384_v24i16_to_v12i32_factor2: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, (%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) +; AVX-NEXT: retq +; +; AVX2-LABEL: vec384_v24i16_to_v12i32_factor2: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec384_v24i16_to_v12i32_factor2: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: 
retq +; +; AVX512BW-LABEL: vec384_v24i16_to_v12i32_factor2: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> + %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <24 x i16> + %zextd.vec = shufflevector <24 x i16> %in.vec.cast, <24 x i16> zeroinitializer, <24 x i32> + %out.bytevec = bitcast <24 x i16> %zextd.vec to <48 x i8> + %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec384_v24i16_to_v8i48_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec384_v24i16_to_v8i48_factor3: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: paddb 16(%rdx), %xmm0 +; SSE-NEXT: paddb (%rdx), %xmm2 +; SSE-NEXT: paddb 32(%rdx), %xmm1 +; SSE-NEXT: movdqa %xmm1, 32(%rcx) +; SSE-NEXT: movdqa %xmm2, (%rcx) +; SSE-NEXT: movdqa %xmm0, 16(%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec384_v24i16_to_v8i48_factor3: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,2] +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6],xmm3[7] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm3, %xmm2 +; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, (%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: retq +; +; AVX2-SLOW-LABEL: vec384_v24i16_to_v8i48_factor3: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,u,1,1,u,2,2> +; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: 
vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] +; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: vec384_v24i16_to_v8i48_factor3: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,u,1,1,u,2,2> +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[12,13],zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq +; +; AVX2-FAST-LABEL: vec384_v24i16_to_v8i48_factor3: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,u,1,1,u,2,2> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[12,13],zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero +; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512F-SLOW-LABEL: vec384_v24i16_to_v8i48_factor3: +; AVX512F-SLOW: # %bb.0: +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,u,1,1,u,2,2> +; AVX512F-SLOW-NEXT: vpermd %ymm0, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] +; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-SLOW-NEXT: vzeroupper +; AVX512F-SLOW-NEXT: retq +; +; AVX512F-FAST-LABEL: vec384_v24i16_to_v8i48_factor3: +; AVX512F-FAST: # %bb.0: +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,u,1,1,u,2,2> +; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[12,13],zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero +; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-FAST-NEXT: vzeroupper +; AVX512F-FAST-NEXT: retq +; +; AVX512BW-SLOW-LABEL: vec384_v24i16_to_v8i48_factor3: +; AVX512BW-SLOW: # %bb.0: +; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-SLOW-NEXT: vpaddb 
(%rsi), %ymm0, %ymm0 +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [16,1,2,17,4,5,18,7,8,19,10,11,20,13,14,21] +; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512BW-SLOW-NEXT: vpermt2w %ymm0, %ymm1, %ymm2 +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512BW-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-SLOW-NEXT: vzeroupper +; AVX512BW-SLOW-NEXT: retq +; +; AVX512BW-FAST-LABEL: vec384_v24i16_to_v8i48_factor3: +; AVX512BW-FAST: # %bb.0: +; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [16,1,2,17,4,5,18,7,8,19,10,11,20,13,14,21] +; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512BW-FAST-NEXT: vpermt2w %ymm0, %ymm1, %ymm2 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[12,13],zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-FAST-NEXT: vzeroupper +; AVX512BW-FAST-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> + %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <24 x i16> + %zextd.vec = shufflevector <24 x i16> %in.vec.cast, <24 x i16> zeroinitializer, <24 x i32> + %out.bytevec = bitcast <24 x i16> %zextd.vec to <48 x i8> + %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec384_v24i16_to_v6i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec384_v24i16_to_v6i64_factor4: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: paddb 16(%rdx), %xmm0 +; SSE-NEXT: paddb (%rdx), %xmm3 +; SSE-NEXT: paddb 32(%rdx), %xmm2 +; SSE-NEXT: movdqa %xmm2, 32(%rcx) +; SSE-NEXT: movdqa %xmm3, (%rcx) +; SSE-NEXT: movdqa %xmm0, 16(%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec384_v24i16_to_v6i64_factor4: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = 
xmm0[2,3,2,3] +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, (%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: retq +; +; AVX2-SLOW-LABEL: vec384_v24i16_to_v6i64_factor4: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: vec384_v24i16_to_v6i64_factor4: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9],zero,zero,zero,zero,zero,zero,xmm0[10,11],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq +; +; AVX2-FAST-LABEL: vec384_v24i16_to_v6i64_factor4: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9],zero,zero,zero,zero,zero,zero,xmm0[10,11],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512F-LABEL: vec384_v24i16_to_v6i64_factor4: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9],zero,zero,zero,zero,zero,zero,xmm0[10,11],zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec384_v24i16_to_v6i64_factor4: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9],zero,zero,zero,zero,zero,zero,xmm0[10,11],zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX512BW-NEXT: vinserti64x4 $1, 
%ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> + %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <24 x i16> + %zextd.vec = shufflevector <24 x i16> %in.vec.cast, <24 x i16> zeroinitializer, <24 x i32> + %out.bytevec = bitcast <24 x i16> %zextd.vec to <48 x i8> + %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec384_v24i16_to_v4i96_factor6(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec384_v24i16_to_v4i96_factor6: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] +; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] +; SSE-NEXT: paddb 16(%rdx), %xmm0 +; SSE-NEXT: paddb (%rdx), %xmm2 +; SSE-NEXT: paddb 32(%rdx), %xmm1 +; SSE-NEXT: movdqa %xmm1, 32(%rcx) +; SSE-NEXT: movdqa %xmm2, (%rcx) +; SSE-NEXT: movdqa %xmm0, 16(%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec384_v24i16_to_v4i96_factor6: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[2,3],zero,zero +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero +; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6,7] +; AVX-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3,4,5,6,7] +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, (%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: retq +; +; AVX2-SLOW-LABEL: vec384_v24i16_to_v4i96_factor6: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX2-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpaddb (%rdx), 
%ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: vec384_v24i16_to_v4i96_factor6: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm0[6,7],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq +; +; AVX2-FAST-LABEL: vec384_v24i16_to_v4i96_factor6: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm0[6,7],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX2-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512F-SLOW-LABEL: vec384_v24i16_to_v4i96_factor6: +; AVX512F-SLOW: # %bb.0: +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX512F-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-SLOW-NEXT: vzeroupper +; AVX512F-SLOW-NEXT: retq +; +; AVX512F-FAST-LABEL: vec384_v24i16_to_v4i96_factor6: +; AVX512F-FAST: # %bb.0: +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm0[6,7],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX512F-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-FAST-NEXT: vzeroupper +; AVX512F-FAST-NEXT: retq +; +; AVX512BW-SLOW-LABEL: vec384_v24i16_to_v4i96_factor6: +; AVX512BW-SLOW: # %bb.0: +; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), 
%ymm0 +; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [16,1,2,3,4,5,17,7,8,9,10,11,18,13,14,15] +; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512BW-SLOW-NEXT: vpermt2w %ymm0, %ymm1, %ymm2 +; AVX512BW-SLOW-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX512BW-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4,5,6,7] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-SLOW-NEXT: vzeroupper +; AVX512BW-SLOW-NEXT: retq +; +; AVX512BW-FAST-LABEL: vec384_v24i16_to_v4i96_factor6: +; AVX512BW-FAST: # %bb.0: +; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [16,1,2,3,4,5,17,7,8,9,10,11,18,13,14,15] +; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512BW-FAST-NEXT: vpermt2w %ymm0, %ymm1, %ymm2 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[6,7],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-FAST-NEXT: vzeroupper +; AVX512BW-FAST-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> + %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <24 x i16> + %zextd.vec = shufflevector <24 x i16> %in.vec.cast, <24 x i16> zeroinitializer, <24 x i32> + %out.bytevec = bitcast <24 x i16> %zextd.vec to <48 x i8> + %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec384_v24i16_to_v3i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec384_v24i16_to_v3i128_factor8: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: paddb 16(%rdx), %xmm2 +; SSE-NEXT: paddb 32(%rdx), %xmm0 +; SSE-NEXT: paddb (%rdx), %xmm1 +; SSE-NEXT: movdqa %xmm1, (%rcx) +; SSE-NEXT: movdqa %xmm0, 32(%rcx) +; SSE-NEXT: movdqa %xmm2, 16(%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec384_v24i16_to_v3i128_factor8: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] +; AVX-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpslldq {{.*#+}} 
xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, (%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: retq +; +; AVX2-SLOW-LABEL: vec384_v24i16_to_v3i128_factor8: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: vec384_v24i16_to_v3i128_factor8: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq +; +; AVX2-FAST-LABEL: vec384_v24i16_to_v3i128_factor8: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512F-SLOW-LABEL: vec384_v24i16_to_v3i128_factor8: +; AVX512F-SLOW: # %bb.0: +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 
= xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-SLOW-NEXT: vzeroupper +; AVX512F-SLOW-NEXT: retq +; +; AVX512F-FAST-LABEL: vec384_v24i16_to_v3i128_factor8: +; AVX512F-FAST: # %bb.0: +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX512F-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-FAST-NEXT: vzeroupper +; AVX512F-FAST-NEXT: retq +; +; AVX512BW-SLOW-LABEL: vec384_v24i16_to_v3i128_factor8: +; AVX512BW-SLOW: # %bb.0: +; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,17,9,10,11,12,13,14,15] +; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512BW-SLOW-NEXT: vpermt2w %ymm0, %ymm1, %ymm2 +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX512BW-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-SLOW-NEXT: vzeroupper +; AVX512BW-SLOW-NEXT: retq +; +; AVX512BW-FAST-LABEL: vec384_v24i16_to_v3i128_factor8: +; AVX512BW-FAST: # %bb.0: +; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,17,9,10,11,12,13,14,15] +; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512BW-FAST-NEXT: vpermt2w %ymm0, %ymm1, %ymm2 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-FAST-NEXT: vzeroupper +; AVX512BW-FAST-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> + %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <24 x i16> + %zextd.vec = shufflevector <24 x i16> %in.vec.cast, <24 x i16> zeroinitializer, <24 x i32> + %out.bytevec = bitcast <24 x i16> %zextd.vec to <48 x i8> + %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> 
%out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec384_v24i16_to_v2i192_factor12(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec384_v24i16_to_v2i192_factor12: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] +; SSE-NEXT: movaps 32(%rdx), %xmm2 +; SSE-NEXT: paddb 16(%rdx), %xmm0 +; SSE-NEXT: paddb (%rdx), %xmm1 +; SSE-NEXT: movaps %xmm2, 32(%rcx) +; SSE-NEXT: movdqa %xmm1, (%rcx) +; SSE-NEXT: movdqa %xmm0, 16(%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec384_v24i16_to_v2i192_factor12: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7] +; AVX-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 +; AVX-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, (%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: vec384_v24i16_to_v2i192_factor12: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec384_v24i16_to_v2i192_factor12: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] +; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec384_v24i16_to_v2i192_factor12: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,8,9,10,11,17,13,14,15] +; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512BW-NEXT: vpermt2w %ymm0, %ymm1, %ymm2 +; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> + %in.vec.cast = bitcast <48 x i8> 
%in.vec.trunc to <24 x i16> + %zextd.vec = shufflevector <24 x i16> %in.vec.cast, <24 x i16> zeroinitializer, <24 x i32> + %out.bytevec = bitcast <24 x i16> %zextd.vec to <48 x i8> + %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec384_v24i16_to_v1i384_factor24(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec384_v24i16_to_v1i384_factor24: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: movaps 16(%rdx), %xmm1 +; SSE-NEXT: movaps 32(%rdx), %xmm2 +; SSE-NEXT: paddb (%rdx), %xmm0 +; SSE-NEXT: movaps %xmm1, 16(%rcx) +; SSE-NEXT: movaps %xmm2, 32(%rcx) +; SSE-NEXT: movdqa %xmm0, (%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec384_v24i16_to_v1i384_factor24: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7] +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovaps 16(%rdx), %xmm2 +; AVX-NEXT: vmovaps %xmm2, 16(%rcx) +; AVX-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: vec384_v24i16_to_v1i384_factor24: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec384_v24i16_to_v1i384_factor24: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec384_v24i16_to_v1i384_factor24: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> + %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <24 x i16> + %zextd.vec = shufflevector <24 x i16> %in.vec.cast, <24 x i16> zeroinitializer, <24 x i32> + %out.bytevec = bitcast <24 x i16> %zextd.vec to <48 x i8> + %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> 
%out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec384_v12i32_to_v6i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec384_v12i32_to_v6i64_factor2: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa 16(%rdi), %xmm1 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: paddb 16(%rsi), %xmm1 +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: paddb 16(%rdx), %xmm0 +; SSE-NEXT: paddb (%rdx), %xmm3 +; SSE-NEXT: paddb 32(%rdx), %xmm1 +; SSE-NEXT: movdqa %xmm1, 32(%rcx) +; SSE-NEXT: movdqa %xmm3, (%rcx) +; SSE-NEXT: movdqa %xmm0, 16(%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec384_v12i32_to_v6i64_factor2: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: vec384_v12i32_to_v6i64_factor2: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec384_v12i32_to_v6i64_factor2: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec384_v12i32_to_v6i64_factor2: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512BW-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; 
AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> + %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <12 x i32> + %zextd.vec = shufflevector <12 x i32> %in.vec.cast, <12 x i32> zeroinitializer, <12 x i32> + %out.bytevec = bitcast <12 x i32> %zextd.vec to <48 x i8> + %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec384_v12i32_to_v4i96_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec384_v12i32_to_v4i96_factor3: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,0,4294967295,0] +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3] +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,1,3] +; SSE-NEXT: paddb (%rdx), %xmm0 +; SSE-NEXT: paddb 32(%rdx), %xmm3 +; SSE-NEXT: paddb 16(%rdx), %xmm2 +; SSE-NEXT: movdqa %xmm2, 16(%rcx) +; SSE-NEXT: movdqa %xmm3, 32(%rcx) +; SSE-NEXT: movdqa %xmm0, (%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec384_v12i32_to_v4i96_factor3: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm2 +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3] +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-SLOW-LABEL: vec384_v12i32_to_v4i96_factor3: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: vec384_v12i32_to_v4i96_factor3: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb 
(%rsi), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq +; +; AVX2-FAST-LABEL: vec384_v12i32_to_v4i96_factor3: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512F-LABEL: vec384_v12i32_to_v4i96_factor3: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <16,1,2,17,4,5,18,7,8,19,10,11,u,u,u,u> +; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-SLOW-LABEL: vec384_v12i32_to_v4i96_factor3: +; AVX512BW-SLOW: # %bb.0: +; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-SLOW-NEXT: movb $73, %al +; AVX512BW-SLOW-NEXT: kmovd %eax, %k1 +; AVX512BW-SLOW-NEXT: vpexpandd %ymm0, %ymm1 {%k1} {z} +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-SLOW-NEXT: vzeroupper +; AVX512BW-SLOW-NEXT: retq +; +; AVX512BW-FAST-LABEL: vec384_v12i32_to_v4i96_factor3: +; AVX512BW-FAST: # %bb.0: +; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-FAST-NEXT: movb $73, %al +; AVX512BW-FAST-NEXT: kmovd %eax, %k1 +; AVX512BW-FAST-NEXT: vpexpandd %ymm0, %ymm1 {%k1} {z} +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-FAST-NEXT: vzeroupper +; AVX512BW-FAST-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = 
shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> + %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <12 x i32> + %zextd.vec = shufflevector <12 x i32> %in.vec.cast, <12 x i32> zeroinitializer, <12 x i32> + %out.bytevec = bitcast <12 x i32> %zextd.vec to <48 x i8> + %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec384_v12i32_to_v3i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec384_v12i32_to_v3i128_factor4: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: xorps %xmm2, %xmm2 +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[1,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] +; SSE-NEXT: paddb 16(%rdx), %xmm0 +; SSE-NEXT: paddb 32(%rdx), %xmm3 +; SSE-NEXT: paddb (%rdx), %xmm2 +; SSE-NEXT: movdqa %xmm2, (%rcx) +; SSE-NEXT: movdqa %xmm3, 32(%rcx) +; SSE-NEXT: movdqa %xmm0, 16(%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec384_v12i32_to_v3i128_factor4: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3] +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-SLOW-LABEL: vec384_v12i32_to_v3i128_factor4: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; AVX2-SLOW-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] +; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: vec384_v12i32_to_v3i128_factor4: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; 
AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq +; +; AVX2-FAST-LABEL: vec384_v12i32_to_v3i128_factor4: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512F-LABEL: vec384_v12i32_to_v3i128_factor4: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <16,1,2,3,17,5,6,7,18,9,10,11,u,u,u,u> +; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-SLOW-LABEL: vec384_v12i32_to_v3i128_factor4: +; AVX512BW-SLOW: # %bb.0: +; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-SLOW-NEXT: movb $17, %al +; AVX512BW-SLOW-NEXT: kmovd %eax, %k1 +; AVX512BW-SLOW-NEXT: vpexpandd %ymm0, %ymm1 {%k1} {z} +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-SLOW-NEXT: vzeroupper +; AVX512BW-SLOW-NEXT: retq +; +; AVX512BW-FAST-LABEL: vec384_v12i32_to_v3i128_factor4: +; AVX512BW-FAST: # %bb.0: +; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-FAST-NEXT: movb $17, %al +; AVX512BW-FAST-NEXT: kmovd %eax, %k1 +; AVX512BW-FAST-NEXT: vpexpandd %ymm0, %ymm1 {%k1} {z} +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-FAST-NEXT: vzeroupper +; AVX512BW-FAST-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> + %in.vec.cast = bitcast <48 x i8> 
%in.vec.trunc to <12 x i32> + %zextd.vec = shufflevector <12 x i32> %in.vec.cast, <12 x i32> zeroinitializer, <12 x i32> + %out.bytevec = bitcast <12 x i32> %zextd.vec to <48 x i8> + %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec384_v12i32_to_v2i192_factor6(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec384_v12i32_to_v2i192_factor6: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: xorps %xmm2, %xmm2 +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-NEXT: movaps 32(%rdx), %xmm0 +; SSE-NEXT: paddb 16(%rdx), %xmm1 +; SSE-NEXT: paddb (%rdx), %xmm2 +; SSE-NEXT: movaps %xmm0, 32(%rcx) +; SSE-NEXT: movdqa %xmm2, (%rcx) +; SSE-NEXT: movdqa %xmm1, 16(%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec384_v12i32_to_v2i192_factor6: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5],ymm0[6],ymm1[7] +; AVX-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-SLOW-LABEL: vec384_v12i32_to_v2i192_factor6: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5],ymm1[6],ymm0[7] +; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: vec384_v12i32_to_v2i192_factor6: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5],ymm1[6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq +; +; AVX2-FAST-LABEL: vec384_v12i32_to_v2i192_factor6: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vbroadcasti128 
{{.*#+}} ymm2 = [0,0,1,0,0,0,1,0] +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5],ymm0[6],ymm1[7] +; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512F-LABEL: vec384_v12i32_to_v2i192_factor6: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: movb $65, %al +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vpexpandd %ymm0, %ymm0 {%k1} {z} +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec384_v12i32_to_v2i192_factor6: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-NEXT: movb $65, %al +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vpexpandd %ymm0, %ymm0 {%k1} {z} +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> + %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <12 x i32> + %zextd.vec = shufflevector <12 x i32> %in.vec.cast, <12 x i32> zeroinitializer, <12 x i32> + %out.bytevec = bitcast <12 x i32> %zextd.vec to <48 x i8> + %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec384_v12i32_to_v1i384_factor12(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec384_v12i32_to_v1i384_factor12: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; SSE-NEXT: movaps 16(%rdx), %xmm0 +; SSE-NEXT: movaps 32(%rdx), %xmm2 +; SSE-NEXT: paddb (%rdx), %xmm1 +; SSE-NEXT: movaps %xmm0, 16(%rcx) +; SSE-NEXT: movaps %xmm2, 32(%rcx) +; SSE-NEXT: movdqa %xmm1, (%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec384_v12i32_to_v1i384_factor12: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovaps 16(%rdx), %xmm2 +; AVX-NEXT: vmovaps %xmm2, 16(%rcx) +; AVX-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: vec384_v12i32_to_v1i384_factor12: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; 
AVX2-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec384_v12i32_to_v1i384_factor12: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec384_v12i32_to_v1i384_factor12: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> + %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <12 x i32> + %zextd.vec = shufflevector <12 x i32> %in.vec.cast, <12 x i32> zeroinitializer, <12 x i32> + %out.bytevec = bitcast <12 x i32> %zextd.vec to <48 x i8> + %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec384_v6i64_to_v3i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec384_v6i64_to_v3i128_factor2: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa 16(%rdi), %xmm1 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: paddb 16(%rsi), %xmm1 +; SSE-NEXT: movq {{.*#+}} xmm1 = xmm1[0],zero +; SSE-NEXT: movq {{.*#+}} xmm2 = xmm0[0],zero +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: paddb 16(%rdx), %xmm0 +; SSE-NEXT: paddb (%rdx), %xmm2 +; SSE-NEXT: paddb 32(%rdx), %xmm1 +; SSE-NEXT: movdqa %xmm1, 32(%rcx) +; SSE-NEXT: movdqa %xmm2, (%rcx) +; SSE-NEXT: movdqa %xmm0, 16(%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec384_v6i64_to_v3i128_factor2: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[3],ymm2[3] +; AVX-NEXT: vmovq {{.*#+}} xmm1 = xmm1[0],zero +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: vec384_v6i64_to_v3i128_factor2: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,1,3] +; 
AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec384_v6i64_to_v3i128_factor2: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,9,1,11,2,13,u,u> +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec384_v6i64_to_v3i128_factor2: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-NEXT: movb $5, %al +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vpexpandq %ymm0, %ymm1 {%k1} {z} +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> + %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <6 x i64> + %zextd.vec = shufflevector <6 x i64> %in.vec.cast, <6 x i64> zeroinitializer, <6 x i32> + %out.bytevec = bitcast <6 x i64> %zextd.vec to <48 x i8> + %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec384_v6i64_to_v2i192_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec384_v6i64_to_v2i192_factor3: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero +; SSE-NEXT: movaps 32(%rdx), %xmm2 +; SSE-NEXT: paddb (%rdx), %xmm0 +; SSE-NEXT: paddb 16(%rdx), %xmm1 +; SSE-NEXT: movaps %xmm2, 32(%rcx) +; SSE-NEXT: movdqa %xmm1, 16(%rcx) +; SSE-NEXT: movdqa %xmm0, (%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec384_v6i64_to_v2i192_factor3: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = zero,zero,ymm0[0,1] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: 
vec384_v6i64_to_v2i192_factor3: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec384_v6i64_to_v2i192_factor3: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: movb $9, %al +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vpexpandq %ymm0, %ymm0 {%k1} {z} +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec384_v6i64_to_v2i192_factor3: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-NEXT: movb $9, %al +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vpexpandq %ymm0, %ymm0 {%k1} {z} +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> + %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <6 x i64> + %zextd.vec = shufflevector <6 x i64> %in.vec.cast, <6 x i64> zeroinitializer, <6 x i32> + %out.bytevec = bitcast <6 x i64> %zextd.vec to <48 x i8> + %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec384_v6i64_to_v1i384_factor6(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec384_v6i64_to_v1i384_factor6: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero +; SSE-NEXT: movaps 16(%rdx), %xmm1 +; SSE-NEXT: movaps 32(%rdx), %xmm2 +; SSE-NEXT: paddb (%rdx), %xmm0 +; SSE-NEXT: movaps %xmm1, 16(%rcx) +; SSE-NEXT: movaps %xmm2, 32(%rcx) +; SSE-NEXT: movdqa %xmm0, (%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec384_v6i64_to_v1i384_factor6: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovaps 16(%rdx), %xmm2 +; AVX-NEXT: vmovaps %xmm2, 16(%rcx) +; AVX-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: vec384_v6i64_to_v1i384_factor6: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq 
+; +; AVX512F-LABEL: vec384_v6i64_to_v1i384_factor6: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec384_v6i64_to_v1i384_factor6: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> + %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <6 x i64> + %zextd.vec = shufflevector <6 x i64> %in.vec.cast, <6 x i64> zeroinitializer, <6 x i32> + %out.bytevec = bitcast <6 x i64> %zextd.vec to <48 x i8> + %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec384_v3i128_to_v1i384_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec384_v3i128_to_v1i384_factor3: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: movaps 16(%rdx), %xmm1 +; SSE-NEXT: movaps 32(%rdx), %xmm2 +; SSE-NEXT: paddb (%rdx), %xmm0 +; SSE-NEXT: movaps %xmm1, 16(%rcx) +; SSE-NEXT: movaps %xmm2, 32(%rcx) +; SSE-NEXT: movdqa %xmm0, (%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec384_v3i128_to_v1i384_factor3: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: movq %rax, %rsi +; AVX-NEXT: movq %rax, %rdi +; AVX-NEXT: movq %rax, %r8 +; AVX-NEXT: movq %rax, %r9 +; AVX-NEXT: movl %eax, %r10d +; AVX-NEXT: movl %eax, %r11d +; AVX-NEXT: vmovd %eax, %xmm1 +; AVX-NEXT: shrl $8, %eax +; AVX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX-NEXT: shrl $16, %r11d +; AVX-NEXT: vpinsrb $2, %r11d, %xmm1, %xmm1 +; AVX-NEXT: shrl $24, %r10d +; AVX-NEXT: vpinsrb $3, %r10d, %xmm1, %xmm1 +; AVX-NEXT: shrq $32, %r9 +; AVX-NEXT: vpinsrb $4, %r9d, %xmm1, %xmm1 +; AVX-NEXT: shrq $40, %r8 +; AVX-NEXT: vpinsrb $5, %r8d, %xmm1, %xmm1 +; AVX-NEXT: shrq $48, %rdi +; AVX-NEXT: vpinsrb $6, %edi, %xmm1, %xmm1 +; AVX-NEXT: vpextrq $1, %xmm0, %rax +; AVX-NEXT: shrq $56, %rsi +; AVX-NEXT: vpinsrb $7, %esi, %xmm1, %xmm0 +; AVX-NEXT: movl %eax, %esi +; AVX-NEXT: shrl $8, %esi +; AVX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $9, %esi, %xmm0, %xmm0 +; AVX-NEXT: movl %eax, %esi +; AVX-NEXT: shrl $16, %esi +; AVX-NEXT: vpinsrb $10, %esi, %xmm0, %xmm0 +; AVX-NEXT: movl %eax, %esi +; AVX-NEXT: shrl $24, %esi +; AVX-NEXT: vpinsrb $11, %esi, %xmm0, %xmm0 +; AVX-NEXT: movq %rax, %rsi +; AVX-NEXT: shrq $32, %rsi +; AVX-NEXT: vpinsrb $12, %esi, %xmm0, %xmm0 +; AVX-NEXT: movq %rax, %rsi +; AVX-NEXT: shrq $40, %rsi +; AVX-NEXT: vpinsrb $13, %esi, %xmm0, %xmm0 +; AVX-NEXT: movq %rax, %rsi +; 
AVX-NEXT: shrq $48, %rsi +; AVX-NEXT: vpinsrb $14, %esi, %xmm0, %xmm0 +; AVX-NEXT: shrq $56, %rax +; AVX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovaps 16(%rdx), %xmm2 +; AVX-NEXT: vmovaps %xmm2, 16(%rcx) +; AVX-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: vec384_v3i128_to_v1i384_factor3: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: movq %rax, %rsi +; AVX2-NEXT: movq %rax, %rdi +; AVX2-NEXT: movq %rax, %r8 +; AVX2-NEXT: movq %rax, %r9 +; AVX2-NEXT: movl %eax, %r10d +; AVX2-NEXT: movl %eax, %r11d +; AVX2-NEXT: vmovd %eax, %xmm1 +; AVX2-NEXT: shrl $8, %eax +; AVX2-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX2-NEXT: shrl $16, %r11d +; AVX2-NEXT: vpinsrb $2, %r11d, %xmm1, %xmm1 +; AVX2-NEXT: shrl $24, %r10d +; AVX2-NEXT: vpinsrb $3, %r10d, %xmm1, %xmm1 +; AVX2-NEXT: shrq $32, %r9 +; AVX2-NEXT: vpinsrb $4, %r9d, %xmm1, %xmm1 +; AVX2-NEXT: shrq $40, %r8 +; AVX2-NEXT: vpinsrb $5, %r8d, %xmm1, %xmm1 +; AVX2-NEXT: shrq $48, %rdi +; AVX2-NEXT: vpinsrb $6, %edi, %xmm1, %xmm1 +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: shrq $56, %rsi +; AVX2-NEXT: vpinsrb $7, %esi, %xmm1, %xmm0 +; AVX2-NEXT: movl %eax, %esi +; AVX2-NEXT: shrl $8, %esi +; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrb $9, %esi, %xmm0, %xmm0 +; AVX2-NEXT: movl %eax, %esi +; AVX2-NEXT: shrl $16, %esi +; AVX2-NEXT: vpinsrb $10, %esi, %xmm0, %xmm0 +; AVX2-NEXT: movl %eax, %esi +; AVX2-NEXT: shrl $24, %esi +; AVX2-NEXT: vpinsrb $11, %esi, %xmm0, %xmm0 +; AVX2-NEXT: movq %rax, %rsi +; AVX2-NEXT: shrq $32, %rsi +; AVX2-NEXT: vpinsrb $12, %esi, %xmm0, %xmm0 +; AVX2-NEXT: movq %rax, %rsi +; AVX2-NEXT: shrq $40, %rsi +; AVX2-NEXT: vpinsrb $13, %esi, %xmm0, %xmm0 +; AVX2-NEXT: movq %rax, %rsi +; AVX2-NEXT: shrq $48, %rsi +; AVX2-NEXT: vpinsrb $14, %esi, %xmm0, %xmm0 +; AVX2-NEXT: shrq $56, %rax +; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec384_v3i128_to_v1i384_factor3: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: movq %rax, %rsi +; AVX512F-NEXT: movq %rax, %rdi +; AVX512F-NEXT: movq %rax, %r8 +; AVX512F-NEXT: movq %rax, %r9 +; AVX512F-NEXT: movl %eax, %r10d +; AVX512F-NEXT: movl %eax, %r11d +; AVX512F-NEXT: vmovd %eax, %xmm1 +; AVX512F-NEXT: shrl $8, %eax +; AVX512F-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: shrl $16, %r11d +; AVX512F-NEXT: vpinsrb $2, %r11d, %xmm1, %xmm1 +; AVX512F-NEXT: shrl $24, %r10d +; AVX512F-NEXT: vpinsrb $3, %r10d, %xmm1, %xmm1 +; AVX512F-NEXT: shrq $32, %r9 +; AVX512F-NEXT: vpinsrb $4, %r9d, %xmm1, %xmm1 +; AVX512F-NEXT: shrq $40, %r8 +; AVX512F-NEXT: vpinsrb $5, %r8d, %xmm1, %xmm1 +; AVX512F-NEXT: shrq $48, %rdi +; AVX512F-NEXT: vpinsrb $6, %edi, %xmm1, %xmm1 +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-NEXT: shrq $56, %rsi +; AVX512F-NEXT: vpinsrb $7, %esi, %xmm1, %xmm0 +; AVX512F-NEXT: movl %eax, %esi +; AVX512F-NEXT: shrl $8, %esi +; AVX512F-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $9, %esi, %xmm0, %xmm0 +; AVX512F-NEXT: movl %eax, %esi +; AVX512F-NEXT: shrl $16, 
%esi +; AVX512F-NEXT: vpinsrb $10, %esi, %xmm0, %xmm0 +; AVX512F-NEXT: movl %eax, %esi +; AVX512F-NEXT: shrl $24, %esi +; AVX512F-NEXT: vpinsrb $11, %esi, %xmm0, %xmm0 +; AVX512F-NEXT: movq %rax, %rsi +; AVX512F-NEXT: shrq $32, %rsi +; AVX512F-NEXT: vpinsrb $12, %esi, %xmm0, %xmm0 +; AVX512F-NEXT: movq %rax, %rsi +; AVX512F-NEXT: shrq $40, %rsi +; AVX512F-NEXT: vpinsrb $13, %esi, %xmm0, %xmm0 +; AVX512F-NEXT: movq %rax, %rsi +; AVX512F-NEXT: shrq $48, %rsi +; AVX512F-NEXT: vpinsrb $14, %esi, %xmm0, %xmm0 +; AVX512F-NEXT: shrq $56, %rax +; AVX512F-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec384_v3i128_to_v1i384_factor3: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vmovq %xmm0, %rax +; AVX512BW-NEXT: movq %rax, %rsi +; AVX512BW-NEXT: movq %rax, %rdi +; AVX512BW-NEXT: movq %rax, %r8 +; AVX512BW-NEXT: movq %rax, %r9 +; AVX512BW-NEXT: movl %eax, %r10d +; AVX512BW-NEXT: movl %eax, %r11d +; AVX512BW-NEXT: vmovd %eax, %xmm1 +; AVX512BW-NEXT: shrl $8, %eax +; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: shrl $16, %r11d +; AVX512BW-NEXT: vpinsrb $2, %r11d, %xmm1, %xmm1 +; AVX512BW-NEXT: shrl $24, %r10d +; AVX512BW-NEXT: vpinsrb $3, %r10d, %xmm1, %xmm1 +; AVX512BW-NEXT: shrq $32, %r9 +; AVX512BW-NEXT: vpinsrb $4, %r9d, %xmm1, %xmm1 +; AVX512BW-NEXT: shrq $40, %r8 +; AVX512BW-NEXT: vpinsrb $5, %r8d, %xmm1, %xmm1 +; AVX512BW-NEXT: shrq $48, %rdi +; AVX512BW-NEXT: vpinsrb $6, %edi, %xmm1, %xmm1 +; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax +; AVX512BW-NEXT: shrq $56, %rsi +; AVX512BW-NEXT: vpinsrb $7, %esi, %xmm1, %xmm0 +; AVX512BW-NEXT: movl %eax, %esi +; AVX512BW-NEXT: shrl $8, %esi +; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX512BW-NEXT: vpinsrb $9, %esi, %xmm0, %xmm0 +; AVX512BW-NEXT: movl %eax, %esi +; AVX512BW-NEXT: shrl $16, %esi +; AVX512BW-NEXT: vpinsrb $10, %esi, %xmm0, %xmm0 +; AVX512BW-NEXT: movl %eax, %esi +; AVX512BW-NEXT: shrl $24, %esi +; AVX512BW-NEXT: vpinsrb $11, %esi, %xmm0, %xmm0 +; AVX512BW-NEXT: movq %rax, %rsi +; AVX512BW-NEXT: shrq $32, %rsi +; AVX512BW-NEXT: vpinsrb $12, %esi, %xmm0, %xmm0 +; AVX512BW-NEXT: movq %rax, %rsi +; AVX512BW-NEXT: shrq $40, %rsi +; AVX512BW-NEXT: vpinsrb $13, %esi, %xmm0, %xmm0 +; AVX512BW-NEXT: movq %rax, %rsi +; AVX512BW-NEXT: shrq $48, %rsi +; AVX512BW-NEXT: vpinsrb $14, %esi, %xmm0, %xmm0 +; AVX512BW-NEXT: shrq $56, %rax +; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.trunc = shufflevector <64 x i8> %in.vec, <64 x i8> poison, <48 x i32> + %in.vec.cast = bitcast <48 x i8> %in.vec.trunc to <3 x i128> + %zextd.vec = shufflevector <3 x i128> %in.vec.cast, <3 x i128> zeroinitializer, <3 x i32> + %out.bytevec = bitcast <3 x i128> %zextd.vec to <48 x i8> + %out.bytevec.padded = shufflevector <48 x i8> %out.bytevec, <48 x i8> poison, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec.padded, %out.vec.bias + store <64 x i8> 
%out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec512_v64i8_to_v32i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec512_v64i8_to_v32i16_factor2: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa 16(%rdi), %xmm1 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: paddb 16(%rsi), %xmm1 +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: paddb 16(%rdx), %xmm0 +; SSE-NEXT: paddb (%rdx), %xmm4 +; SSE-NEXT: paddb 48(%rdx), %xmm1 +; SSE-NEXT: paddb 32(%rdx), %xmm3 +; SSE-NEXT: movdqa %xmm3, 32(%rcx) +; SSE-NEXT: movdqa %xmm1, 48(%rcx) +; SSE-NEXT: movdqa %xmm4, (%rcx) +; SSE-NEXT: movdqa %xmm0, 16(%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec512_v64i8_to_v32i16_factor2: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] +; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 32(%rdx), %xmm4, %xmm3 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, (%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: retq +; +; AVX2-LABEL: vec512_v64i8_to_v32i16_factor2: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) +; 
AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec512_v64i8_to_v32i16_factor2: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec512_v64i8_to_v32i16_factor2: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %zextd.vec = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %zextd.vec, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec512_v64i8_to_v16i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec512_v64i8_to_v16i32_factor4: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: paddb 16(%rdx), %xmm0 +; SSE-NEXT: paddb (%rdx), %xmm4 +; SSE-NEXT: paddb 48(%rdx), %xmm2 +; SSE-NEXT: paddb 32(%rdx), 
%xmm3 +; SSE-NEXT: movdqa %xmm3, 32(%rcx) +; SSE-NEXT: movdqa %xmm2, 48(%rcx) +; SSE-NEXT: movdqa %xmm4, (%rcx) +; SSE-NEXT: movdqa %xmm0, 16(%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec512_v64i8_to_v16i32_factor4: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 32(%rdx), %xmm3, %xmm3 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, (%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) +; AVX-NEXT: retq +; +; AVX2-LABEL: vec512_v64i8_to_v16i32_factor4: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec512_v64i8_to_v16i32_factor4: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec512_v64i8_to_v16i32_factor4: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %zextd.vec = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %zextd.vec, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec512_v64i8_to_v8i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec512_v64i8_to_v8i64_factor8: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: paddb 16(%rdx), %xmm0 +; SSE-NEXT: paddb (%rdx), %xmm4 +; SSE-NEXT: paddb 48(%rdx), %xmm2 +; SSE-NEXT: paddb 32(%rdx), %xmm3 +; SSE-NEXT: movdqa %xmm3, 32(%rcx) +; SSE-NEXT: movdqa %xmm2, 48(%rcx) +; SSE-NEXT: movdqa %xmm4, (%rcx) +; SSE-NEXT: movdqa %xmm0, 16(%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec512_v64i8_to_v8i64_factor8: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpsrld $16, %xmm0, %xmm2 +; AVX-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] +; AVX-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 32(%rdx), %xmm3, %xmm3 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, (%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) +; AVX-NEXT: retq +; +; AVX2-LABEL: vec512_v64i8_to_v8i64_factor8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vpaddb 
(%rsi), %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec512_v64i8_to_v8i64_factor8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec512_v64i8_to_v8i64_factor8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %zextd.vec = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %zextd.vec, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec512_v64i8_to_v4i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec512_v64i8_to_v4i128_factor16: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2] +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: pslldq {{.*#+}} xmm0 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: paddb 16(%rdx), %xmm0 +; SSE-NEXT: paddb 48(%rdx), %xmm3 +; SSE-NEXT: paddb 32(%rdx), %xmm2 +; SSE-NEXT: paddb (%rdx), %xmm1 +; SSE-NEXT: movdqa %xmm1, (%rcx) +; SSE-NEXT: movdqa %xmm2, 32(%rcx) +; SSE-NEXT: movdqa %xmm3, 48(%rcx) +; SSE-NEXT: movdqa %xmm0, 16(%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec512_v64i8_to_v4i128_factor16: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] +; AVX-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2] +; AVX-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 32(%rdx), %xmm3, %xmm3 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, (%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) +; AVX-NEXT: retq +; +; AVX2-SLOW-LABEL: vec512_v64i8_to_v4i128_factor16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-SLOW-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: vec512_v64i8_to_v4i128_factor16: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; 
AVX2-FAST-PERLANE-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq +; +; AVX2-FAST-LABEL: vec512_v64i8_to_v4i128_factor16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-FAST-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512F-SLOW-LABEL: vec512_v64i8_to_v4i128_factor16: +; AVX512F-SLOW: # %bb.0: +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX512F-SLOW-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-SLOW-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-SLOW-NEXT: vzeroupper +; AVX512F-SLOW-NEXT: retq +; +; AVX512F-FAST-LABEL: vec512_v64i8_to_v4i128_factor16: +; AVX512F-FAST: # %bb.0: +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: 
vmovdqa %ymm0, (%rcx) +; AVX512F-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-FAST-NEXT: vzeroupper +; AVX512F-FAST-NEXT: retq +; +; AVX512BW-SLOW-LABEL: vec512_v64i8_to_v4i128_factor16: +; AVX512BW-SLOW: # %bb.0: +; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512BW-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BW-SLOW-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX512BW-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,1,3,4,5,5,7] +; AVX512BW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512BW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-SLOW-NEXT: vzeroupper +; AVX512BW-SLOW-NEXT: retq +; +; AVX512BW-FAST-LABEL: vec512_v64i8_to_v4i128_factor16: +; AVX512BW-FAST: # %bb.0: +; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,1,3,4,5,5,7] +; AVX512BW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512BW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-FAST-NEXT: vzeroupper +; AVX512BW-FAST-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %zextd.vec = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %zextd.vec, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec512_v64i8_to_v2i256_factor32(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec512_v64i8_to_v2i256_factor32: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: movaps 16(%rdx), %xmm2 +; SSE-NEXT: movaps 48(%rdx), %xmm3 +; SSE-NEXT: paddb 32(%rdx), %xmm0 +; SSE-NEXT: paddb (%rdx), %xmm1 +; SSE-NEXT: movaps %xmm3, 48(%rcx) +; SSE-NEXT: movaps %xmm2, 16(%rcx) +; SSE-NEXT: movdqa %xmm1, (%rcx) +; SSE-NEXT: movdqa %xmm0, 
32(%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec512_v64i8_to_v2i256_factor32: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] +; AVX-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovaps 16(%rdx), %xmm2 +; AVX-NEXT: vmovaps 48(%rdx), %xmm3 +; AVX-NEXT: vmovaps %xmm2, 16(%rcx) +; AVX-NEXT: vmovaps %xmm3, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) +; AVX-NEXT: retq +; +; AVX2-LABEL: vec512_v64i8_to_v2i256_factor32: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec512_v64i8_to_v2i256_factor32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] +; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm1 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec512_v64i8_to_v2i256_factor32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] +; AVX512BW-NEXT: vpand %ymm1, %ymm0, %ymm1 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %zextd.vec = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %zextd.vec, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec512_v64i8_to_v1i512_factor64(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec512_v64i8_to_v1i512_factor64: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: pand 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: movaps 16(%rdx), %xmm1 +; SSE-NEXT: movaps 32(%rdx), %xmm2 +; SSE-NEXT: movaps 48(%rdx), %xmm3 +; SSE-NEXT: paddb (%rdx), %xmm0 +; SSE-NEXT: movaps %xmm2, 32(%rcx) +; SSE-NEXT: movaps %xmm3, 48(%rcx) +; SSE-NEXT: movaps %xmm1, 16(%rcx) +; SSE-NEXT: movdqa %xmm0, (%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec512_v64i8_to_v1i512_factor64: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovaps 16(%rdx), %xmm2 +; AVX-NEXT: vmovaps %xmm2, 16(%rcx) +; AVX-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: vec512_v64i8_to_v1i512_factor64: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec512_v64i8_to_v1i512_factor64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] +; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec512_v64i8_to_v1i512_factor64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] +; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %zextd.vec = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <64 x i32> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %zextd.vec, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec512_v32i16_to_v16i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec512_v32i16_to_v16i32_factor2: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa 16(%rdi), %xmm1 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: paddb 16(%rsi), %xmm1 +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: paddb 16(%rdx), %xmm0 +; SSE-NEXT: paddb (%rdx), %xmm4 +; SSE-NEXT: paddb 48(%rdx), %xmm1 +; SSE-NEXT: paddb 
32(%rdx), %xmm3 +; SSE-NEXT: movdqa %xmm3, 32(%rcx) +; SSE-NEXT: movdqa %xmm1, 48(%rcx) +; SSE-NEXT: movdqa %xmm4, (%rcx) +; SSE-NEXT: movdqa %xmm0, 16(%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec512_v32i16_to_v16i32_factor2: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 32(%rdx), %xmm4, %xmm3 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, (%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: retq +; +; AVX2-LABEL: vec512_v32i16_to_v16i32_factor2: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec512_v32i16_to_v16i32_factor2: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec512_v32i16_to_v16i32_factor2: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16> + %zextd.vec = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <32 x i32> + %out.bytevec = bitcast <32 x i16> %zextd.vec to <64 x i8> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + 
%out.vec = add <64 x i8> %out.bytevec, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec512_v32i16_to_v8i64_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec512_v32i16_to_v8i64_factor4: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: paddb 16(%rdx), %xmm0 +; SSE-NEXT: paddb (%rdx), %xmm4 +; SSE-NEXT: paddb 48(%rdx), %xmm2 +; SSE-NEXT: paddb 32(%rdx), %xmm3 +; SSE-NEXT: movdqa %xmm3, 32(%rcx) +; SSE-NEXT: movdqa %xmm2, 48(%rcx) +; SSE-NEXT: movdqa %xmm4, (%rcx) +; SSE-NEXT: movdqa %xmm0, 16(%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec512_v32i16_to_v8i64_factor4: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 32(%rdx), %xmm3, %xmm3 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, (%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) +; AVX-NEXT: retq +; +; AVX2-LABEL: vec512_v32i16_to_v8i64_factor4: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec512_v32i16_to_v8i64_factor4: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) 
+; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec512_v32i16_to_v8i64_factor4: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16> + %zextd.vec = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <32 x i32> + %out.bytevec = bitcast <32 x i16> %zextd.vec to <64 x i8> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec512_v32i16_to_v4i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec512_v32i16_to_v4i128_factor8: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: paddb 16(%rdx), %xmm3 +; SSE-NEXT: paddb 48(%rdx), %xmm2 +; SSE-NEXT: paddb 32(%rdx), %xmm0 +; SSE-NEXT: paddb (%rdx), %xmm1 +; SSE-NEXT: movdqa %xmm1, (%rcx) +; SSE-NEXT: movdqa %xmm0, 32(%rcx) +; SSE-NEXT: movdqa %xmm2, 48(%rcx) +; SSE-NEXT: movdqa %xmm3, 16(%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec512_v32i16_to_v4i128_factor8: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] +; AVX-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; AVX-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 32(%rdx), %xmm3, %xmm3 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, (%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) +; AVX-NEXT: retq +; +; AVX2-SLOW-LABEL: 
vec512_v32i16_to_v4i128_factor8: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: vec512_v32i16_to_v4i128_factor8: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq +; +; AVX2-FAST-LABEL: vec512_v32i16_to_v4i128_factor8: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512F-SLOW-LABEL: vec512_v32i16_to_v4i128_factor8: +; AVX512F-SLOW: # %bb.0: +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] +; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 
+; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-SLOW-NEXT: vzeroupper +; AVX512F-SLOW-NEXT: retq +; +; AVX512F-FAST-LABEL: vec512_v32i16_to_v4i128_factor8: +; AVX512F-FAST: # %bb.0: +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] +; AVX512F-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-FAST-NEXT: vzeroupper +; AVX512F-FAST-NEXT: retq +; +; AVX512BW-LABEL: vec512_v32i16_to_v4i128_factor8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [32,1,2,3,4,5,6,7,33,9,10,11,12,13,14,15,34,17,18,19,20,21,22,23,35,25,26,27,28,29,30,31] +; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm2 +; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16> + %zextd.vec = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <32 x i32> + %out.bytevec = bitcast <32 x i16> %zextd.vec to <64 x i8> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec512_v32i16_to_v2i256_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec512_v32i16_to_v2i256_factor16: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: movaps 16(%rdx), %xmm2 +; SSE-NEXT: movaps 48(%rdx), %xmm3 +; SSE-NEXT: paddb 32(%rdx), %xmm0 +; SSE-NEXT: paddb (%rdx), %xmm1 +; SSE-NEXT: movaps %xmm3, 48(%rcx) +; SSE-NEXT: movaps %xmm2, 16(%rcx) +; SSE-NEXT: movdqa %xmm1, (%rcx) +; SSE-NEXT: movdqa %xmm0, 32(%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec512_v32i16_to_v2i256_factor16: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; 
AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] +; AVX-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7] +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovaps 16(%rdx), %xmm2 +; AVX-NEXT: vmovaps 48(%rdx), %xmm3 +; AVX-NEXT: vmovaps %xmm2, 16(%rcx) +; AVX-NEXT: vmovaps %xmm3, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) +; AVX-NEXT: retq +; +; AVX2-LABEL: vec512_v32i16_to_v2i256_factor16: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm1 +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec512_v32i16_to_v2i256_factor16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX512F-NEXT: vpand %ymm0, %ymm1, %ymm1 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec512_v32i16_to_v2i256_factor16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX512BW-NEXT: vpand %ymm0, %ymm1, %ymm1 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16> + %zextd.vec = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <32 x i32> + %out.bytevec = bitcast <32 x i16> %zextd.vec to <64 x i8> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec512_v32i16_to_v1i512_factor32(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec512_v32i16_to_v1i512_factor32: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: movaps 
16(%rdx), %xmm1 +; SSE-NEXT: movaps 32(%rdx), %xmm2 +; SSE-NEXT: movaps 48(%rdx), %xmm3 +; SSE-NEXT: paddb (%rdx), %xmm0 +; SSE-NEXT: movaps %xmm2, 32(%rcx) +; SSE-NEXT: movaps %xmm3, 48(%rcx) +; SSE-NEXT: movaps %xmm1, 16(%rcx) +; SSE-NEXT: movdqa %xmm0, (%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec512_v32i16_to_v1i512_factor32: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7] +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovaps 16(%rdx), %xmm2 +; AVX-NEXT: vmovaps %xmm2, 16(%rcx) +; AVX-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: vec512_v32i16_to_v1i512_factor32: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec512_v32i16_to_v1i512_factor32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec512_v32i16_to_v1i512_factor32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16> + %zextd.vec = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <32 x i32> + %out.bytevec = bitcast <32 x i16> %zextd.vec to <64 x i8> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec512_v16i32_to_v8i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec512_v16i32_to_v8i64_factor2: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa 16(%rdi), %xmm1 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: paddb 16(%rsi), %xmm1 +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: paddb 16(%rdx), %xmm0 +; SSE-NEXT: paddb (%rdx), %xmm4 +; SSE-NEXT: paddb 48(%rdx), %xmm1 +; SSE-NEXT: paddb 32(%rdx), %xmm3 +; SSE-NEXT: movdqa %xmm3, 
32(%rcx) +; SSE-NEXT: movdqa %xmm1, 48(%rcx) +; SSE-NEXT: movdqa %xmm4, (%rcx) +; SSE-NEXT: movdqa %xmm0, 16(%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec512_v16i32_to_v8i64_factor2: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-NEXT: vpaddb 48(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX-NEXT: vpaddb 16(%rdx), %xmm3, %xmm3 +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 48(%rcx) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: vec512_v16i32_to_v8i64_factor2: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec512_v16i32_to_v8i64_factor2: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec512_v16i32_to_v8i64_factor2: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32> + %zextd.vec = shufflevector <16 x i32> %in.vec.cast, <16 x i32> zeroinitializer, <16 x i32> + %out.bytevec = bitcast <16 x i32> %zextd.vec to <64 x i8> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void 
+} + +define void @vec512_v16i32_to_v4i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec512_v16i32_to_v4i128_factor4: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: xorps %xmm3, %xmm3 +; SSE-NEXT: movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[1,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] +; SSE-NEXT: paddb 16(%rdx), %xmm0 +; SSE-NEXT: paddb 32(%rdx), %xmm4 +; SSE-NEXT: paddb (%rdx), %xmm3 +; SSE-NEXT: paddb 48(%rdx), %xmm2 +; SSE-NEXT: movdqa %xmm2, 48(%rcx) +; SSE-NEXT: movdqa %xmm3, (%rcx) +; SSE-NEXT: movdqa %xmm4, 32(%rcx) +; SSE-NEXT: movdqa %xmm0, 16(%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec512_v16i32_to_v4i128_factor4: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX-NEXT: vpaddb 16(%rdx), %xmm3, %xmm3 +; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, (%rcx) +; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-SLOW-LABEL: vec512_v16i32_to_v4i128_factor4: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3],ymm2[4],ymm0[5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] +; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: vec512_v16i32_to_v4i128_factor4: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3],ymm2[4],ymm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: 
vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq +; +; AVX2-FAST-LABEL: vec512_v16i32_to_v4i128_factor4: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <2,u,u,u,3,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512F-LABEL: vec512_v16i32_to_v4i128_factor4: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: movw $4369, %ax # imm = 0x1111 +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vpexpandd %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec512_v16i32_to_v4i128_factor4: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-NEXT: movb $17, %al +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vpexpandd %ymm0, %ymm1 {%k1} {z} +; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [2,9,10,11,3,13,14,15] +; AVX512BW-NEXT: vpermi2d %ymm2, %ymm0, %ymm3 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm0 +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32> + %zextd.vec = shufflevector <16 x i32> %in.vec.cast, <16 x i32> zeroinitializer, <16 x i32> + %out.bytevec = bitcast <16 x i32> %zextd.vec to <64 x i8> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec512_v16i32_to_v2i256_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec512_v16i32_to_v2i256_factor8: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: xorps %xmm2, %xmm2 +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[1,0] +; SSE-NEXT: shufps {{.*#+}} 
xmm0 = xmm0[0,2],xmm1[2,3] +; SSE-NEXT: movaps 16(%rdx), %xmm1 +; SSE-NEXT: movaps 48(%rdx), %xmm3 +; SSE-NEXT: paddb 32(%rdx), %xmm0 +; SSE-NEXT: paddb (%rdx), %xmm2 +; SSE-NEXT: movaps %xmm3, 48(%rcx) +; SSE-NEXT: movaps %xmm1, 16(%rcx) +; SSE-NEXT: movdqa %xmm2, (%rcx) +; SSE-NEXT: movdqa %xmm0, 32(%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec512_v16i32_to_v2i256_factor8: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovaps 16(%rdx), %xmm2 +; AVX-NEXT: vmovaps 48(%rdx), %xmm3 +; AVX-NEXT: vmovaps %xmm2, 16(%rcx) +; AVX-NEXT: vmovaps %xmm3, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) +; AVX-NEXT: retq +; +; AVX2-SLOW-LABEL: vec512_v16i32_to_v2i256_factor8: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: vec512_v16i32_to_v2i256_factor8: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq +; +; AVX2-FAST-LABEL: vec512_v16i32_to_v2i256_factor8: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512F-LABEL: vec512_v16i32_to_v2i256_factor8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: movw $257, %ax # imm = 0x101 +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vpexpandd %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; 
AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-SLOW-LABEL: vec512_v16i32_to_v2i256_factor8: +; AVX512BW-SLOW: # %bb.0: +; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-SLOW-NEXT: vzeroupper +; AVX512BW-SLOW-NEXT: retq +; +; AVX512BW-FAST-LABEL: vec512_v16i32_to_v2i256_factor8: +; AVX512BW-FAST: # %bb.0: +; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-FAST-NEXT: vzeroupper +; AVX512BW-FAST-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32> + %zextd.vec = shufflevector <16 x i32> %in.vec.cast, <16 x i32> zeroinitializer, <16 x i32> + %out.bytevec = bitcast <16 x i32> %zextd.vec to <64 x i8> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec512_v16i32_to_v1i512_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec512_v16i32_to_v1i512_factor16: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; SSE-NEXT: movaps 16(%rdx), %xmm0 +; SSE-NEXT: movaps 32(%rdx), %xmm2 +; SSE-NEXT: movaps 48(%rdx), %xmm3 +; SSE-NEXT: paddb (%rdx), %xmm1 +; SSE-NEXT: movaps %xmm2, 32(%rcx) +; SSE-NEXT: movaps %xmm3, 48(%rcx) +; SSE-NEXT: movaps %xmm0, 16(%rcx) +; SSE-NEXT: movdqa %xmm1, (%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec512_v16i32_to_v1i512_factor16: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovaps 16(%rdx), %xmm2 +; AVX-NEXT: vmovaps %xmm2, 16(%rcx) +; AVX-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: vec512_v16i32_to_v1i512_factor16: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: 
vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec512_v16i32_to_v1i512_factor16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec512_v16i32_to_v1i512_factor16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.cast = bitcast <64 x i8> %in.vec to <16 x i32> + %zextd.vec = shufflevector <16 x i32> %in.vec.cast, <16 x i32> zeroinitializer, <16 x i32> + %out.bytevec = bitcast <16 x i32> %zextd.vec to <64 x i8> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec512_v8i64_to_v4i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec512_v8i64_to_v4i128_factor2: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa 16(%rdi), %xmm1 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: paddb 16(%rsi), %xmm1 +; SSE-NEXT: movq {{.*#+}} xmm2 = xmm1[0],zero +; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: movq {{.*#+}} xmm3 = xmm0[0],zero +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: paddb 16(%rdx), %xmm0 +; SSE-NEXT: paddb (%rdx), %xmm3 +; SSE-NEXT: paddb 48(%rdx), %xmm1 +; SSE-NEXT: paddb 32(%rdx), %xmm2 +; SSE-NEXT: movdqa %xmm2, 32(%rcx) +; SSE-NEXT: movdqa %xmm1, 48(%rcx) +; SSE-NEXT: movdqa %xmm3, (%rcx) +; SSE-NEXT: movdqa %xmm0, 16(%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec512_v8i64_to_v4i128_factor2: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[3],ymm2[3] +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[3],ymm2[3] +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-NEXT: vpaddb 48(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX-NEXT: vpaddb 16(%rdx), %xmm3, %xmm3 +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 48(%rcx) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: vec512_v8i64_to_v4i128_factor2: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,1,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb (%rdx), %ymm2, %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec512_v8i64_to_v4i128_factor2: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: movb $85, %al +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-SLOW-LABEL: vec512_v8i64_to_v4i128_factor2: +; AVX512BW-SLOW: # %bb.0: +; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-SLOW-NEXT: movb $5, %al +; AVX512BW-SLOW-NEXT: kmovd %eax, %k1 +; AVX512BW-SLOW-NEXT: vpexpandq %ymm0, %ymm1 {%k1} {z} +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] +; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-SLOW-NEXT: vzeroupper +; AVX512BW-SLOW-NEXT: retq +; +; AVX512BW-FAST-LABEL: vec512_v8i64_to_v4i128_factor2: +; AVX512BW-FAST: # %bb.0: +; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-FAST-NEXT: movb $5, %al +; AVX512BW-FAST-NEXT: kmovd %eax, %k1 +; AVX512BW-FAST-NEXT: vpexpandq %ymm0, %ymm1 {%k1} {z} +; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [2,5,3,7] +; AVX512BW-FAST-NEXT: vpermi2q %ymm2, %ymm0, %ymm3 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm0 +; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-FAST-NEXT: vzeroupper +; AVX512BW-FAST-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.cast = bitcast <64 x i8> %in.vec to <8 x i64> + %zextd.vec = shufflevector <8 x i64> %in.vec.cast, <8 x i64> zeroinitializer, <8 x i32> + %out.bytevec = bitcast <8 x i64> %zextd.vec to <64 x i8> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec512_v8i64_to_v2i256_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec512_v8i64_to_v2i256_factor4: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: movq {{.*#+}} xmm1 = xmm0[0],zero +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: movaps 16(%rdx), %xmm2 +; SSE-NEXT: movaps 48(%rdx), %xmm3 +; SSE-NEXT: 
paddb (%rdx), %xmm1 +; SSE-NEXT: paddb 32(%rdx), %xmm0 +; SSE-NEXT: movaps %xmm3, 48(%rcx) +; SSE-NEXT: movaps %xmm2, 16(%rcx) +; SSE-NEXT: movdqa %xmm0, 32(%rcx) +; SSE-NEXT: movdqa %xmm1, (%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec512_v8i64_to_v2i256_factor4: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpsrldq {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovaps 16(%rdx), %xmm2 +; AVX-NEXT: vmovaps 48(%rdx), %xmm3 +; AVX-NEXT: vmovaps %xmm2, 16(%rcx) +; AVX-NEXT: vmovaps %xmm3, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) +; AVX-NEXT: retq +; +; AVX2-LABEL: vec512_v8i64_to_v2i256_factor4: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = xmm0[0],zero +; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec512_v8i64_to_v2i256_factor4: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: movb $17, %al +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec512_v8i64_to_v2i256_factor4: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = xmm0[0],zero +; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.cast = bitcast <64 x i8> %in.vec to <8 x i64> + %zextd.vec = shufflevector <8 x i64> %in.vec.cast, <8 x i64> zeroinitializer, <8 x i32> + %out.bytevec = bitcast <8 x i64> %zextd.vec to <64 x i8> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec512_v8i64_to_v1i512_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec512_v8i64_to_v1i512_factor8: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero +; SSE-NEXT: movaps 16(%rdx), %xmm1 +; SSE-NEXT: movaps 32(%rdx), %xmm2 +; SSE-NEXT: movaps 48(%rdx), %xmm3 +; SSE-NEXT: paddb (%rdx), %xmm0 +; SSE-NEXT: movaps %xmm2, 32(%rcx) +; SSE-NEXT: movaps %xmm3, 48(%rcx) +; SSE-NEXT: movaps %xmm1, 16(%rcx) +; 
SSE-NEXT: movdqa %xmm0, (%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec512_v8i64_to_v1i512_factor8: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovaps 16(%rdx), %xmm2 +; AVX-NEXT: vmovaps %xmm2, 16(%rcx) +; AVX-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: vec512_v8i64_to_v1i512_factor8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec512_v8i64_to_v1i512_factor8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec512_v8i64_to_v1i512_factor8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.cast = bitcast <64 x i8> %in.vec to <8 x i64> + %zextd.vec = shufflevector <8 x i64> %in.vec.cast, <8 x i64> zeroinitializer, <8 x i32> + %out.bytevec = bitcast <8 x i64> %zextd.vec to <64 x i8> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec512_v4i128_to_v2i256_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec512_v4i128_to_v2i256_factor2: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa 16(%rdi), %xmm1 +; SSE-NEXT: paddb 16(%rsi), %xmm1 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: movaps 16(%rdx), %xmm2 +; SSE-NEXT: movaps 48(%rdx), %xmm3 +; SSE-NEXT: paddb (%rdx), %xmm0 +; SSE-NEXT: paddb 32(%rdx), %xmm1 +; SSE-NEXT: movaps %xmm3, 48(%rcx) +; SSE-NEXT: movaps %xmm2, 16(%rcx) +; SSE-NEXT: movdqa %xmm1, 32(%rcx) +; SSE-NEXT: movdqa %xmm0, (%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec512_v4i128_to_v2i256_factor2: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovq %xmm1, %rax +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: movq %rax, %rsi +; AVX-NEXT: movq %rax, %rdi +; AVX-NEXT: movq %rax, %r8 +; AVX-NEXT: movq %rax, %r9 +; AVX-NEXT: movl %eax, %r10d +; AVX-NEXT: movl %eax, %r11d +; AVX-NEXT: vmovd %eax, %xmm2 +; AVX-NEXT: shrl $8, %eax +; AVX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; AVX-NEXT: shrl $16, %r11d +; AVX-NEXT: vpinsrb $2, %r11d, %xmm2, %xmm2 +; AVX-NEXT: shrl $24, %r10d +; AVX-NEXT: 
vpinsrb $3, %r10d, %xmm2, %xmm2 +; AVX-NEXT: shrq $32, %r9 +; AVX-NEXT: vpinsrb $4, %r9d, %xmm2, %xmm2 +; AVX-NEXT: shrq $40, %r8 +; AVX-NEXT: vpinsrb $5, %r8d, %xmm2, %xmm2 +; AVX-NEXT: shrq $48, %rdi +; AVX-NEXT: vpinsrb $6, %edi, %xmm2, %xmm2 +; AVX-NEXT: vpextrq $1, %xmm1, %rax +; AVX-NEXT: shrq $56, %rsi +; AVX-NEXT: vpinsrb $7, %esi, %xmm2, %xmm1 +; AVX-NEXT: movl %eax, %esi +; AVX-NEXT: shrl $8, %esi +; AVX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpinsrb $9, %esi, %xmm1, %xmm1 +; AVX-NEXT: movl %eax, %esi +; AVX-NEXT: shrl $16, %esi +; AVX-NEXT: vpinsrb $10, %esi, %xmm1, %xmm1 +; AVX-NEXT: movl %eax, %esi +; AVX-NEXT: shrl $24, %esi +; AVX-NEXT: vpinsrb $11, %esi, %xmm1, %xmm1 +; AVX-NEXT: movq %rax, %rsi +; AVX-NEXT: shrq $32, %rsi +; AVX-NEXT: vpinsrb $12, %esi, %xmm1, %xmm1 +; AVX-NEXT: movq %rax, %rsi +; AVX-NEXT: shrq $40, %rsi +; AVX-NEXT: vpinsrb $13, %esi, %xmm1, %xmm1 +; AVX-NEXT: movq %rax, %rsi +; AVX-NEXT: shrq $48, %rsi +; AVX-NEXT: vpinsrb $14, %esi, %xmm1, %xmm1 +; AVX-NEXT: vmovq %xmm0, %rsi +; AVX-NEXT: shrq $56, %rax +; AVX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; AVX-NEXT: movl %esi, %eax +; AVX-NEXT: shrl $8, %eax +; AVX-NEXT: vmovd %esi, %xmm2 +; AVX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; AVX-NEXT: movl %esi, %eax +; AVX-NEXT: shrl $16, %eax +; AVX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; AVX-NEXT: movl %esi, %eax +; AVX-NEXT: shrl $24, %eax +; AVX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; AVX-NEXT: movq %rsi, %rax +; AVX-NEXT: shrq $32, %rax +; AVX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; AVX-NEXT: movq %rsi, %rax +; AVX-NEXT: shrq $40, %rax +; AVX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; AVX-NEXT: movq %rsi, %rax +; AVX-NEXT: shrq $48, %rax +; AVX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrq $1, %xmm0, %rax +; AVX-NEXT: shrq $56, %rsi +; AVX-NEXT: vpinsrb $7, %esi, %xmm2, %xmm0 +; AVX-NEXT: movl %eax, %esi +; AVX-NEXT: shrl $8, %esi +; AVX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $9, %esi, %xmm0, %xmm0 +; AVX-NEXT: movl %eax, %esi +; AVX-NEXT: shrl $16, %esi +; AVX-NEXT: vpinsrb $10, %esi, %xmm0, %xmm0 +; AVX-NEXT: movl %eax, %esi +; AVX-NEXT: shrl $24, %esi +; AVX-NEXT: vpinsrb $11, %esi, %xmm0, %xmm0 +; AVX-NEXT: movq %rax, %rsi +; AVX-NEXT: shrq $32, %rsi +; AVX-NEXT: vpinsrb $12, %esi, %xmm0, %xmm0 +; AVX-NEXT: movq %rax, %rsi +; AVX-NEXT: shrq $40, %rsi +; AVX-NEXT: vpinsrb $13, %esi, %xmm0, %xmm0 +; AVX-NEXT: movq %rax, %rsi +; AVX-NEXT: shrq $48, %rsi +; AVX-NEXT: vpinsrb $14, %esi, %xmm0, %xmm0 +; AVX-NEXT: shrq $56, %rax +; AVX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovaps 16(%rdx), %xmm2 +; AVX-NEXT: vmovaps 48(%rdx), %xmm3 +; AVX-NEXT: vmovaps %xmm3, 48(%rcx) +; AVX-NEXT: vmovaps %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) +; AVX-NEXT: retq +; +; AVX2-LABEL: vec512_v4i128_to_v2i256_factor2: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: movq %rax, %rsi +; AVX2-NEXT: movq %rax, %rdi +; AVX2-NEXT: movq %rax, %r8 +; AVX2-NEXT: movq %rax, %r9 +; AVX2-NEXT: movl %eax, %r10d +; AVX2-NEXT: movl %eax, %r11d +; AVX2-NEXT: vmovd %eax, %xmm1 +; AVX2-NEXT: shrl $8, %eax +; AVX2-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX2-NEXT: shrl $16, %r11d +; AVX2-NEXT: vpinsrb $2, %r11d, %xmm1, %xmm1 +; AVX2-NEXT: shrl $24, %r10d +; AVX2-NEXT: vpinsrb $3, %r10d, %xmm1, %xmm1 +; AVX2-NEXT: shrq $32, %r9 +; 
AVX2-NEXT: vpinsrb $4, %r9d, %xmm1, %xmm1 +; AVX2-NEXT: shrq $40, %r8 +; AVX2-NEXT: vpinsrb $5, %r8d, %xmm1, %xmm1 +; AVX2-NEXT: shrq $48, %rdi +; AVX2-NEXT: vpinsrb $6, %edi, %xmm1, %xmm1 +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: shrq $56, %rsi +; AVX2-NEXT: vpinsrb $7, %esi, %xmm1, %xmm1 +; AVX2-NEXT: movl %eax, %esi +; AVX2-NEXT: shrl $8, %esi +; AVX2-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX2-NEXT: vpinsrb $9, %esi, %xmm1, %xmm1 +; AVX2-NEXT: movl %eax, %esi +; AVX2-NEXT: shrl $16, %esi +; AVX2-NEXT: vpinsrb $10, %esi, %xmm1, %xmm1 +; AVX2-NEXT: movl %eax, %esi +; AVX2-NEXT: shrl $24, %esi +; AVX2-NEXT: vpinsrb $11, %esi, %xmm1, %xmm1 +; AVX2-NEXT: movq %rax, %rsi +; AVX2-NEXT: shrq $32, %rsi +; AVX2-NEXT: vpinsrb $12, %esi, %xmm1, %xmm1 +; AVX2-NEXT: movq %rax, %rsi +; AVX2-NEXT: shrq $40, %rsi +; AVX2-NEXT: vpinsrb $13, %esi, %xmm1, %xmm1 +; AVX2-NEXT: movq %rax, %rsi +; AVX2-NEXT: shrq $48, %rsi +; AVX2-NEXT: vpinsrb $14, %esi, %xmm1, %xmm1 +; AVX2-NEXT: vmovq %xmm0, %rsi +; AVX2-NEXT: shrq $56, %rax +; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl %esi, %eax +; AVX2-NEXT: shrl $8, %eax +; AVX2-NEXT: vmovd %esi, %xmm2 +; AVX2-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movl %esi, %eax +; AVX2-NEXT: shrl $16, %eax +; AVX2-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movl %esi, %eax +; AVX2-NEXT: shrl $24, %eax +; AVX2-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movq %rsi, %rax +; AVX2-NEXT: shrq $32, %rax +; AVX2-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movq %rsi, %rax +; AVX2-NEXT: shrq $40, %rax +; AVX2-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movq %rsi, %rax +; AVX2-NEXT: shrq $48, %rax +; AVX2-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: shrq $56, %rsi +; AVX2-NEXT: vpinsrb $7, %esi, %xmm2, %xmm0 +; AVX2-NEXT: movl %eax, %esi +; AVX2-NEXT: shrl $8, %esi +; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrb $9, %esi, %xmm0, %xmm0 +; AVX2-NEXT: movl %eax, %esi +; AVX2-NEXT: shrl $16, %esi +; AVX2-NEXT: vpinsrb $10, %esi, %xmm0, %xmm0 +; AVX2-NEXT: movl %eax, %esi +; AVX2-NEXT: shrl $24, %esi +; AVX2-NEXT: vpinsrb $11, %esi, %xmm0, %xmm0 +; AVX2-NEXT: movq %rax, %rsi +; AVX2-NEXT: shrq $32, %rsi +; AVX2-NEXT: vpinsrb $12, %esi, %xmm0, %xmm0 +; AVX2-NEXT: movq %rax, %rsi +; AVX2-NEXT: shrq $40, %rsi +; AVX2-NEXT: vpinsrb $13, %esi, %xmm0, %xmm0 +; AVX2-NEXT: movq %rax, %rsi +; AVX2-NEXT: shrq $48, %rsi +; AVX2-NEXT: vpinsrb $14, %esi, %xmm0, %xmm0 +; AVX2-NEXT: shrq $56, %rax +; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec512_v4i128_to_v2i256_factor2: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: movb $51, %al +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec512_v4i128_to_v2i256_factor2: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; 
AVX512BW-NEXT: vmovdqa %xmm0, %xmm1 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.cast = bitcast <64 x i8> %in.vec to <4 x i128> + %zextd.vec = shufflevector <4 x i128> %in.vec.cast, <4 x i128> zeroinitializer, <4 x i32> + %out.bytevec = bitcast <4 x i128> %zextd.vec to <64 x i8> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec512_v4i128_to_v1i512_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec512_v4i128_to_v1i512_factor4: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: movaps 16(%rdx), %xmm1 +; SSE-NEXT: movaps 32(%rdx), %xmm2 +; SSE-NEXT: movaps 48(%rdx), %xmm3 +; SSE-NEXT: paddb (%rdx), %xmm0 +; SSE-NEXT: movaps %xmm2, 32(%rcx) +; SSE-NEXT: movaps %xmm3, 48(%rcx) +; SSE-NEXT: movaps %xmm1, 16(%rcx) +; SSE-NEXT: movdqa %xmm0, (%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec512_v4i128_to_v1i512_factor4: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: movq %rax, %rsi +; AVX-NEXT: movq %rax, %rdi +; AVX-NEXT: movq %rax, %r8 +; AVX-NEXT: movq %rax, %r9 +; AVX-NEXT: movl %eax, %r10d +; AVX-NEXT: movl %eax, %r11d +; AVX-NEXT: vmovd %eax, %xmm1 +; AVX-NEXT: shrl $8, %eax +; AVX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX-NEXT: shrl $16, %r11d +; AVX-NEXT: vpinsrb $2, %r11d, %xmm1, %xmm1 +; AVX-NEXT: shrl $24, %r10d +; AVX-NEXT: vpinsrb $3, %r10d, %xmm1, %xmm1 +; AVX-NEXT: shrq $32, %r9 +; AVX-NEXT: vpinsrb $4, %r9d, %xmm1, %xmm1 +; AVX-NEXT: shrq $40, %r8 +; AVX-NEXT: vpinsrb $5, %r8d, %xmm1, %xmm1 +; AVX-NEXT: shrq $48, %rdi +; AVX-NEXT: vpinsrb $6, %edi, %xmm1, %xmm1 +; AVX-NEXT: vpextrq $1, %xmm0, %rax +; AVX-NEXT: shrq $56, %rsi +; AVX-NEXT: vpinsrb $7, %esi, %xmm1, %xmm0 +; AVX-NEXT: movl %eax, %esi +; AVX-NEXT: shrl $8, %esi +; AVX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $9, %esi, %xmm0, %xmm0 +; AVX-NEXT: movl %eax, %esi +; AVX-NEXT: shrl $16, %esi +; AVX-NEXT: vpinsrb $10, %esi, %xmm0, %xmm0 +; AVX-NEXT: movl %eax, %esi +; AVX-NEXT: shrl $24, %esi +; AVX-NEXT: vpinsrb $11, %esi, %xmm0, %xmm0 +; AVX-NEXT: movq %rax, %rsi +; AVX-NEXT: shrq $32, %rsi +; AVX-NEXT: vpinsrb $12, %esi, %xmm0, %xmm0 +; AVX-NEXT: movq %rax, %rsi +; AVX-NEXT: shrq $40, %rsi +; AVX-NEXT: vpinsrb $13, %esi, %xmm0, %xmm0 +; AVX-NEXT: movq %rax, %rsi +; AVX-NEXT: shrq $48, %rsi +; AVX-NEXT: vpinsrb $14, %esi, %xmm0, %xmm0 +; AVX-NEXT: shrq $56, %rax +; AVX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovaps 16(%rdx), %xmm2 +; AVX-NEXT: vmovaps %xmm2, 16(%rcx) +; AVX-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: vec512_v4i128_to_v1i512_factor4: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: movq %rax, 
%rsi +; AVX2-NEXT: movq %rax, %rdi +; AVX2-NEXT: movq %rax, %r8 +; AVX2-NEXT: movq %rax, %r9 +; AVX2-NEXT: movl %eax, %r10d +; AVX2-NEXT: movl %eax, %r11d +; AVX2-NEXT: vmovd %eax, %xmm1 +; AVX2-NEXT: shrl $8, %eax +; AVX2-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX2-NEXT: shrl $16, %r11d +; AVX2-NEXT: vpinsrb $2, %r11d, %xmm1, %xmm1 +; AVX2-NEXT: shrl $24, %r10d +; AVX2-NEXT: vpinsrb $3, %r10d, %xmm1, %xmm1 +; AVX2-NEXT: shrq $32, %r9 +; AVX2-NEXT: vpinsrb $4, %r9d, %xmm1, %xmm1 +; AVX2-NEXT: shrq $40, %r8 +; AVX2-NEXT: vpinsrb $5, %r8d, %xmm1, %xmm1 +; AVX2-NEXT: shrq $48, %rdi +; AVX2-NEXT: vpinsrb $6, %edi, %xmm1, %xmm1 +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: shrq $56, %rsi +; AVX2-NEXT: vpinsrb $7, %esi, %xmm1, %xmm0 +; AVX2-NEXT: movl %eax, %esi +; AVX2-NEXT: shrl $8, %esi +; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrb $9, %esi, %xmm0, %xmm0 +; AVX2-NEXT: movl %eax, %esi +; AVX2-NEXT: shrl $16, %esi +; AVX2-NEXT: vpinsrb $10, %esi, %xmm0, %xmm0 +; AVX2-NEXT: movl %eax, %esi +; AVX2-NEXT: shrl $24, %esi +; AVX2-NEXT: vpinsrb $11, %esi, %xmm0, %xmm0 +; AVX2-NEXT: movq %rax, %rsi +; AVX2-NEXT: shrq $32, %rsi +; AVX2-NEXT: vpinsrb $12, %esi, %xmm0, %xmm0 +; AVX2-NEXT: movq %rax, %rsi +; AVX2-NEXT: shrq $40, %rsi +; AVX2-NEXT: vpinsrb $13, %esi, %xmm0, %xmm0 +; AVX2-NEXT: movq %rax, %rsi +; AVX2-NEXT: shrq $48, %rsi +; AVX2-NEXT: vpinsrb $14, %esi, %xmm0, %xmm0 +; AVX2-NEXT: shrq $56, %rax +; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec512_v4i128_to_v1i512_factor4: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec512_v4i128_to_v1i512_factor4: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.cast = bitcast <64 x i8> %in.vec to <4 x i128> + %zextd.vec = shufflevector <4 x i128> %in.vec.cast, <4 x i128> zeroinitializer, <4 x i32> + %out.bytevec = bitcast <4 x i128> %zextd.vec to <64 x i8> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} + +define void @vec512_v2i256_to_v1i512_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { +; SSE-LABEL: vec512_v2i256_to_v1i512_factor2: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa 16(%rdi), %xmm1 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: paddb 16(%rsi), %xmm1 +; SSE-NEXT: movaps 32(%rdx), %xmm2 +; SSE-NEXT: movaps 48(%rdx), %xmm3 +; SSE-NEXT: paddb 16(%rdx), %xmm1 +; SSE-NEXT: paddb (%rdx), %xmm0 +; SSE-NEXT: movaps %xmm2, 32(%rcx) +; SSE-NEXT: movaps %xmm3, 48(%rcx) +; SSE-NEXT: movdqa %xmm0, (%rcx) +; SSE-NEXT: 
movdqa %xmm1, 16(%rcx) +; SSE-NEXT: retq +; +; AVX-LABEL: vec512_v2i256_to_v1i512_factor2: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovaps 32(%rdx), %ymm2 +; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: vec512_v2i256_to_v1i512_factor2: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec512_v2i256_to_v1i512_factor2: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec512_v2i256_to_v1i512_factor2: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias + %in.vec.cast = bitcast <64 x i8> %in.vec to <2 x i256> + %zextd.vec = shufflevector <2 x i256> %in.vec.cast, <2 x i256> zeroinitializer, <2 x i32> + %out.bytevec = bitcast <2 x i256> %zextd.vec to <64 x i8> + %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 + %out.vec = add <64 x i8> %out.bytevec, %out.vec.bias + store <64 x i8> %out.vec, ptr %out.vec.ptr, align 64 + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; AVX1-ONLY: {{.*}} +; FALLBACK0: {{.*}} +; FALLBACK1: {{.*}} +; FALLBACK2: {{.*}} +; FALLBACK3: {{.*}} +; FALLBACK4: {{.*}} +; FALLBACK5: {{.*}} +; FALLBACK6: {{.*}} +; FALLBACK7: {{.*}} +; FALLBACK8: {{.*}} -- 2.7.4