From 83f66e2d81f733e4bf956684d761e16d1c9200a4 Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Sun, 15 Jan 2023 16:49:50 +0300 Subject: [PATCH] [NFC][X86] Add subvector splat-store tests https://godbolt.org/z/oGn7KqrKs --- .../X86/subvectorwise-store-of-vector-splat.ll | 6562 ++++++++++++++++++++ 1 file changed, 6562 insertions(+) create mode 100644 llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll diff --git a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll new file mode 100644 index 0000000..10928cd --- /dev/null +++ b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll @@ -0,0 +1,6562 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=-sse2 | FileCheck %s --check-prefixes=ALL,SCALAR +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 | FileCheck %s --check-prefixes=ALL,SSE,SSE2,SSE2-ONLY +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse3 | FileCheck %s --check-prefixes=ALL,SSE,SSE2,SSE3 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+ssse3 | FileCheck %s --check-prefixes=ALL,SSE,SSE2,SSSE3,SSSE3-ONLY +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.1 | FileCheck %s --check-prefixes=ALL,SSE,SSE2,SSSE3,SSE41 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2 | FileCheck %s --check-prefixes=ALL,SSE,SSE2,SSSE3,SSE42 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s --check-prefixes=ALL,SSE,AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,SSE,AVX,AVX2,AVX2-ONLY +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl | FileCheck %s --check-prefixes=ALL,SSE,AVX,AVX2,AVX512,AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=ALL,SSE,AVX,AVX2,AVX512,AVX512BW + 
+define void @vec32_v2i8(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec32_v2i8: +; SCALAR: # %bb.0: +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: subb (%rdi), %cl +; SCALAR-NEXT: subb 1(%rdi), %al +; SCALAR-NEXT: movb %al, 1(%rsi) +; SCALAR-NEXT: movb %cl, (%rsi) +; SCALAR-NEXT: movb %al, 3(%rsi) +; SCALAR-NEXT: movb %cl, 2(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec32_v2i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: psubb (%rdi), %xmm0 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movw %ax, (%rsi) +; SSE2-NEXT: movw %ax, 2(%rsi) +; SSE2-NEXT: retq +; +; AVX-LABEL: vec32_v2i8: +; AVX: # %bb.0: +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpsubb (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: movw %ax, (%rsi) +; AVX-NEXT: movw %ax, 2(%rsi) +; AVX-NEXT: retq + %in.subvec.neg = load <2 x i8>, ptr %in.subvec.ptr, align 64 + %in.subvec = sub <2 x i8> zeroinitializer, %in.subvec.neg + %out.subvec0.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 0 + store <2 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 1 + store <2 x i8> %in.subvec, ptr %out.subvec1.ptr, align 2 + ret void +} + +define void @vec64_v2i8(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec64_v2i8: +; SCALAR: # %bb.0: +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: subb (%rdi), %cl +; SCALAR-NEXT: subb 1(%rdi), %al +; SCALAR-NEXT: movb %al, 1(%rsi) +; SCALAR-NEXT: movb %cl, (%rsi) +; SCALAR-NEXT: movb %al, 3(%rsi) +; SCALAR-NEXT: movb %cl, 2(%rsi) +; SCALAR-NEXT: movb %al, 5(%rsi) +; SCALAR-NEXT: movb %cl, 4(%rsi) +; SCALAR-NEXT: movb %al, 7(%rsi) +; SCALAR-NEXT: movb %cl, 6(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec64_v2i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: psubb (%rdi), %xmm0 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movw %ax, (%rsi) +; 
SSE2-NEXT: movw %ax, 2(%rsi) +; SSE2-NEXT: movw %ax, 4(%rsi) +; SSE2-NEXT: movw %ax, 6(%rsi) +; SSE2-NEXT: retq +; +; AVX-LABEL: vec64_v2i8: +; AVX: # %bb.0: +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpsubb (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: movw %ax, (%rsi) +; AVX-NEXT: movw %ax, 2(%rsi) +; AVX-NEXT: movw %ax, 4(%rsi) +; AVX-NEXT: movw %ax, 6(%rsi) +; AVX-NEXT: retq + %in.subvec.neg = load <2 x i8>, ptr %in.subvec.ptr, align 64 + %in.subvec = sub <2 x i8> zeroinitializer, %in.subvec.neg + %out.subvec0.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 0 + store <2 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 1 + store <2 x i8> %in.subvec, ptr %out.subvec1.ptr, align 2 + %out.subvec2.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 2 + store <2 x i8> %in.subvec, ptr %out.subvec2.ptr, align 4 + %out.subvec3.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 3 + store <2 x i8> %in.subvec, ptr %out.subvec3.ptr, align 2 + ret void +} + +define void @vec64_v2i16(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec64_v2i16: +; SCALAR: # %bb.0: +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: subw (%rdi), %cx +; SCALAR-NEXT: subw 2(%rdi), %ax +; SCALAR-NEXT: movw %ax, 2(%rsi) +; SCALAR-NEXT: movw %cx, (%rsi) +; SCALAR-NEXT: movw %ax, 6(%rsi) +; SCALAR-NEXT: movw %cx, 4(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec64_v2i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: psubw (%rdi), %xmm0 +; SSE2-NEXT: movd %xmm0, (%rsi) +; SSE2-NEXT: movd %xmm0, 4(%rsi) +; SSE2-NEXT: retq +; +; AVX-LABEL: vec64_v2i16: +; AVX: # %bb.0: +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpsubw (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, (%rsi) +; AVX-NEXT: vmovd %xmm0, 4(%rsi) +; AVX-NEXT: retq + %in.subvec.neg = load <2 x i16>, ptr %in.subvec.ptr, align 64 + %in.subvec = sub <2 x i16> zeroinitializer, 
%in.subvec.neg + %out.subvec0.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 0 + store <2 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 1 + store <2 x i16> %in.subvec, ptr %out.subvec1.ptr, align 4 + ret void +} + +define void @vec64_v4i8(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec64_v4i8: +; SCALAR: # %bb.0: +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: subb (%rdi), %cl +; SCALAR-NEXT: xorl %edx, %edx +; SCALAR-NEXT: subb 1(%rdi), %dl +; SCALAR-NEXT: xorl %r8d, %r8d +; SCALAR-NEXT: subb 2(%rdi), %r8b +; SCALAR-NEXT: subb 3(%rdi), %al +; SCALAR-NEXT: movb %al, 3(%rsi) +; SCALAR-NEXT: movb %r8b, 2(%rsi) +; SCALAR-NEXT: movb %dl, 1(%rsi) +; SCALAR-NEXT: movb %cl, (%rsi) +; SCALAR-NEXT: movb %al, 7(%rsi) +; SCALAR-NEXT: movb %r8b, 6(%rsi) +; SCALAR-NEXT: movb %dl, 5(%rsi) +; SCALAR-NEXT: movb %cl, 4(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec64_v4i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: psubb (%rdi), %xmm0 +; SSE2-NEXT: movd %xmm0, (%rsi) +; SSE2-NEXT: movd %xmm0, 4(%rsi) +; SSE2-NEXT: retq +; +; AVX-LABEL: vec64_v4i8: +; AVX: # %bb.0: +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpsubb (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, (%rsi) +; AVX-NEXT: vmovd %xmm0, 4(%rsi) +; AVX-NEXT: retq + %in.subvec.neg = load <4 x i8>, ptr %in.subvec.ptr, align 64 + %in.subvec = sub <4 x i8> zeroinitializer, %in.subvec.neg + %out.subvec0.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 0 + store <4 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 1 + store <4 x i8> %in.subvec, ptr %out.subvec1.ptr, align 4 + ret void +} + +define void @vec128_v2i8(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec128_v2i8: +; SCALAR: # %bb.0: +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: subb (%rdi), 
%cl +; SCALAR-NEXT: subb 1(%rdi), %al +; SCALAR-NEXT: movb %al, 1(%rsi) +; SCALAR-NEXT: movb %cl, (%rsi) +; SCALAR-NEXT: movb %al, 3(%rsi) +; SCALAR-NEXT: movb %cl, 2(%rsi) +; SCALAR-NEXT: movb %al, 5(%rsi) +; SCALAR-NEXT: movb %cl, 4(%rsi) +; SCALAR-NEXT: movb %al, 7(%rsi) +; SCALAR-NEXT: movb %cl, 6(%rsi) +; SCALAR-NEXT: movb %al, 9(%rsi) +; SCALAR-NEXT: movb %cl, 8(%rsi) +; SCALAR-NEXT: movb %al, 11(%rsi) +; SCALAR-NEXT: movb %cl, 10(%rsi) +; SCALAR-NEXT: movb %al, 13(%rsi) +; SCALAR-NEXT: movb %cl, 12(%rsi) +; SCALAR-NEXT: movb %al, 15(%rsi) +; SCALAR-NEXT: movb %cl, 14(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec128_v2i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psubb %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, (%rsi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: vec128_v2i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpsubb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec128_v2i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastw (%rdi), %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsubb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, (%rsi) +; AVX2-NEXT: retq + %in.subvec.neg = load <2 x i8>, ptr %in.subvec.ptr, align 64 + %in.subvec = sub <2 x i8> zeroinitializer, %in.subvec.neg + %out.subvec0.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 0 + store <2 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 1 + store <2 x i8> %in.subvec, ptr %out.subvec1.ptr, align 2 + %out.subvec2.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 2 + store <2 x i8> %in.subvec, ptr %out.subvec2.ptr, align 4 + %out.subvec3.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 3 + store <2 x 
i8> %in.subvec, ptr %out.subvec3.ptr, align 2 + %out.subvec4.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 4 + store <2 x i8> %in.subvec, ptr %out.subvec4.ptr, align 8 + %out.subvec5.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 5 + store <2 x i8> %in.subvec, ptr %out.subvec5.ptr, align 2 + %out.subvec6.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 6 + store <2 x i8> %in.subvec, ptr %out.subvec6.ptr, align 4 + %out.subvec7.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 7 + store <2 x i8> %in.subvec, ptr %out.subvec7.ptr, align 2 + ret void +} + +define void @vec128_v2i16(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec128_v2i16: +; SCALAR: # %bb.0: +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: subw (%rdi), %cx +; SCALAR-NEXT: subw 2(%rdi), %ax +; SCALAR-NEXT: movw %ax, 2(%rsi) +; SCALAR-NEXT: movw %cx, (%rsi) +; SCALAR-NEXT: movw %ax, 6(%rsi) +; SCALAR-NEXT: movw %cx, 4(%rsi) +; SCALAR-NEXT: movw %ax, 10(%rsi) +; SCALAR-NEXT: movw %cx, 8(%rsi) +; SCALAR-NEXT: movw %ax, 14(%rsi) +; SCALAR-NEXT: movw %cx, 12(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec128_v2i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[0,0,0,0] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psubw %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, (%rsi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: vec128_v2i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,0,0,0] +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec128_v2i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd (%rdi), %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsubw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, (%rsi) +; AVX2-NEXT: retq + %in.subvec.neg = load <2 x i16>, ptr %in.subvec.ptr, align 64 + %in.subvec = sub <2 x i16> zeroinitializer, %in.subvec.neg + %out.subvec0.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, 
i64 0 + store <2 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 1 + store <2 x i16> %in.subvec, ptr %out.subvec1.ptr, align 4 + %out.subvec2.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 2 + store <2 x i16> %in.subvec, ptr %out.subvec2.ptr, align 8 + %out.subvec3.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 3 + store <2 x i16> %in.subvec, ptr %out.subvec3.ptr, align 4 + ret void +} + +define void @vec128_v2i32(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec128_v2i32: +; SCALAR: # %bb.0: +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: subl (%rdi), %ecx +; SCALAR-NEXT: subl 4(%rdi), %eax +; SCALAR-NEXT: movl %eax, 4(%rsi) +; SCALAR-NEXT: movl %ecx, (%rsi) +; SCALAR-NEXT: movl %eax, 12(%rsi) +; SCALAR-NEXT: movl %ecx, 8(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec128_v2i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psubd %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, (%rsi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: vec128_v2i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec128_v2i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq (%rdi), %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsubd %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, (%rsi) +; AVX2-NEXT: retq + %in.subvec.neg = load <2 x i32>, ptr %in.subvec.ptr, align 64 + %in.subvec = sub <2 x i32> zeroinitializer, %in.subvec.neg + %out.subvec0.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 0 + store <2 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 1 + store <2 x i32> %in.subvec, ptr %out.subvec1.ptr, 
align 8 + ret void +} + +define void @vec128_v2f32(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec128_v2f32: +; SCALAR: # %bb.0: +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: subl (%rdi), %ecx +; SCALAR-NEXT: subl 4(%rdi), %eax +; SCALAR-NEXT: movl %eax, 4(%rsi) +; SCALAR-NEXT: movl %ecx, (%rsi) +; SCALAR-NEXT: movl %eax, 12(%rsi) +; SCALAR-NEXT: movl %ecx, 8(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec128_v2f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psubd %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, (%rsi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: vec128_v2f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec128_v2f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq (%rdi), %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsubd %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, (%rsi) +; AVX2-NEXT: retq + %in.subvec.neg = load <2 x i32>, ptr %in.subvec.ptr, align 64 + %in.subvec.int = sub <2 x i32> zeroinitializer, %in.subvec.neg + %in.subvec = bitcast <2 x i32> %in.subvec.int to <2 x float> + %out.subvec0.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 0 + store <2 x float> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 1 + store <2 x float> %in.subvec, ptr %out.subvec1.ptr, align 8 + ret void +} + +define void @vec128_v4i8(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec128_v4i8: +; SCALAR: # %bb.0: +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: subb (%rdi), %cl +; SCALAR-NEXT: xorl %edx, %edx +; SCALAR-NEXT: subb 1(%rdi), %dl +; SCALAR-NEXT: xorl %r8d, %r8d +; SCALAR-NEXT: subb 2(%rdi), 
%r8b +; SCALAR-NEXT: subb 3(%rdi), %al +; SCALAR-NEXT: movb %al, 3(%rsi) +; SCALAR-NEXT: movb %r8b, 2(%rsi) +; SCALAR-NEXT: movb %dl, 1(%rsi) +; SCALAR-NEXT: movb %cl, (%rsi) +; SCALAR-NEXT: movb %al, 7(%rsi) +; SCALAR-NEXT: movb %r8b, 6(%rsi) +; SCALAR-NEXT: movb %dl, 5(%rsi) +; SCALAR-NEXT: movb %cl, 4(%rsi) +; SCALAR-NEXT: movb %al, 11(%rsi) +; SCALAR-NEXT: movb %r8b, 10(%rsi) +; SCALAR-NEXT: movb %dl, 9(%rsi) +; SCALAR-NEXT: movb %cl, 8(%rsi) +; SCALAR-NEXT: movb %al, 15(%rsi) +; SCALAR-NEXT: movb %r8b, 14(%rsi) +; SCALAR-NEXT: movb %dl, 13(%rsi) +; SCALAR-NEXT: movb %cl, 12(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec128_v4i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[0,0,0,0] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psubb %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, (%rsi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: vec128_v4i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,0,0,0] +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpsubb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec128_v4i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd (%rdi), %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsubb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, (%rsi) +; AVX2-NEXT: retq + %in.subvec.neg = load <4 x i8>, ptr %in.subvec.ptr, align 64 + %in.subvec = sub <4 x i8> zeroinitializer, %in.subvec.neg + %out.subvec0.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 0 + store <4 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 1 + store <4 x i8> %in.subvec, ptr %out.subvec1.ptr, align 4 + %out.subvec2.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 2 + store <4 x i8> %in.subvec, ptr %out.subvec2.ptr, align 8 + %out.subvec3.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 3 + store <4 x i8> %in.subvec, ptr %out.subvec3.ptr, align 4 + ret void +} + +define void @vec128_v4i16(ptr %in.subvec.ptr, ptr 
%out.vec.ptr) nounwind { +; SCALAR-LABEL: vec128_v4i16: +; SCALAR: # %bb.0: +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: subw (%rdi), %cx +; SCALAR-NEXT: xorl %edx, %edx +; SCALAR-NEXT: subw 2(%rdi), %dx +; SCALAR-NEXT: xorl %r8d, %r8d +; SCALAR-NEXT: subw 4(%rdi), %r8w +; SCALAR-NEXT: subw 6(%rdi), %ax +; SCALAR-NEXT: movw %ax, 6(%rsi) +; SCALAR-NEXT: movw %r8w, 4(%rsi) +; SCALAR-NEXT: movw %dx, 2(%rsi) +; SCALAR-NEXT: movw %cx, (%rsi) +; SCALAR-NEXT: movw %ax, 14(%rsi) +; SCALAR-NEXT: movw %r8w, 12(%rsi) +; SCALAR-NEXT: movw %dx, 10(%rsi) +; SCALAR-NEXT: movw %cx, 8(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec128_v4i16: +; SSE2: # %bb.0: +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psubw %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, (%rsi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: vec128_v4i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec128_v4i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq (%rdi), %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsubw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, (%rsi) +; AVX2-NEXT: retq + %in.subvec.neg = load <4 x i16>, ptr %in.subvec.ptr, align 64 + %in.subvec = sub <4 x i16> zeroinitializer, %in.subvec.neg + %out.subvec0.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 0 + store <4 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 1 + store <4 x i16> %in.subvec, ptr %out.subvec1.ptr, align 8 + ret void +} + +define void @vec128_v8i8(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec128_v8i8: +; SCALAR: # %bb.0: +; SCALAR-NEXT: pushq %rbx +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: 
subb (%rdi), %al +; SCALAR-NEXT: xorl %edx, %edx +; SCALAR-NEXT: subb 1(%rdi), %dl +; SCALAR-NEXT: xorl %r8d, %r8d +; SCALAR-NEXT: subb 2(%rdi), %r8b +; SCALAR-NEXT: xorl %r9d, %r9d +; SCALAR-NEXT: subb 3(%rdi), %r9b +; SCALAR-NEXT: xorl %r10d, %r10d +; SCALAR-NEXT: subb 4(%rdi), %r10b +; SCALAR-NEXT: xorl %r11d, %r11d +; SCALAR-NEXT: subb 5(%rdi), %r11b +; SCALAR-NEXT: xorl %ebx, %ebx +; SCALAR-NEXT: subb 6(%rdi), %bl +; SCALAR-NEXT: subb 7(%rdi), %cl +; SCALAR-NEXT: movb %cl, 7(%rsi) +; SCALAR-NEXT: movb %bl, 6(%rsi) +; SCALAR-NEXT: movb %r11b, 5(%rsi) +; SCALAR-NEXT: movb %r10b, 4(%rsi) +; SCALAR-NEXT: movb %r9b, 3(%rsi) +; SCALAR-NEXT: movb %r8b, 2(%rsi) +; SCALAR-NEXT: movb %dl, 1(%rsi) +; SCALAR-NEXT: movb %al, (%rsi) +; SCALAR-NEXT: movb %cl, 15(%rsi) +; SCALAR-NEXT: movb %bl, 14(%rsi) +; SCALAR-NEXT: movb %r11b, 13(%rsi) +; SCALAR-NEXT: movb %r10b, 12(%rsi) +; SCALAR-NEXT: movb %r9b, 11(%rsi) +; SCALAR-NEXT: movb %r8b, 10(%rsi) +; SCALAR-NEXT: movb %dl, 9(%rsi) +; SCALAR-NEXT: movb %al, 8(%rsi) +; SCALAR-NEXT: popq %rbx +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec128_v8i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psubb %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, (%rsi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: vec128_v8i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpsubb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec128_v8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq (%rdi), %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsubb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, (%rsi) +; AVX2-NEXT: retq + %in.subvec.neg = load <8 x i8>, ptr %in.subvec.ptr, align 64 + %in.subvec = sub <8 x i8> zeroinitializer, %in.subvec.neg + %out.subvec0.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 0 + store <8 x i8> 
%in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 1 + store <8 x i8> %in.subvec, ptr %out.subvec1.ptr, align 8 + ret void +} + +define void @vec256_v2i8(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec256_v2i8: +; SCALAR: # %bb.0: +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: subb (%rdi), %cl +; SCALAR-NEXT: subb 1(%rdi), %al +; SCALAR-NEXT: movb %al, 1(%rsi) +; SCALAR-NEXT: movb %cl, (%rsi) +; SCALAR-NEXT: movb %al, 3(%rsi) +; SCALAR-NEXT: movb %cl, 2(%rsi) +; SCALAR-NEXT: movb %al, 5(%rsi) +; SCALAR-NEXT: movb %cl, 4(%rsi) +; SCALAR-NEXT: movb %al, 7(%rsi) +; SCALAR-NEXT: movb %cl, 6(%rsi) +; SCALAR-NEXT: movb %al, 9(%rsi) +; SCALAR-NEXT: movb %cl, 8(%rsi) +; SCALAR-NEXT: movb %al, 11(%rsi) +; SCALAR-NEXT: movb %cl, 10(%rsi) +; SCALAR-NEXT: movb %al, 13(%rsi) +; SCALAR-NEXT: movb %cl, 12(%rsi) +; SCALAR-NEXT: movb %al, 15(%rsi) +; SCALAR-NEXT: movb %cl, 14(%rsi) +; SCALAR-NEXT: movb %al, 17(%rsi) +; SCALAR-NEXT: movb %cl, 16(%rsi) +; SCALAR-NEXT: movb %al, 19(%rsi) +; SCALAR-NEXT: movb %cl, 18(%rsi) +; SCALAR-NEXT: movb %al, 21(%rsi) +; SCALAR-NEXT: movb %cl, 20(%rsi) +; SCALAR-NEXT: movb %al, 23(%rsi) +; SCALAR-NEXT: movb %cl, 22(%rsi) +; SCALAR-NEXT: movb %al, 25(%rsi) +; SCALAR-NEXT: movb %cl, 24(%rsi) +; SCALAR-NEXT: movb %al, 27(%rsi) +; SCALAR-NEXT: movb %cl, 26(%rsi) +; SCALAR-NEXT: movb %al, 29(%rsi) +; SCALAR-NEXT: movb %cl, 28(%rsi) +; SCALAR-NEXT: movb %al, 31(%rsi) +; SCALAR-NEXT: movb %cl, 30(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec256_v2i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psubb %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, (%rsi) +; SSE2-NEXT: movdqa %xmm1, 16(%rsi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: vec256_v2i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7] +; 
AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpsubb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vmovaps %ymm0, (%rsi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec256_v2i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpsubb (%rdi), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq + %in.subvec.neg = load <2 x i8>, ptr %in.subvec.ptr, align 64 + %in.subvec = sub <2 x i8> zeroinitializer, %in.subvec.neg + %out.subvec0.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 0 + store <2 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 1 + store <2 x i8> %in.subvec, ptr %out.subvec1.ptr, align 2 + %out.subvec2.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 2 + store <2 x i8> %in.subvec, ptr %out.subvec2.ptr, align 4 + %out.subvec3.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 3 + store <2 x i8> %in.subvec, ptr %out.subvec3.ptr, align 2 + %out.subvec4.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 4 + store <2 x i8> %in.subvec, ptr %out.subvec4.ptr, align 8 + %out.subvec5.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 5 + store <2 x i8> %in.subvec, ptr %out.subvec5.ptr, align 2 + %out.subvec6.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 6 + store <2 x i8> %in.subvec, ptr %out.subvec6.ptr, align 4 + %out.subvec7.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 7 + store <2 x i8> %in.subvec, ptr %out.subvec7.ptr, align 2 + %out.subvec8.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 8 + store <2 x i8> %in.subvec, ptr %out.subvec8.ptr, align 16 + %out.subvec9.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 9 + store <2 x i8> %in.subvec, ptr %out.subvec9.ptr, align 2 + %out.subvec10.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 10 + store <2 x 
i8> %in.subvec, ptr %out.subvec10.ptr, align 4 + %out.subvec11.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 11 + store <2 x i8> %in.subvec, ptr %out.subvec11.ptr, align 2 + %out.subvec12.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 12 + store <2 x i8> %in.subvec, ptr %out.subvec12.ptr, align 8 + %out.subvec13.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 13 + store <2 x i8> %in.subvec, ptr %out.subvec13.ptr, align 2 + %out.subvec14.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 14 + store <2 x i8> %in.subvec, ptr %out.subvec14.ptr, align 4 + %out.subvec15.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 15 + store <2 x i8> %in.subvec, ptr %out.subvec15.ptr, align 2 + ret void +} + +define void @vec256_v2i16(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec256_v2i16: +; SCALAR: # %bb.0: +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: subw (%rdi), %cx +; SCALAR-NEXT: subw 2(%rdi), %ax +; SCALAR-NEXT: movw %ax, 2(%rsi) +; SCALAR-NEXT: movw %cx, (%rsi) +; SCALAR-NEXT: movw %ax, 6(%rsi) +; SCALAR-NEXT: movw %cx, 4(%rsi) +; SCALAR-NEXT: movw %ax, 10(%rsi) +; SCALAR-NEXT: movw %cx, 8(%rsi) +; SCALAR-NEXT: movw %ax, 14(%rsi) +; SCALAR-NEXT: movw %cx, 12(%rsi) +; SCALAR-NEXT: movw %ax, 18(%rsi) +; SCALAR-NEXT: movw %cx, 16(%rsi) +; SCALAR-NEXT: movw %ax, 22(%rsi) +; SCALAR-NEXT: movw %cx, 20(%rsi) +; SCALAR-NEXT: movw %ax, 26(%rsi) +; SCALAR-NEXT: movw %cx, 24(%rsi) +; SCALAR-NEXT: movw %ax, 30(%rsi) +; SCALAR-NEXT: movw %cx, 28(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec256_v2i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[0,0,0,0] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psubw %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, (%rsi) +; SSE2-NEXT: movdqa %xmm1, 16(%rsi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: vec256_v2i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,0,0,0] +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm0 +; 
AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vmovaps %ymm0, (%rsi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec256_v2i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpsubw (%rdi), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq + %in.subvec.neg = load <2 x i16>, ptr %in.subvec.ptr, align 64 + %in.subvec = sub <2 x i16> zeroinitializer, %in.subvec.neg + %out.subvec0.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 0 + store <2 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 1 + store <2 x i16> %in.subvec, ptr %out.subvec1.ptr, align 4 + %out.subvec2.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 2 + store <2 x i16> %in.subvec, ptr %out.subvec2.ptr, align 8 + %out.subvec3.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 3 + store <2 x i16> %in.subvec, ptr %out.subvec3.ptr, align 4 + %out.subvec4.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 4 + store <2 x i16> %in.subvec, ptr %out.subvec4.ptr, align 16 + %out.subvec5.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 5 + store <2 x i16> %in.subvec, ptr %out.subvec5.ptr, align 4 + %out.subvec6.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 6 + store <2 x i16> %in.subvec, ptr %out.subvec6.ptr, align 8 + %out.subvec7.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 7 + store <2 x i16> %in.subvec, ptr %out.subvec7.ptr, align 4 + ret void +} + +define void @vec256_v2i32(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec256_v2i32: +; SCALAR: # %bb.0: +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: subl (%rdi), %ecx +; SCALAR-NEXT: subl 4(%rdi), %eax +; SCALAR-NEXT: movl %eax, 4(%rsi) +; SCALAR-NEXT: movl %ecx, (%rsi) +; SCALAR-NEXT: movl %eax, 12(%rsi) +; SCALAR-NEXT: movl %ecx, 8(%rsi) +; SCALAR-NEXT: movl %eax, 20(%rsi) 
+; SCALAR-NEXT: movl %ecx, 16(%rsi) +; SCALAR-NEXT: movl %eax, 28(%rsi) +; SCALAR-NEXT: movl %ecx, 24(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec256_v2i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psubd %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, (%rsi) +; SSE2-NEXT: movdqa %xmm1, 16(%rsi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: vec256_v2i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vmovaps %ymm0, (%rsi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec256_v2i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsubd %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq + %in.subvec.neg = load <2 x i32>, ptr %in.subvec.ptr, align 64 + %in.subvec = sub <2 x i32> zeroinitializer, %in.subvec.neg + %out.subvec0.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 0 + store <2 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 1 + store <2 x i32> %in.subvec, ptr %out.subvec1.ptr, align 8 + %out.subvec2.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 2 + store <2 x i32> %in.subvec, ptr %out.subvec2.ptr, align 16 + %out.subvec3.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 3 + store <2 x i32> %in.subvec, ptr %out.subvec3.ptr, align 8 + ret void +} + +define void @vec256_v2f32(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec256_v2f32: +; SCALAR: # %bb.0: +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: subl (%rdi), %ecx +; SCALAR-NEXT: subl 4(%rdi), %eax +; SCALAR-NEXT: movl %eax, 4(%rsi) 
+; SCALAR-NEXT: movl %ecx, (%rsi) +; SCALAR-NEXT: movl %eax, 12(%rsi) +; SCALAR-NEXT: movl %ecx, 8(%rsi) +; SCALAR-NEXT: movl %eax, 20(%rsi) +; SCALAR-NEXT: movl %ecx, 16(%rsi) +; SCALAR-NEXT: movl %eax, 28(%rsi) +; SCALAR-NEXT: movl %ecx, 24(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec256_v2f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psubd %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, (%rsi) +; SSE2-NEXT: movdqa %xmm1, 16(%rsi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: vec256_v2f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vmovaps %ymm0, (%rsi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec256_v2f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsubd %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq + %in.subvec.neg = load <2 x i32>, ptr %in.subvec.ptr, align 64 + %in.subvec.int = sub <2 x i32> zeroinitializer, %in.subvec.neg + %in.subvec = bitcast <2 x i32> %in.subvec.int to <2 x float> + %out.subvec0.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 0 + store <2 x float> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 1 + store <2 x float> %in.subvec, ptr %out.subvec1.ptr, align 8 + %out.subvec2.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 2 + store <2 x float> %in.subvec, ptr %out.subvec2.ptr, align 16 + %out.subvec3.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 3 + store <2 x float> %in.subvec, ptr %out.subvec3.ptr, align 8 + ret void +} + +define void @vec256_v2i64(ptr %in.subvec.ptr, ptr %out.vec.ptr) 
nounwind { +; SCALAR-LABEL: vec256_v2i64: +; SCALAR: # %bb.0: +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: subq (%rdi), %rcx +; SCALAR-NEXT: subq 8(%rdi), %rax +; SCALAR-NEXT: movq %rax, 8(%rsi) +; SCALAR-NEXT: movq %rcx, (%rsi) +; SCALAR-NEXT: movq %rax, 24(%rsi) +; SCALAR-NEXT: movq %rcx, 16(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec256_v2i64: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: psubq (%rdi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, (%rsi) +; SSE2-NEXT: movdqa %xmm0, 16(%rsi) +; SSE2-NEXT: retq +; +; AVX-LABEL: vec256_v2i64: +; AVX: # %bb.0: +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpsubq (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rsi) +; AVX-NEXT: vmovdqa %xmm0, 16(%rsi) +; AVX-NEXT: retq + %in.subvec.neg = load <2 x i64>, ptr %in.subvec.ptr, align 64 + %in.subvec = sub <2 x i64> zeroinitializer, %in.subvec.neg + %out.subvec0.ptr = getelementptr <2 x i64>, ptr %out.vec.ptr, i64 0 + store <2 x i64> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <2 x i64>, ptr %out.vec.ptr, i64 1 + store <2 x i64> %in.subvec, ptr %out.subvec1.ptr, align 16 + ret void +} + +define void @vec256_v2f64(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec256_v2f64: +; SCALAR: # %bb.0: +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: subq (%rdi), %rcx +; SCALAR-NEXT: subq 8(%rdi), %rax +; SCALAR-NEXT: movq %rax, 8(%rsi) +; SCALAR-NEXT: movq %rcx, (%rsi) +; SCALAR-NEXT: movq %rax, 24(%rsi) +; SCALAR-NEXT: movq %rcx, 16(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec256_v2f64: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: psubq (%rdi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, (%rsi) +; SSE2-NEXT: movdqa %xmm0, 16(%rsi) +; SSE2-NEXT: retq +; +; AVX-LABEL: vec256_v2f64: +; AVX: # %bb.0: +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpsubq (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rsi) +; AVX-NEXT: vmovdqa 
%xmm0, 16(%rsi) +; AVX-NEXT: retq + %in.subvec.neg = load <2 x i64>, ptr %in.subvec.ptr, align 64 + %in.subvec.int = sub <2 x i64> zeroinitializer, %in.subvec.neg + %in.subvec = bitcast <2 x i64> %in.subvec.int to <2 x double> + %out.subvec0.ptr = getelementptr <2 x double>, ptr %out.vec.ptr, i64 0 + store <2 x double> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <2 x double>, ptr %out.vec.ptr, i64 1 + store <2 x double> %in.subvec, ptr %out.subvec1.ptr, align 16 + ret void +} + +define void @vec256_v4i8(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec256_v4i8: +; SCALAR: # %bb.0: +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: subb (%rdi), %al +; SCALAR-NEXT: xorl %edx, %edx +; SCALAR-NEXT: subb 1(%rdi), %dl +; SCALAR-NEXT: xorl %r8d, %r8d +; SCALAR-NEXT: subb 2(%rdi), %r8b +; SCALAR-NEXT: subb 3(%rdi), %cl +; SCALAR-NEXT: movb %cl, 3(%rsi) +; SCALAR-NEXT: movb %r8b, 2(%rsi) +; SCALAR-NEXT: movb %dl, 1(%rsi) +; SCALAR-NEXT: movb %al, (%rsi) +; SCALAR-NEXT: movb %cl, 7(%rsi) +; SCALAR-NEXT: movb %r8b, 6(%rsi) +; SCALAR-NEXT: movb %dl, 5(%rsi) +; SCALAR-NEXT: movb %al, 4(%rsi) +; SCALAR-NEXT: movb %cl, 11(%rsi) +; SCALAR-NEXT: movb %r8b, 10(%rsi) +; SCALAR-NEXT: movb %dl, 9(%rsi) +; SCALAR-NEXT: movb %al, 8(%rsi) +; SCALAR-NEXT: movb %cl, 15(%rsi) +; SCALAR-NEXT: movb %r8b, 14(%rsi) +; SCALAR-NEXT: movb %dl, 13(%rsi) +; SCALAR-NEXT: movb %al, 12(%rsi) +; SCALAR-NEXT: movb %cl, 19(%rsi) +; SCALAR-NEXT: movb %r8b, 18(%rsi) +; SCALAR-NEXT: movb %dl, 17(%rsi) +; SCALAR-NEXT: movb %al, 16(%rsi) +; SCALAR-NEXT: movb %cl, 23(%rsi) +; SCALAR-NEXT: movb %r8b, 22(%rsi) +; SCALAR-NEXT: movb %dl, 21(%rsi) +; SCALAR-NEXT: movb %al, 20(%rsi) +; SCALAR-NEXT: movb %cl, 27(%rsi) +; SCALAR-NEXT: movb %r8b, 26(%rsi) +; SCALAR-NEXT: movb %dl, 25(%rsi) +; SCALAR-NEXT: movb %al, 24(%rsi) +; SCALAR-NEXT: movb %cl, 31(%rsi) +; SCALAR-NEXT: movb %r8b, 30(%rsi) +; SCALAR-NEXT: movb %dl, 29(%rsi) +; 
SCALAR-NEXT: movb %al, 28(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec256_v4i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[0,0,0,0] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psubb %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, (%rsi) +; SSE2-NEXT: movdqa %xmm1, 16(%rsi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: vec256_v4i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,0,0,0] +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpsubb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vmovaps %ymm0, (%rsi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec256_v4i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpsubb (%rdi), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq + %in.subvec.neg = load <4 x i8>, ptr %in.subvec.ptr, align 64 + %in.subvec = sub <4 x i8> zeroinitializer, %in.subvec.neg + %out.subvec0.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 0 + store <4 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 1 + store <4 x i8> %in.subvec, ptr %out.subvec1.ptr, align 4 + %out.subvec2.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 2 + store <4 x i8> %in.subvec, ptr %out.subvec2.ptr, align 8 + %out.subvec3.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 3 + store <4 x i8> %in.subvec, ptr %out.subvec3.ptr, align 4 + %out.subvec4.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 4 + store <4 x i8> %in.subvec, ptr %out.subvec4.ptr, align 16 + %out.subvec5.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 5 + store <4 x i8> %in.subvec, ptr %out.subvec5.ptr, align 4 + %out.subvec6.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 6 + store <4 x i8> %in.subvec, ptr %out.subvec6.ptr, align 8 + %out.subvec7.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 7 + store <4 x i8> %in.subvec, ptr 
%out.subvec7.ptr, align 4 + ret void +} + +define void @vec256_v4i16(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec256_v4i16: +; SCALAR: # %bb.0: +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: subw (%rdi), %cx +; SCALAR-NEXT: xorl %edx, %edx +; SCALAR-NEXT: subw 2(%rdi), %dx +; SCALAR-NEXT: xorl %r8d, %r8d +; SCALAR-NEXT: subw 4(%rdi), %r8w +; SCALAR-NEXT: subw 6(%rdi), %ax +; SCALAR-NEXT: movw %ax, 6(%rsi) +; SCALAR-NEXT: movw %r8w, 4(%rsi) +; SCALAR-NEXT: movw %dx, 2(%rsi) +; SCALAR-NEXT: movw %cx, (%rsi) +; SCALAR-NEXT: movw %ax, 14(%rsi) +; SCALAR-NEXT: movw %r8w, 12(%rsi) +; SCALAR-NEXT: movw %dx, 10(%rsi) +; SCALAR-NEXT: movw %cx, 8(%rsi) +; SCALAR-NEXT: movw %ax, 22(%rsi) +; SCALAR-NEXT: movw %r8w, 20(%rsi) +; SCALAR-NEXT: movw %dx, 18(%rsi) +; SCALAR-NEXT: movw %cx, 16(%rsi) +; SCALAR-NEXT: movw %ax, 30(%rsi) +; SCALAR-NEXT: movw %r8w, 28(%rsi) +; SCALAR-NEXT: movw %dx, 26(%rsi) +; SCALAR-NEXT: movw %cx, 24(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec256_v4i16: +; SSE2: # %bb.0: +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psubw %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, (%rsi) +; SSE2-NEXT: movdqa %xmm1, 16(%rsi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: vec256_v4i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vmovaps %ymm0, (%rsi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec256_v4i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsubw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq + %in.subvec.neg = load <4 x i16>, ptr %in.subvec.ptr, align 64 + 
%in.subvec = sub <4 x i16> zeroinitializer, %in.subvec.neg + %out.subvec0.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 0 + store <4 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 1 + store <4 x i16> %in.subvec, ptr %out.subvec1.ptr, align 8 + %out.subvec2.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 2 + store <4 x i16> %in.subvec, ptr %out.subvec2.ptr, align 16 + %out.subvec3.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 3 + store <4 x i16> %in.subvec, ptr %out.subvec3.ptr, align 8 + ret void +} + +define void @vec256_v4i32(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec256_v4i32: +; SCALAR: # %bb.0: +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: subl (%rdi), %ecx +; SCALAR-NEXT: xorl %edx, %edx +; SCALAR-NEXT: subl 4(%rdi), %edx +; SCALAR-NEXT: xorl %r8d, %r8d +; SCALAR-NEXT: subl 8(%rdi), %r8d +; SCALAR-NEXT: subl 12(%rdi), %eax +; SCALAR-NEXT: movl %eax, 12(%rsi) +; SCALAR-NEXT: movl %r8d, 8(%rsi) +; SCALAR-NEXT: movl %edx, 4(%rsi) +; SCALAR-NEXT: movl %ecx, (%rsi) +; SCALAR-NEXT: movl %eax, 28(%rsi) +; SCALAR-NEXT: movl %r8d, 24(%rsi) +; SCALAR-NEXT: movl %edx, 20(%rsi) +; SCALAR-NEXT: movl %ecx, 16(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec256_v4i32: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: psubd (%rdi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, (%rsi) +; SSE2-NEXT: movdqa %xmm0, 16(%rsi) +; SSE2-NEXT: retq +; +; AVX-LABEL: vec256_v4i32: +; AVX: # %bb.0: +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpsubd (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rsi) +; AVX-NEXT: vmovdqa %xmm0, 16(%rsi) +; AVX-NEXT: retq + %in.subvec.neg = load <4 x i32>, ptr %in.subvec.ptr, align 64 + %in.subvec = sub <4 x i32> zeroinitializer, %in.subvec.neg + %out.subvec0.ptr = getelementptr <4 x i32>, ptr %out.vec.ptr, i64 0 + store <4 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = 
getelementptr <4 x i32>, ptr %out.vec.ptr, i64 1 + store <4 x i32> %in.subvec, ptr %out.subvec1.ptr, align 16 + ret void +} + +define void @vec256_v4f32(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec256_v4f32: +; SCALAR: # %bb.0: +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: subl (%rdi), %ecx +; SCALAR-NEXT: xorl %edx, %edx +; SCALAR-NEXT: subl 4(%rdi), %edx +; SCALAR-NEXT: xorl %r8d, %r8d +; SCALAR-NEXT: subl 8(%rdi), %r8d +; SCALAR-NEXT: subl 12(%rdi), %eax +; SCALAR-NEXT: movl %eax, 12(%rsi) +; SCALAR-NEXT: movl %r8d, 8(%rsi) +; SCALAR-NEXT: movl %edx, 4(%rsi) +; SCALAR-NEXT: movl %ecx, (%rsi) +; SCALAR-NEXT: movl %eax, 28(%rsi) +; SCALAR-NEXT: movl %r8d, 24(%rsi) +; SCALAR-NEXT: movl %edx, 20(%rsi) +; SCALAR-NEXT: movl %ecx, 16(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec256_v4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: psubd (%rdi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, (%rsi) +; SSE2-NEXT: movdqa %xmm0, 16(%rsi) +; SSE2-NEXT: retq +; +; AVX-LABEL: vec256_v4f32: +; AVX: # %bb.0: +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpsubd (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rsi) +; AVX-NEXT: vmovdqa %xmm0, 16(%rsi) +; AVX-NEXT: retq + %in.subvec.neg = load <4 x i32>, ptr %in.subvec.ptr, align 64 + %in.subvec.int = sub <4 x i32> zeroinitializer, %in.subvec.neg + %in.subvec = bitcast <4 x i32> %in.subvec.int to <4 x float> + %out.subvec0.ptr = getelementptr <4 x float>, ptr %out.vec.ptr, i64 0 + store <4 x float> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <4 x float>, ptr %out.vec.ptr, i64 1 + store <4 x float> %in.subvec, ptr %out.subvec1.ptr, align 16 + ret void +} + +define void @vec256_v8i8(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec256_v8i8: +; SCALAR: # %bb.0: +; SCALAR-NEXT: pushq %rbx +; SCALAR-NEXT: xorl %edx, %edx +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: subb (%rdi), %al +; SCALAR-NEXT: xorl 
%ecx, %ecx +; SCALAR-NEXT: subb 1(%rdi), %cl +; SCALAR-NEXT: xorl %r8d, %r8d +; SCALAR-NEXT: subb 2(%rdi), %r8b +; SCALAR-NEXT: xorl %r9d, %r9d +; SCALAR-NEXT: subb 3(%rdi), %r9b +; SCALAR-NEXT: xorl %r10d, %r10d +; SCALAR-NEXT: subb 4(%rdi), %r10b +; SCALAR-NEXT: xorl %r11d, %r11d +; SCALAR-NEXT: subb 5(%rdi), %r11b +; SCALAR-NEXT: xorl %ebx, %ebx +; SCALAR-NEXT: subb 6(%rdi), %bl +; SCALAR-NEXT: subb 7(%rdi), %dl +; SCALAR-NEXT: movb %dl, 7(%rsi) +; SCALAR-NEXT: movb %bl, 6(%rsi) +; SCALAR-NEXT: movb %r11b, 5(%rsi) +; SCALAR-NEXT: movb %r10b, 4(%rsi) +; SCALAR-NEXT: movb %r9b, 3(%rsi) +; SCALAR-NEXT: movb %r8b, 2(%rsi) +; SCALAR-NEXT: movb %cl, 1(%rsi) +; SCALAR-NEXT: movb %al, (%rsi) +; SCALAR-NEXT: movb %dl, 15(%rsi) +; SCALAR-NEXT: movb %bl, 14(%rsi) +; SCALAR-NEXT: movb %r11b, 13(%rsi) +; SCALAR-NEXT: movb %r10b, 12(%rsi) +; SCALAR-NEXT: movb %r9b, 11(%rsi) +; SCALAR-NEXT: movb %r8b, 10(%rsi) +; SCALAR-NEXT: movb %cl, 9(%rsi) +; SCALAR-NEXT: movb %al, 8(%rsi) +; SCALAR-NEXT: movb %dl, 23(%rsi) +; SCALAR-NEXT: movb %bl, 22(%rsi) +; SCALAR-NEXT: movb %r11b, 21(%rsi) +; SCALAR-NEXT: movb %r10b, 20(%rsi) +; SCALAR-NEXT: movb %r9b, 19(%rsi) +; SCALAR-NEXT: movb %r8b, 18(%rsi) +; SCALAR-NEXT: movb %cl, 17(%rsi) +; SCALAR-NEXT: movb %al, 16(%rsi) +; SCALAR-NEXT: movb %dl, 31(%rsi) +; SCALAR-NEXT: movb %bl, 30(%rsi) +; SCALAR-NEXT: movb %r11b, 29(%rsi) +; SCALAR-NEXT: movb %r10b, 28(%rsi) +; SCALAR-NEXT: movb %r9b, 27(%rsi) +; SCALAR-NEXT: movb %r8b, 26(%rsi) +; SCALAR-NEXT: movb %cl, 25(%rsi) +; SCALAR-NEXT: movb %al, 24(%rsi) +; SCALAR-NEXT: popq %rbx +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec256_v8i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psubb %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, (%rsi) +; SSE2-NEXT: movdqa %xmm1, 16(%rsi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: vec256_v8i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] 
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpsubb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vmovaps %ymm0, (%rsi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec256_v8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsubb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq + %in.subvec.neg = load <8 x i8>, ptr %in.subvec.ptr, align 64 + %in.subvec = sub <8 x i8> zeroinitializer, %in.subvec.neg + %out.subvec0.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 0 + store <8 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 1 + store <8 x i8> %in.subvec, ptr %out.subvec1.ptr, align 8 + %out.subvec2.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 2 + store <8 x i8> %in.subvec, ptr %out.subvec2.ptr, align 16 + %out.subvec3.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 3 + store <8 x i8> %in.subvec, ptr %out.subvec3.ptr, align 8 + ret void +} + +define void @vec256_v8i16(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec256_v8i16: +; SCALAR: # %bb.0: +; SCALAR-NEXT: pushq %rbx +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: subw (%rdi), %ax +; SCALAR-NEXT: xorl %edx, %edx +; SCALAR-NEXT: subw 2(%rdi), %dx +; SCALAR-NEXT: xorl %r8d, %r8d +; SCALAR-NEXT: subw 4(%rdi), %r8w +; SCALAR-NEXT: xorl %r9d, %r9d +; SCALAR-NEXT: subw 6(%rdi), %r9w +; SCALAR-NEXT: xorl %r10d, %r10d +; SCALAR-NEXT: subw 8(%rdi), %r10w +; SCALAR-NEXT: xorl %r11d, %r11d +; SCALAR-NEXT: subw 10(%rdi), %r11w +; SCALAR-NEXT: xorl %ebx, %ebx +; SCALAR-NEXT: subw 12(%rdi), %bx +; SCALAR-NEXT: subw 14(%rdi), %cx +; SCALAR-NEXT: movw %cx, 14(%rsi) +; SCALAR-NEXT: movw %bx, 12(%rsi) +; SCALAR-NEXT: movw %r11w, 10(%rsi) +; SCALAR-NEXT: movw %r10w, 8(%rsi) 
+; SCALAR-NEXT: movw %r9w, 6(%rsi) +; SCALAR-NEXT: movw %r8w, 4(%rsi) +; SCALAR-NEXT: movw %dx, 2(%rsi) +; SCALAR-NEXT: movw %ax, (%rsi) +; SCALAR-NEXT: movw %cx, 30(%rsi) +; SCALAR-NEXT: movw %bx, 28(%rsi) +; SCALAR-NEXT: movw %r11w, 26(%rsi) +; SCALAR-NEXT: movw %r10w, 24(%rsi) +; SCALAR-NEXT: movw %r9w, 22(%rsi) +; SCALAR-NEXT: movw %r8w, 20(%rsi) +; SCALAR-NEXT: movw %dx, 18(%rsi) +; SCALAR-NEXT: movw %ax, 16(%rsi) +; SCALAR-NEXT: popq %rbx +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec256_v8i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: psubw (%rdi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, (%rsi) +; SSE2-NEXT: movdqa %xmm0, 16(%rsi) +; SSE2-NEXT: retq +; +; AVX-LABEL: vec256_v8i16: +; AVX: # %bb.0: +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpsubw (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rsi) +; AVX-NEXT: vmovdqa %xmm0, 16(%rsi) +; AVX-NEXT: retq + %in.subvec.neg = load <8 x i16>, ptr %in.subvec.ptr, align 64 + %in.subvec = sub <8 x i16> zeroinitializer, %in.subvec.neg + %out.subvec0.ptr = getelementptr <8 x i16>, ptr %out.vec.ptr, i64 0 + store <8 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <8 x i16>, ptr %out.vec.ptr, i64 1 + store <8 x i16> %in.subvec, ptr %out.subvec1.ptr, align 16 + ret void +} + +define void @vec256_v16i8(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec256_v16i8: +; SCALAR: # %bb.0: +; SCALAR-NEXT: pushq %rbp +; SCALAR-NEXT: pushq %r15 +; SCALAR-NEXT: pushq %r14 +; SCALAR-NEXT: pushq %r13 +; SCALAR-NEXT: pushq %r12 +; SCALAR-NEXT: pushq %rbx +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: subb (%rdi), %al +; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: subb 1(%rdi), %al +; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: subb 2(%rdi), %al +; SCALAR-NEXT: movl %eax, %r8d +; SCALAR-NEXT: movl %eax, 
{{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: subb 3(%rdi), %al +; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: subb 4(%rdi), %al +; SCALAR-NEXT: movl %eax, %r9d +; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SCALAR-NEXT: xorl %r10d, %r10d +; SCALAR-NEXT: subb 5(%rdi), %r10b +; SCALAR-NEXT: xorl %r11d, %r11d +; SCALAR-NEXT: subb 6(%rdi), %r11b +; SCALAR-NEXT: xorl %ebx, %ebx +; SCALAR-NEXT: subb 7(%rdi), %bl +; SCALAR-NEXT: xorl %ebp, %ebp +; SCALAR-NEXT: subb 8(%rdi), %bpl +; SCALAR-NEXT: xorl %r14d, %r14d +; SCALAR-NEXT: subb 9(%rdi), %r14b +; SCALAR-NEXT: xorl %r15d, %r15d +; SCALAR-NEXT: subb 10(%rdi), %r15b +; SCALAR-NEXT: xorl %r12d, %r12d +; SCALAR-NEXT: subb 11(%rdi), %r12b +; SCALAR-NEXT: xorl %r13d, %r13d +; SCALAR-NEXT: subb 12(%rdi), %r13b +; SCALAR-NEXT: xorl %edx, %edx +; SCALAR-NEXT: subb 13(%rdi), %dl +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: subb 14(%rdi), %cl +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: subb 15(%rdi), %al +; SCALAR-NEXT: movb %al, 15(%rsi) +; SCALAR-NEXT: movb %cl, 14(%rsi) +; SCALAR-NEXT: movb %dl, 13(%rsi) +; SCALAR-NEXT: movb %r13b, 12(%rsi) +; SCALAR-NEXT: movb %r12b, 11(%rsi) +; SCALAR-NEXT: movb %r15b, 10(%rsi) +; SCALAR-NEXT: movb %r14b, 9(%rsi) +; SCALAR-NEXT: movb %bpl, 8(%rsi) +; SCALAR-NEXT: movb %bl, 7(%rsi) +; SCALAR-NEXT: movb %r11b, 6(%rsi) +; SCALAR-NEXT: movb %r10b, 5(%rsi) +; SCALAR-NEXT: movb %r9b, 4(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Reload +; SCALAR-NEXT: movb %r9b, 3(%rsi) +; SCALAR-NEXT: movb %r8b, 2(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Reload +; SCALAR-NEXT: movb %r8b, 1(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; SCALAR-NEXT: movb %dil, (%rsi) +; SCALAR-NEXT: movb %al, 31(%rsi) +; SCALAR-NEXT: movb %cl, 30(%rsi) +; SCALAR-NEXT: movb %dl, 29(%rsi) +; SCALAR-NEXT: movb %r13b, 
28(%rsi) +; SCALAR-NEXT: movb %r12b, 27(%rsi) +; SCALAR-NEXT: movb %r15b, 26(%rsi) +; SCALAR-NEXT: movb %r14b, 25(%rsi) +; SCALAR-NEXT: movb %bpl, 24(%rsi) +; SCALAR-NEXT: movb %bl, 23(%rsi) +; SCALAR-NEXT: movb %r11b, 22(%rsi) +; SCALAR-NEXT: movb %r10b, 21(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; SCALAR-NEXT: movb %al, 20(%rsi) +; SCALAR-NEXT: movb %r9b, 19(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; SCALAR-NEXT: movb %al, 18(%rsi) +; SCALAR-NEXT: movb %r8b, 17(%rsi) +; SCALAR-NEXT: movb %dil, 16(%rsi) +; SCALAR-NEXT: popq %rbx +; SCALAR-NEXT: popq %r12 +; SCALAR-NEXT: popq %r13 +; SCALAR-NEXT: popq %r14 +; SCALAR-NEXT: popq %r15 +; SCALAR-NEXT: popq %rbp +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec256_v16i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: psubb (%rdi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, (%rsi) +; SSE2-NEXT: movdqa %xmm0, 16(%rsi) +; SSE2-NEXT: retq +; +; AVX-LABEL: vec256_v16i8: +; AVX: # %bb.0: +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpsubb (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rsi) +; AVX-NEXT: vmovdqa %xmm0, 16(%rsi) +; AVX-NEXT: retq + %in.subvec.neg = load <16 x i8>, ptr %in.subvec.ptr, align 64 + %in.subvec = sub <16 x i8> zeroinitializer, %in.subvec.neg + %out.subvec0.ptr = getelementptr <16 x i8>, ptr %out.vec.ptr, i64 0 + store <16 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <16 x i8>, ptr %out.vec.ptr, i64 1 + store <16 x i8> %in.subvec, ptr %out.subvec1.ptr, align 16 + ret void +} + +define void @vec384_v2i8(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec384_v2i8: +; SCALAR: # %bb.0: +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: subb (%rdi), %cl +; SCALAR-NEXT: subb 1(%rdi), %al +; SCALAR-NEXT: movb %al, 1(%rsi) +; SCALAR-NEXT: movb %cl, (%rsi) +; SCALAR-NEXT: movb %al, 3(%rsi) +; SCALAR-NEXT: movb %cl, 2(%rsi) +; SCALAR-NEXT: movb 
%al, 5(%rsi) +; SCALAR-NEXT: movb %cl, 4(%rsi) +; SCALAR-NEXT: movb %al, 7(%rsi) +; SCALAR-NEXT: movb %cl, 6(%rsi) +; SCALAR-NEXT: movb %al, 9(%rsi) +; SCALAR-NEXT: movb %cl, 8(%rsi) +; SCALAR-NEXT: movb %al, 11(%rsi) +; SCALAR-NEXT: movb %cl, 10(%rsi) +; SCALAR-NEXT: movb %al, 13(%rsi) +; SCALAR-NEXT: movb %cl, 12(%rsi) +; SCALAR-NEXT: movb %al, 15(%rsi) +; SCALAR-NEXT: movb %cl, 14(%rsi) +; SCALAR-NEXT: movb %al, 17(%rsi) +; SCALAR-NEXT: movb %cl, 16(%rsi) +; SCALAR-NEXT: movb %al, 19(%rsi) +; SCALAR-NEXT: movb %cl, 18(%rsi) +; SCALAR-NEXT: movb %al, 21(%rsi) +; SCALAR-NEXT: movb %cl, 20(%rsi) +; SCALAR-NEXT: movb %al, 23(%rsi) +; SCALAR-NEXT: movb %cl, 22(%rsi) +; SCALAR-NEXT: movb %al, 25(%rsi) +; SCALAR-NEXT: movb %cl, 24(%rsi) +; SCALAR-NEXT: movb %al, 27(%rsi) +; SCALAR-NEXT: movb %cl, 26(%rsi) +; SCALAR-NEXT: movb %al, 29(%rsi) +; SCALAR-NEXT: movb %cl, 28(%rsi) +; SCALAR-NEXT: movb %al, 31(%rsi) +; SCALAR-NEXT: movb %cl, 30(%rsi) +; SCALAR-NEXT: movb %al, 33(%rsi) +; SCALAR-NEXT: movb %cl, 32(%rsi) +; SCALAR-NEXT: movb %al, 35(%rsi) +; SCALAR-NEXT: movb %cl, 34(%rsi) +; SCALAR-NEXT: movb %al, 37(%rsi) +; SCALAR-NEXT: movb %cl, 36(%rsi) +; SCALAR-NEXT: movb %al, 39(%rsi) +; SCALAR-NEXT: movb %cl, 38(%rsi) +; SCALAR-NEXT: movb %al, 41(%rsi) +; SCALAR-NEXT: movb %cl, 40(%rsi) +; SCALAR-NEXT: movb %al, 43(%rsi) +; SCALAR-NEXT: movb %cl, 42(%rsi) +; SCALAR-NEXT: movb %al, 45(%rsi) +; SCALAR-NEXT: movb %cl, 44(%rsi) +; SCALAR-NEXT: movb %al, 47(%rsi) +; SCALAR-NEXT: movb %cl, 46(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec384_v2i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psubb %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, (%rsi) +; SSE2-NEXT: movdqa %xmm1, 16(%rsi) +; SSE2-NEXT: movdqa %xmm1, 32(%rsi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: vec384_v2i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7] +; 
AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpsubb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 +; AVX1-NEXT: vmovaps %ymm1, (%rsi) +; AVX1-NEXT: vmovdqa %xmm0, 32(%rsi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec384_v2i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpsubb (%rdi), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-NEXT: vmovdqa %xmm0, 32(%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq + %in.subvec.neg = load <2 x i8>, ptr %in.subvec.ptr, align 64 + %in.subvec = sub <2 x i8> zeroinitializer, %in.subvec.neg + %out.subvec0.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 0 + store <2 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 1 + store <2 x i8> %in.subvec, ptr %out.subvec1.ptr, align 2 + %out.subvec2.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 2 + store <2 x i8> %in.subvec, ptr %out.subvec2.ptr, align 4 + %out.subvec3.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 3 + store <2 x i8> %in.subvec, ptr %out.subvec3.ptr, align 2 + %out.subvec4.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 4 + store <2 x i8> %in.subvec, ptr %out.subvec4.ptr, align 8 + %out.subvec5.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 5 + store <2 x i8> %in.subvec, ptr %out.subvec5.ptr, align 2 + %out.subvec6.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 6 + store <2 x i8> %in.subvec, ptr %out.subvec6.ptr, align 4 + %out.subvec7.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 7 + store <2 x i8> %in.subvec, ptr %out.subvec7.ptr, align 2 + %out.subvec8.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 8 + store <2 x i8> %in.subvec, ptr %out.subvec8.ptr, align 16 + %out.subvec9.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 9 + store <2 x i8> %in.subvec, ptr %out.subvec9.ptr, align 2 + 
%out.subvec10.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 10 + store <2 x i8> %in.subvec, ptr %out.subvec10.ptr, align 4 + %out.subvec11.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 11 + store <2 x i8> %in.subvec, ptr %out.subvec11.ptr, align 2 + %out.subvec12.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 12 + store <2 x i8> %in.subvec, ptr %out.subvec12.ptr, align 8 + %out.subvec13.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 13 + store <2 x i8> %in.subvec, ptr %out.subvec13.ptr, align 2 + %out.subvec14.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 14 + store <2 x i8> %in.subvec, ptr %out.subvec14.ptr, align 4 + %out.subvec15.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 15 + store <2 x i8> %in.subvec, ptr %out.subvec15.ptr, align 2 + %out.subvec16.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 16 + store <2 x i8> %in.subvec, ptr %out.subvec16.ptr, align 32 + %out.subvec17.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 17 + store <2 x i8> %in.subvec, ptr %out.subvec17.ptr, align 2 + %out.subvec18.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 18 + store <2 x i8> %in.subvec, ptr %out.subvec18.ptr, align 4 + %out.subvec19.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 19 + store <2 x i8> %in.subvec, ptr %out.subvec19.ptr, align 2 + %out.subvec20.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 20 + store <2 x i8> %in.subvec, ptr %out.subvec20.ptr, align 8 + %out.subvec21.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 21 + store <2 x i8> %in.subvec, ptr %out.subvec21.ptr, align 2 + %out.subvec22.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 22 + store <2 x i8> %in.subvec, ptr %out.subvec22.ptr, align 4 + %out.subvec23.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 23 + store <2 x i8> %in.subvec, ptr %out.subvec23.ptr, align 2 + ret void +} + +define void @vec384_v2i16(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec384_v2i16: +; SCALAR: # %bb.0: +; SCALAR-NEXT: xorl 
%eax, %eax +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: subw (%rdi), %cx +; SCALAR-NEXT: subw 2(%rdi), %ax +; SCALAR-NEXT: movw %ax, 2(%rsi) +; SCALAR-NEXT: movw %cx, (%rsi) +; SCALAR-NEXT: movw %ax, 6(%rsi) +; SCALAR-NEXT: movw %cx, 4(%rsi) +; SCALAR-NEXT: movw %ax, 10(%rsi) +; SCALAR-NEXT: movw %cx, 8(%rsi) +; SCALAR-NEXT: movw %ax, 14(%rsi) +; SCALAR-NEXT: movw %cx, 12(%rsi) +; SCALAR-NEXT: movw %ax, 18(%rsi) +; SCALAR-NEXT: movw %cx, 16(%rsi) +; SCALAR-NEXT: movw %ax, 22(%rsi) +; SCALAR-NEXT: movw %cx, 20(%rsi) +; SCALAR-NEXT: movw %ax, 26(%rsi) +; SCALAR-NEXT: movw %cx, 24(%rsi) +; SCALAR-NEXT: movw %ax, 30(%rsi) +; SCALAR-NEXT: movw %cx, 28(%rsi) +; SCALAR-NEXT: movw %ax, 34(%rsi) +; SCALAR-NEXT: movw %cx, 32(%rsi) +; SCALAR-NEXT: movw %ax, 38(%rsi) +; SCALAR-NEXT: movw %cx, 36(%rsi) +; SCALAR-NEXT: movw %ax, 42(%rsi) +; SCALAR-NEXT: movw %cx, 40(%rsi) +; SCALAR-NEXT: movw %ax, 46(%rsi) +; SCALAR-NEXT: movw %cx, 44(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec384_v2i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[0,0,0,0] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psubw %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, (%rsi) +; SSE2-NEXT: movdqa %xmm1, 16(%rsi) +; SSE2-NEXT: movdqa %xmm1, 32(%rsi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: vec384_v2i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,0,0,0] +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 +; AVX1-NEXT: vmovaps %ymm1, (%rsi) +; AVX1-NEXT: vmovdqa %xmm0, 32(%rsi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec384_v2i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpsubw (%rdi), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-NEXT: vmovdqa %xmm0, 32(%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq + %in.subvec.neg = load <2 x i16>, ptr %in.subvec.ptr, align 64 + %in.subvec = sub <2 x i16> 
zeroinitializer, %in.subvec.neg + %out.subvec0.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 0 + store <2 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 1 + store <2 x i16> %in.subvec, ptr %out.subvec1.ptr, align 4 + %out.subvec2.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 2 + store <2 x i16> %in.subvec, ptr %out.subvec2.ptr, align 8 + %out.subvec3.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 3 + store <2 x i16> %in.subvec, ptr %out.subvec3.ptr, align 4 + %out.subvec4.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 4 + store <2 x i16> %in.subvec, ptr %out.subvec4.ptr, align 16 + %out.subvec5.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 5 + store <2 x i16> %in.subvec, ptr %out.subvec5.ptr, align 4 + %out.subvec6.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 6 + store <2 x i16> %in.subvec, ptr %out.subvec6.ptr, align 8 + %out.subvec7.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 7 + store <2 x i16> %in.subvec, ptr %out.subvec7.ptr, align 4 + %out.subvec8.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 8 + store <2 x i16> %in.subvec, ptr %out.subvec8.ptr, align 32 + %out.subvec9.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 9 + store <2 x i16> %in.subvec, ptr %out.subvec9.ptr, align 4 + %out.subvec10.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 10 + store <2 x i16> %in.subvec, ptr %out.subvec10.ptr, align 8 + %out.subvec11.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 11 + store <2 x i16> %in.subvec, ptr %out.subvec11.ptr, align 4 + ret void +} + +define void @vec384_v2i32(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec384_v2i32: +; SCALAR: # %bb.0: +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: subl (%rdi), %ecx +; SCALAR-NEXT: subl 4(%rdi), %eax +; SCALAR-NEXT: movl %eax, 4(%rsi) +; SCALAR-NEXT: movl %ecx, (%rsi) +; SCALAR-NEXT: movl %eax, 12(%rsi) +; SCALAR-NEXT: 
movl %ecx, 8(%rsi) +; SCALAR-NEXT: movl %eax, 20(%rsi) +; SCALAR-NEXT: movl %ecx, 16(%rsi) +; SCALAR-NEXT: movl %eax, 28(%rsi) +; SCALAR-NEXT: movl %ecx, 24(%rsi) +; SCALAR-NEXT: movl %eax, 36(%rsi) +; SCALAR-NEXT: movl %ecx, 32(%rsi) +; SCALAR-NEXT: movl %eax, 44(%rsi) +; SCALAR-NEXT: movl %ecx, 40(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec384_v2i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psubd %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, (%rsi) +; SSE2-NEXT: movdqa %xmm1, 16(%rsi) +; SSE2-NEXT: movdqa %xmm1, 32(%rsi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: vec384_v2i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 +; AVX1-NEXT: vmovaps %ymm1, (%rsi) +; AVX1-NEXT: vmovdqa %xmm0, 32(%rsi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec384_v2i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsubd %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-NEXT: vmovdqa %xmm0, 32(%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq + %in.subvec.neg = load <2 x i32>, ptr %in.subvec.ptr, align 64 + %in.subvec = sub <2 x i32> zeroinitializer, %in.subvec.neg + %out.subvec0.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 0 + store <2 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 1 + store <2 x i32> %in.subvec, ptr %out.subvec1.ptr, align 8 + %out.subvec2.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 2 + store <2 x i32> %in.subvec, ptr %out.subvec2.ptr, align 16 + %out.subvec3.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 3 + store <2 x i32> %in.subvec, ptr %out.subvec3.ptr, align 8 
+ %out.subvec4.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 4 + store <2 x i32> %in.subvec, ptr %out.subvec4.ptr, align 32 + %out.subvec5.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 5 + store <2 x i32> %in.subvec, ptr %out.subvec5.ptr, align 8 + ret void +} +; vec384_v2f32: negate a loaded <2 x i32> (0 - x), bitcast it to <2 x float>, and store that one subvector to slots 0..5 of %out.vec.ptr (alignments 64, 8, 16, 8, 32, 8; 48 bytes written per the expected assembly).
+define void @vec384_v2f32(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec384_v2f32: +; SCALAR: # %bb.0: +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: subl (%rdi), %ecx +; SCALAR-NEXT: subl 4(%rdi), %eax +; SCALAR-NEXT: movl %eax, 4(%rsi) +; SCALAR-NEXT: movl %ecx, (%rsi) +; SCALAR-NEXT: movl %eax, 12(%rsi) +; SCALAR-NEXT: movl %ecx, 8(%rsi) +; SCALAR-NEXT: movl %eax, 20(%rsi) +; SCALAR-NEXT: movl %ecx, 16(%rsi) +; SCALAR-NEXT: movl %eax, 28(%rsi) +; SCALAR-NEXT: movl %ecx, 24(%rsi) +; SCALAR-NEXT: movl %eax, 36(%rsi) +; SCALAR-NEXT: movl %ecx, 32(%rsi) +; SCALAR-NEXT: movl %eax, 44(%rsi) +; SCALAR-NEXT: movl %ecx, 40(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec384_v2f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psubd %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, (%rsi) +; SSE2-NEXT: movdqa %xmm1, 16(%rsi) +; SSE2-NEXT: movdqa %xmm1, 32(%rsi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: vec384_v2f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 +; AVX1-NEXT: vmovaps %ymm1, (%rsi) +; AVX1-NEXT: vmovdqa %xmm0, 32(%rsi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec384_v2f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsubd %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-NEXT: vmovdqa %xmm0, 32(%rsi) +; AVX2-NEXT: vzeroupper +; 
AVX2-NEXT: retq + %in.subvec.neg = load <2 x i32>, ptr %in.subvec.ptr, align 64 + %in.subvec.int = sub <2 x i32> zeroinitializer, %in.subvec.neg + %in.subvec = bitcast <2 x i32> %in.subvec.int to <2 x float> + %out.subvec0.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 0 + store <2 x float> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 1 + store <2 x float> %in.subvec, ptr %out.subvec1.ptr, align 8 + %out.subvec2.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 2 + store <2 x float> %in.subvec, ptr %out.subvec2.ptr, align 16 + %out.subvec3.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 3 + store <2 x float> %in.subvec, ptr %out.subvec3.ptr, align 8 + %out.subvec4.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 4 + store <2 x float> %in.subvec, ptr %out.subvec4.ptr, align 32 + %out.subvec5.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 5 + store <2 x float> %in.subvec, ptr %out.subvec5.ptr, align 8 + ret void +} +; vec384_v2i64: negate a loaded <2 x i64> (0 - x) and store that one subvector to slots 0..2 of %out.vec.ptr (48 bytes written per the expected assembly).
+define void @vec384_v2i64(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec384_v2i64: +; SCALAR: # %bb.0: +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: subq (%rdi), %rcx +; SCALAR-NEXT: subq 8(%rdi), %rax +; SCALAR-NEXT: movq %rax, 8(%rsi) +; SCALAR-NEXT: movq %rcx, (%rsi) +; SCALAR-NEXT: movq %rax, 24(%rsi) +; SCALAR-NEXT: movq %rcx, 16(%rsi) +; SCALAR-NEXT: movq %rax, 40(%rsi) +; SCALAR-NEXT: movq %rcx, 32(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec384_v2i64: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: psubq (%rdi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, (%rsi) +; SSE2-NEXT: movdqa %xmm0, 16(%rsi) +; SSE2-NEXT: movdqa %xmm0, 32(%rsi) +; SSE2-NEXT: retq +; +; AVX-LABEL: vec384_v2i64: +; AVX: # %bb.0: +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpsubq (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rsi) +; AVX-NEXT: vmovdqa %xmm0, 16(%rsi) +; AVX-NEXT: vmovdqa %xmm0, 
32(%rsi) +; AVX-NEXT: retq + %in.subvec.neg = load <2 x i64>, ptr %in.subvec.ptr, align 64 + %in.subvec = sub <2 x i64> zeroinitializer, %in.subvec.neg + %out.subvec0.ptr = getelementptr <2 x i64>, ptr %out.vec.ptr, i64 0 + store <2 x i64> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <2 x i64>, ptr %out.vec.ptr, i64 1 + store <2 x i64> %in.subvec, ptr %out.subvec1.ptr, align 16 + %out.subvec2.ptr = getelementptr <2 x i64>, ptr %out.vec.ptr, i64 2 + store <2 x i64> %in.subvec, ptr %out.subvec2.ptr, align 32 + ret void +} +; vec384_v2f64: negate a loaded <2 x i64> (0 - x), bitcast it to <2 x double>, and store that one subvector to slots 0..2 of %out.vec.ptr (48 bytes written per the expected assembly). +define void @vec384_v2f64(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec384_v2f64: +; SCALAR: # %bb.0: +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: subq (%rdi), %rcx +; SCALAR-NEXT: subq 8(%rdi), %rax +; SCALAR-NEXT: movq %rax, 8(%rsi) +; SCALAR-NEXT: movq %rcx, (%rsi) +; SCALAR-NEXT: movq %rax, 24(%rsi) +; SCALAR-NEXT: movq %rcx, 16(%rsi) +; SCALAR-NEXT: movq %rax, 40(%rsi) +; SCALAR-NEXT: movq %rcx, 32(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec384_v2f64: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: psubq (%rdi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, (%rsi) +; SSE2-NEXT: movdqa %xmm0, 16(%rsi) +; SSE2-NEXT: movdqa %xmm0, 32(%rsi) +; SSE2-NEXT: retq +; +; AVX-LABEL: vec384_v2f64: +; AVX: # %bb.0: +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpsubq (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rsi) +; AVX-NEXT: vmovdqa %xmm0, 16(%rsi) +; AVX-NEXT: vmovdqa %xmm0, 32(%rsi) +; AVX-NEXT: retq + %in.subvec.neg = load <2 x i64>, ptr %in.subvec.ptr, align 64 + %in.subvec.int = sub <2 x i64> zeroinitializer, %in.subvec.neg + %in.subvec = bitcast <2 x i64> %in.subvec.int to <2 x double> + %out.subvec0.ptr = getelementptr <2 x double>, ptr %out.vec.ptr, i64 0 + store <2 x double> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <2 x double>, ptr %out.vec.ptr, i64 1 + store <2 x double> %in.subvec, ptr 
%out.subvec1.ptr, align 16 + %out.subvec2.ptr = getelementptr <2 x double>, ptr %out.vec.ptr, i64 2 + store <2 x double> %in.subvec, ptr %out.subvec2.ptr, align 32 + ret void +} +; vec384_v3i8: negate a loaded <3 x i8> (0 - x) and store that one subvector to slots 0..15 of %out.vec.ptr; the expected assembly writes each slot as a 2-byte store and a 1-byte store, with slots 4 bytes apart (offsets 0..60).
+define void @vec384_v3i8(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec384_v3i8: +; SCALAR: # %bb.0: +; SCALAR-NEXT: movl (%rdi), %ecx +; SCALAR-NEXT: movl %ecx, %eax +; SCALAR-NEXT: shrl $16, %eax +; SCALAR-NEXT: movl %ecx, %edx +; SCALAR-NEXT: shrl $8, %edx +; SCALAR-NEXT: negb %cl +; SCALAR-NEXT: movzbl %cl, %edi +; SCALAR-NEXT: negb %dl +; SCALAR-NEXT: movzbl %dl, %ecx +; SCALAR-NEXT: shll $8, %ecx +; SCALAR-NEXT: orl %edi, %ecx +; SCALAR-NEXT: negb %al +; SCALAR-NEXT: movb %al, 2(%rsi) +; SCALAR-NEXT: movw %cx, (%rsi) +; SCALAR-NEXT: movb %al, 6(%rsi) +; SCALAR-NEXT: movw %cx, 4(%rsi) +; SCALAR-NEXT: movb %al, 10(%rsi) +; SCALAR-NEXT: movw %cx, 8(%rsi) +; SCALAR-NEXT: movb %al, 14(%rsi) +; SCALAR-NEXT: movw %cx, 12(%rsi) +; SCALAR-NEXT: movb %al, 18(%rsi) +; SCALAR-NEXT: movw %cx, 16(%rsi) +; SCALAR-NEXT: movb %al, 22(%rsi) +; SCALAR-NEXT: movw %cx, 20(%rsi) +; SCALAR-NEXT: movb %al, 26(%rsi) +; SCALAR-NEXT: movw %cx, 24(%rsi) +; SCALAR-NEXT: movb %al, 30(%rsi) +; SCALAR-NEXT: movw %cx, 28(%rsi) +; SCALAR-NEXT: movb %al, 34(%rsi) +; SCALAR-NEXT: movw %cx, 32(%rsi) +; SCALAR-NEXT: movb %al, 38(%rsi) +; SCALAR-NEXT: movw %cx, 36(%rsi) +; SCALAR-NEXT: movb %al, 42(%rsi) +; SCALAR-NEXT: movw %cx, 40(%rsi) +; SCALAR-NEXT: movb %al, 46(%rsi) +; SCALAR-NEXT: movw %cx, 44(%rsi) +; SCALAR-NEXT: movb %al, 50(%rsi) +; SCALAR-NEXT: movw %cx, 48(%rsi) +; SCALAR-NEXT: movb %al, 54(%rsi) +; SCALAR-NEXT: movw %cx, 52(%rsi) +; SCALAR-NEXT: movb %al, 58(%rsi) +; SCALAR-NEXT: movw %cx, 56(%rsi) +; SCALAR-NEXT: movb %al, 62(%rsi) +; SCALAR-NEXT: movw %cx, 60(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-ONLY-LABEL: vec384_v3i8: +; SSE2-ONLY: # %bb.0: +; SSE2-ONLY-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-ONLY-NEXT: pxor %xmm1, %xmm1 +; SSE2-ONLY-NEXT: psubb %xmm0, %xmm1 +; 
SSE2-ONLY-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) +; SSE2-ONLY-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE2-ONLY-NEXT: movb %al, 2(%rsi) +; SSE2-ONLY-NEXT: movd %xmm1, %ecx +; SSE2-ONLY-NEXT: movw %cx, (%rsi) +; SSE2-ONLY-NEXT: movb %al, 6(%rsi) +; SSE2-ONLY-NEXT: movw %cx, 4(%rsi) +; SSE2-ONLY-NEXT: movb %al, 10(%rsi) +; SSE2-ONLY-NEXT: movw %cx, 8(%rsi) +; SSE2-ONLY-NEXT: movb %al, 14(%rsi) +; SSE2-ONLY-NEXT: movw %cx, 12(%rsi) +; SSE2-ONLY-NEXT: movb %al, 18(%rsi) +; SSE2-ONLY-NEXT: movw %cx, 16(%rsi) +; SSE2-ONLY-NEXT: movb %al, 22(%rsi) +; SSE2-ONLY-NEXT: movw %cx, 20(%rsi) +; SSE2-ONLY-NEXT: movb %al, 26(%rsi) +; SSE2-ONLY-NEXT: movw %cx, 24(%rsi) +; SSE2-ONLY-NEXT: movb %al, 30(%rsi) +; SSE2-ONLY-NEXT: movw %cx, 28(%rsi) +; SSE2-ONLY-NEXT: movb %al, 34(%rsi) +; SSE2-ONLY-NEXT: movw %cx, 32(%rsi) +; SSE2-ONLY-NEXT: movb %al, 38(%rsi) +; SSE2-ONLY-NEXT: movw %cx, 36(%rsi) +; SSE2-ONLY-NEXT: movb %al, 42(%rsi) +; SSE2-ONLY-NEXT: movw %cx, 40(%rsi) +; SSE2-ONLY-NEXT: movb %al, 46(%rsi) +; SSE2-ONLY-NEXT: movw %cx, 44(%rsi) +; SSE2-ONLY-NEXT: movb %al, 50(%rsi) +; SSE2-ONLY-NEXT: movw %cx, 48(%rsi) +; SSE2-ONLY-NEXT: movb %al, 54(%rsi) +; SSE2-ONLY-NEXT: movw %cx, 52(%rsi) +; SSE2-ONLY-NEXT: movb %al, 58(%rsi) +; SSE2-ONLY-NEXT: movw %cx, 56(%rsi) +; SSE2-ONLY-NEXT: movb %al, 62(%rsi) +; SSE2-ONLY-NEXT: movw %cx, 60(%rsi) +; SSE2-ONLY-NEXT: retq +; +; SSE3-LABEL: vec384_v3i8: +; SSE3: # %bb.0: +; SSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE3-NEXT: pxor %xmm1, %xmm1 +; SSE3-NEXT: psubb %xmm0, %xmm1 +; SSE3-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE3-NEXT: movb %al, 2(%rsi) +; SSE3-NEXT: movd %xmm1, %ecx +; SSE3-NEXT: movw %cx, (%rsi) +; SSE3-NEXT: movb %al, 6(%rsi) +; SSE3-NEXT: movw %cx, 4(%rsi) +; SSE3-NEXT: movb %al, 10(%rsi) +; SSE3-NEXT: movw %cx, 8(%rsi) +; SSE3-NEXT: movb %al, 14(%rsi) +; SSE3-NEXT: movw %cx, 12(%rsi) +; SSE3-NEXT: movb %al, 18(%rsi) +; SSE3-NEXT: movw %cx, 16(%rsi) +; SSE3-NEXT: 
movb %al, 22(%rsi) +; SSE3-NEXT: movw %cx, 20(%rsi) +; SSE3-NEXT: movb %al, 26(%rsi) +; SSE3-NEXT: movw %cx, 24(%rsi) +; SSE3-NEXT: movb %al, 30(%rsi) +; SSE3-NEXT: movw %cx, 28(%rsi) +; SSE3-NEXT: movb %al, 34(%rsi) +; SSE3-NEXT: movw %cx, 32(%rsi) +; SSE3-NEXT: movb %al, 38(%rsi) +; SSE3-NEXT: movw %cx, 36(%rsi) +; SSE3-NEXT: movb %al, 42(%rsi) +; SSE3-NEXT: movw %cx, 40(%rsi) +; SSE3-NEXT: movb %al, 46(%rsi) +; SSE3-NEXT: movw %cx, 44(%rsi) +; SSE3-NEXT: movb %al, 50(%rsi) +; SSE3-NEXT: movw %cx, 48(%rsi) +; SSE3-NEXT: movb %al, 54(%rsi) +; SSE3-NEXT: movw %cx, 52(%rsi) +; SSE3-NEXT: movb %al, 58(%rsi) +; SSE3-NEXT: movw %cx, 56(%rsi) +; SSE3-NEXT: movb %al, 62(%rsi) +; SSE3-NEXT: movw %cx, 60(%rsi) +; SSE3-NEXT: retq +; +; SSSE3-ONLY-LABEL: vec384_v3i8: +; SSSE3-ONLY: # %bb.0: +; SSSE3-ONLY-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-ONLY-NEXT: pxor %xmm1, %xmm1 +; SSSE3-ONLY-NEXT: psubb %xmm0, %xmm1 +; SSSE3-ONLY-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) +; SSSE3-ONLY-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSSE3-ONLY-NEXT: movb %al, 2(%rsi) +; SSSE3-ONLY-NEXT: movd %xmm1, %ecx +; SSSE3-ONLY-NEXT: movw %cx, (%rsi) +; SSSE3-ONLY-NEXT: movb %al, 6(%rsi) +; SSSE3-ONLY-NEXT: movw %cx, 4(%rsi) +; SSSE3-ONLY-NEXT: movb %al, 10(%rsi) +; SSSE3-ONLY-NEXT: movw %cx, 8(%rsi) +; SSSE3-ONLY-NEXT: movb %al, 14(%rsi) +; SSSE3-ONLY-NEXT: movw %cx, 12(%rsi) +; SSSE3-ONLY-NEXT: movb %al, 18(%rsi) +; SSSE3-ONLY-NEXT: movw %cx, 16(%rsi) +; SSSE3-ONLY-NEXT: movb %al, 22(%rsi) +; SSSE3-ONLY-NEXT: movw %cx, 20(%rsi) +; SSSE3-ONLY-NEXT: movb %al, 26(%rsi) +; SSSE3-ONLY-NEXT: movw %cx, 24(%rsi) +; SSSE3-ONLY-NEXT: movb %al, 30(%rsi) +; SSSE3-ONLY-NEXT: movw %cx, 28(%rsi) +; SSSE3-ONLY-NEXT: movb %al, 34(%rsi) +; SSSE3-ONLY-NEXT: movw %cx, 32(%rsi) +; SSSE3-ONLY-NEXT: movb %al, 38(%rsi) +; SSSE3-ONLY-NEXT: movw %cx, 36(%rsi) +; SSSE3-ONLY-NEXT: movb %al, 42(%rsi) +; SSSE3-ONLY-NEXT: movw %cx, 40(%rsi) +; SSSE3-ONLY-NEXT: movb %al, 46(%rsi) +; SSSE3-ONLY-NEXT: movw %cx, 
44(%rsi) +; SSSE3-ONLY-NEXT: movb %al, 50(%rsi) +; SSSE3-ONLY-NEXT: movw %cx, 48(%rsi) +; SSSE3-ONLY-NEXT: movb %al, 54(%rsi) +; SSSE3-ONLY-NEXT: movw %cx, 52(%rsi) +; SSSE3-ONLY-NEXT: movb %al, 58(%rsi) +; SSSE3-ONLY-NEXT: movw %cx, 56(%rsi) +; SSSE3-ONLY-NEXT: movb %al, 62(%rsi) +; SSSE3-ONLY-NEXT: movw %cx, 60(%rsi) +; SSSE3-ONLY-NEXT: retq +; +; SSE41-LABEL: vec384_v3i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: psubb %xmm1, %xmm0 +; SSE41-NEXT: pextrb $2, %xmm0, 2(%rsi) +; SSE41-NEXT: movd %xmm0, %eax +; SSE41-NEXT: movw %ax, (%rsi) +; SSE41-NEXT: pextrb $2, %xmm0, 6(%rsi) +; SSE41-NEXT: movw %ax, 4(%rsi) +; SSE41-NEXT: pextrb $2, %xmm0, 10(%rsi) +; SSE41-NEXT: movw %ax, 8(%rsi) +; SSE41-NEXT: pextrb $2, %xmm0, 14(%rsi) +; SSE41-NEXT: movw %ax, 12(%rsi) +; SSE41-NEXT: pextrb $2, %xmm0, 18(%rsi) +; SSE41-NEXT: movw %ax, 16(%rsi) +; SSE41-NEXT: pextrb $2, %xmm0, 22(%rsi) +; SSE41-NEXT: movw %ax, 20(%rsi) +; SSE41-NEXT: pextrb $2, %xmm0, 26(%rsi) +; SSE41-NEXT: movw %ax, 24(%rsi) +; SSE41-NEXT: pextrb $2, %xmm0, 30(%rsi) +; SSE41-NEXT: movw %ax, 28(%rsi) +; SSE41-NEXT: pextrb $2, %xmm0, 34(%rsi) +; SSE41-NEXT: movw %ax, 32(%rsi) +; SSE41-NEXT: pextrb $2, %xmm0, 38(%rsi) +; SSE41-NEXT: movw %ax, 36(%rsi) +; SSE41-NEXT: pextrb $2, %xmm0, 42(%rsi) +; SSE41-NEXT: movw %ax, 40(%rsi) +; SSE41-NEXT: pextrb $2, %xmm0, 46(%rsi) +; SSE41-NEXT: movw %ax, 44(%rsi) +; SSE41-NEXT: pextrb $2, %xmm0, 50(%rsi) +; SSE41-NEXT: movw %ax, 48(%rsi) +; SSE41-NEXT: pextrb $2, %xmm0, 54(%rsi) +; SSE41-NEXT: movw %ax, 52(%rsi) +; SSE41-NEXT: pextrb $2, %xmm0, 58(%rsi) +; SSE41-NEXT: movw %ax, 56(%rsi) +; SSE41-NEXT: pextrb $2, %xmm0, 62(%rsi) +; SSE41-NEXT: movw %ax, 60(%rsi) +; SSE41-NEXT: retq +; +; SSE42-LABEL: vec384_v3i8: +; SSE42: # %bb.0: +; SSE42-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE42-NEXT: pxor %xmm0, %xmm0 +; SSE42-NEXT: psubb %xmm1, %xmm0 +; SSE42-NEXT: pextrb $2, %xmm0, 
2(%rsi) +; SSE42-NEXT: movd %xmm0, %eax +; SSE42-NEXT: movw %ax, (%rsi) +; SSE42-NEXT: pextrb $2, %xmm0, 6(%rsi) +; SSE42-NEXT: movw %ax, 4(%rsi) +; SSE42-NEXT: pextrb $2, %xmm0, 10(%rsi) +; SSE42-NEXT: movw %ax, 8(%rsi) +; SSE42-NEXT: pextrb $2, %xmm0, 14(%rsi) +; SSE42-NEXT: movw %ax, 12(%rsi) +; SSE42-NEXT: pextrb $2, %xmm0, 18(%rsi) +; SSE42-NEXT: movw %ax, 16(%rsi) +; SSE42-NEXT: pextrb $2, %xmm0, 22(%rsi) +; SSE42-NEXT: movw %ax, 20(%rsi) +; SSE42-NEXT: pextrb $2, %xmm0, 26(%rsi) +; SSE42-NEXT: movw %ax, 24(%rsi) +; SSE42-NEXT: pextrb $2, %xmm0, 30(%rsi) +; SSE42-NEXT: movw %ax, 28(%rsi) +; SSE42-NEXT: pextrb $2, %xmm0, 34(%rsi) +; SSE42-NEXT: movw %ax, 32(%rsi) +; SSE42-NEXT: pextrb $2, %xmm0, 38(%rsi) +; SSE42-NEXT: movw %ax, 36(%rsi) +; SSE42-NEXT: pextrb $2, %xmm0, 42(%rsi) +; SSE42-NEXT: movw %ax, 40(%rsi) +; SSE42-NEXT: pextrb $2, %xmm0, 46(%rsi) +; SSE42-NEXT: movw %ax, 44(%rsi) +; SSE42-NEXT: pextrb $2, %xmm0, 50(%rsi) +; SSE42-NEXT: movw %ax, 48(%rsi) +; SSE42-NEXT: pextrb $2, %xmm0, 54(%rsi) +; SSE42-NEXT: movw %ax, 52(%rsi) +; SSE42-NEXT: pextrb $2, %xmm0, 58(%rsi) +; SSE42-NEXT: movw %ax, 56(%rsi) +; SSE42-NEXT: pextrb $2, %xmm0, 62(%rsi) +; SSE42-NEXT: movw %ax, 60(%rsi) +; SSE42-NEXT: retq +; +; AVX-LABEL: vec384_v3i8: +; AVX: # %bb.0: +; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpsubb %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpextrb $2, %xmm0, 2(%rsi) +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: movw %ax, (%rsi) +; AVX-NEXT: vpextrb $2, %xmm0, 6(%rsi) +; AVX-NEXT: movw %ax, 4(%rsi) +; AVX-NEXT: vpextrb $2, %xmm0, 10(%rsi) +; AVX-NEXT: movw %ax, 8(%rsi) +; AVX-NEXT: vpextrb $2, %xmm0, 14(%rsi) +; AVX-NEXT: movw %ax, 12(%rsi) +; AVX-NEXT: vpextrb $2, %xmm0, 18(%rsi) +; AVX-NEXT: movw %ax, 16(%rsi) +; AVX-NEXT: vpextrb $2, %xmm0, 22(%rsi) +; AVX-NEXT: movw %ax, 20(%rsi) +; AVX-NEXT: vpextrb $2, %xmm0, 26(%rsi) +; AVX-NEXT: movw %ax, 24(%rsi) +; AVX-NEXT: vpextrb $2, %xmm0, 30(%rsi) +; 
AVX-NEXT: movw %ax, 28(%rsi) +; AVX-NEXT: vpextrb $2, %xmm0, 34(%rsi) +; AVX-NEXT: movw %ax, 32(%rsi) +; AVX-NEXT: vpextrb $2, %xmm0, 38(%rsi) +; AVX-NEXT: movw %ax, 36(%rsi) +; AVX-NEXT: vpextrb $2, %xmm0, 42(%rsi) +; AVX-NEXT: movw %ax, 40(%rsi) +; AVX-NEXT: vpextrb $2, %xmm0, 46(%rsi) +; AVX-NEXT: movw %ax, 44(%rsi) +; AVX-NEXT: vpextrb $2, %xmm0, 50(%rsi) +; AVX-NEXT: movw %ax, 48(%rsi) +; AVX-NEXT: vpextrb $2, %xmm0, 54(%rsi) +; AVX-NEXT: movw %ax, 52(%rsi) +; AVX-NEXT: vpextrb $2, %xmm0, 58(%rsi) +; AVX-NEXT: movw %ax, 56(%rsi) +; AVX-NEXT: vpextrb $2, %xmm0, 62(%rsi) +; AVX-NEXT: movw %ax, 60(%rsi) +; AVX-NEXT: retq + %in.subvec.neg = load <3 x i8>, ptr %in.subvec.ptr, align 64 + %in.subvec = sub <3 x i8> zeroinitializer, %in.subvec.neg + %out.subvec0.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 0 + store <3 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 1 + store <3 x i8> %in.subvec, ptr %out.subvec1.ptr, align 1 + %out.subvec2.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 2 + store <3 x i8> %in.subvec, ptr %out.subvec2.ptr, align 2 + %out.subvec3.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 3 + store <3 x i8> %in.subvec, ptr %out.subvec3.ptr, align 1 + %out.subvec4.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 4 + store <3 x i8> %in.subvec, ptr %out.subvec4.ptr, align 4 + %out.subvec5.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 5 + store <3 x i8> %in.subvec, ptr %out.subvec5.ptr, align 1 + %out.subvec6.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 6 + store <3 x i8> %in.subvec, ptr %out.subvec6.ptr, align 2 + %out.subvec7.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 7 + store <3 x i8> %in.subvec, ptr %out.subvec7.ptr, align 1 + %out.subvec8.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 8 + store <3 x i8> %in.subvec, ptr %out.subvec8.ptr, align 8 + %out.subvec9.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 9 + store <3 x i8> 
%in.subvec, ptr %out.subvec9.ptr, align 1 + %out.subvec10.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 10 + store <3 x i8> %in.subvec, ptr %out.subvec10.ptr, align 2 + %out.subvec11.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 11 + store <3 x i8> %in.subvec, ptr %out.subvec11.ptr, align 1 + %out.subvec12.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 12 + store <3 x i8> %in.subvec, ptr %out.subvec12.ptr, align 4 + %out.subvec13.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 13 + store <3 x i8> %in.subvec, ptr %out.subvec13.ptr, align 1 + %out.subvec14.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 14 + store <3 x i8> %in.subvec, ptr %out.subvec14.ptr, align 2 + %out.subvec15.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 15 + store <3 x i8> %in.subvec, ptr %out.subvec15.ptr, align 1 + ret void +} +; vec384_v3i16: negate a loaded <3 x i16> (0 - x) and store that one subvector to slots 0..7 of %out.vec.ptr; the expected assembly writes each slot as a 4-byte store and a 2-byte store, with slots 8 bytes apart (offsets 0..56).
+define void @vec384_v3i16(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec384_v3i16: +; SCALAR: # %bb.0: +; SCALAR-NEXT: movq (%rdi), %rax +; SCALAR-NEXT: movq %rax, %rcx +; SCALAR-NEXT: shrq $32, %rcx +; SCALAR-NEXT: movl %eax, %edx +; SCALAR-NEXT: andl $-65536, %edx # imm = 0xFFFF0000 +; SCALAR-NEXT: negl %eax +; SCALAR-NEXT: movzwl %ax, %eax +; SCALAR-NEXT: subl %edx, %eax +; SCALAR-NEXT: negl %ecx +; SCALAR-NEXT: movw %cx, 4(%rsi) +; SCALAR-NEXT: movl %eax, (%rsi) +; SCALAR-NEXT: movw %cx, 12(%rsi) +; SCALAR-NEXT: movl %eax, 8(%rsi) +; SCALAR-NEXT: movw %cx, 20(%rsi) +; SCALAR-NEXT: movl %eax, 16(%rsi) +; SCALAR-NEXT: movw %cx, 28(%rsi) +; SCALAR-NEXT: movl %eax, 24(%rsi) +; SCALAR-NEXT: movw %cx, 36(%rsi) +; SCALAR-NEXT: movl %eax, 32(%rsi) +; SCALAR-NEXT: movw %cx, 44(%rsi) +; SCALAR-NEXT: movl %eax, 40(%rsi) +; SCALAR-NEXT: movw %cx, 52(%rsi) +; SCALAR-NEXT: movl %eax, 48(%rsi) +; SCALAR-NEXT: movw %cx, 60(%rsi) +; SCALAR-NEXT: movl %eax, 56(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-ONLY-LABEL: vec384_v3i16: +; SSE2-ONLY: # %bb.0: +; SSE2-ONLY-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-ONLY-NEXT: pxor %xmm1, 
%xmm1 +; SSE2-ONLY-NEXT: psubw %xmm0, %xmm1 +; SSE2-ONLY-NEXT: movd %xmm1, (%rsi) +; SSE2-ONLY-NEXT: pextrw $2, %xmm1, %eax +; SSE2-ONLY-NEXT: movw %ax, 4(%rsi) +; SSE2-ONLY-NEXT: movd %xmm1, 8(%rsi) +; SSE2-ONLY-NEXT: movw %ax, 12(%rsi) +; SSE2-ONLY-NEXT: movd %xmm1, 16(%rsi) +; SSE2-ONLY-NEXT: movw %ax, 20(%rsi) +; SSE2-ONLY-NEXT: movd %xmm1, 24(%rsi) +; SSE2-ONLY-NEXT: movw %ax, 28(%rsi) +; SSE2-ONLY-NEXT: movd %xmm1, 32(%rsi) +; SSE2-ONLY-NEXT: movw %ax, 36(%rsi) +; SSE2-ONLY-NEXT: movd %xmm1, 40(%rsi) +; SSE2-ONLY-NEXT: movw %ax, 44(%rsi) +; SSE2-ONLY-NEXT: movd %xmm1, 48(%rsi) +; SSE2-ONLY-NEXT: movw %ax, 52(%rsi) +; SSE2-ONLY-NEXT: movd %xmm1, 56(%rsi) +; SSE2-ONLY-NEXT: movw %ax, 60(%rsi) +; SSE2-ONLY-NEXT: retq +; +; SSE3-LABEL: vec384_v3i16: +; SSE3: # %bb.0: +; SSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE3-NEXT: pxor %xmm1, %xmm1 +; SSE3-NEXT: psubw %xmm0, %xmm1 +; SSE3-NEXT: movd %xmm1, (%rsi) +; SSE3-NEXT: pextrw $2, %xmm1, %eax +; SSE3-NEXT: movw %ax, 4(%rsi) +; SSE3-NEXT: movd %xmm1, 8(%rsi) +; SSE3-NEXT: movw %ax, 12(%rsi) +; SSE3-NEXT: movd %xmm1, 16(%rsi) +; SSE3-NEXT: movw %ax, 20(%rsi) +; SSE3-NEXT: movd %xmm1, 24(%rsi) +; SSE3-NEXT: movw %ax, 28(%rsi) +; SSE3-NEXT: movd %xmm1, 32(%rsi) +; SSE3-NEXT: movw %ax, 36(%rsi) +; SSE3-NEXT: movd %xmm1, 40(%rsi) +; SSE3-NEXT: movw %ax, 44(%rsi) +; SSE3-NEXT: movd %xmm1, 48(%rsi) +; SSE3-NEXT: movw %ax, 52(%rsi) +; SSE3-NEXT: movd %xmm1, 56(%rsi) +; SSE3-NEXT: movw %ax, 60(%rsi) +; SSE3-NEXT: retq +; +; SSSE3-ONLY-LABEL: vec384_v3i16: +; SSSE3-ONLY: # %bb.0: +; SSSE3-ONLY-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSSE3-ONLY-NEXT: pxor %xmm1, %xmm1 +; SSSE3-ONLY-NEXT: psubw %xmm0, %xmm1 +; SSSE3-ONLY-NEXT: movd %xmm1, (%rsi) +; SSSE3-ONLY-NEXT: pextrw $2, %xmm1, %eax +; SSSE3-ONLY-NEXT: movw %ax, 4(%rsi) +; SSSE3-ONLY-NEXT: movd %xmm1, 8(%rsi) +; SSSE3-ONLY-NEXT: movw %ax, 12(%rsi) +; SSSE3-ONLY-NEXT: movd %xmm1, 16(%rsi) +; SSSE3-ONLY-NEXT: movw %ax, 20(%rsi) +; SSSE3-ONLY-NEXT: movd %xmm1, 24(%rsi) 
+; SSSE3-ONLY-NEXT: movw %ax, 28(%rsi) +; SSSE3-ONLY-NEXT: movd %xmm1, 32(%rsi) +; SSSE3-ONLY-NEXT: movw %ax, 36(%rsi) +; SSSE3-ONLY-NEXT: movd %xmm1, 40(%rsi) +; SSSE3-ONLY-NEXT: movw %ax, 44(%rsi) +; SSSE3-ONLY-NEXT: movd %xmm1, 48(%rsi) +; SSSE3-ONLY-NEXT: movw %ax, 52(%rsi) +; SSSE3-ONLY-NEXT: movd %xmm1, 56(%rsi) +; SSSE3-ONLY-NEXT: movw %ax, 60(%rsi) +; SSSE3-ONLY-NEXT: retq +; +; SSE41-LABEL: vec384_v3i16: +; SSE41: # %bb.0: +; SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: psubw %xmm0, %xmm1 +; SSE41-NEXT: pextrw $2, %xmm1, 4(%rsi) +; SSE41-NEXT: movd %xmm1, (%rsi) +; SSE41-NEXT: pextrw $2, %xmm1, 12(%rsi) +; SSE41-NEXT: movd %xmm1, 8(%rsi) +; SSE41-NEXT: pextrw $2, %xmm1, 20(%rsi) +; SSE41-NEXT: movd %xmm1, 16(%rsi) +; SSE41-NEXT: pextrw $2, %xmm1, 28(%rsi) +; SSE41-NEXT: movd %xmm1, 24(%rsi) +; SSE41-NEXT: pextrw $2, %xmm1, 36(%rsi) +; SSE41-NEXT: movd %xmm1, 32(%rsi) +; SSE41-NEXT: pextrw $2, %xmm1, 44(%rsi) +; SSE41-NEXT: movd %xmm1, 40(%rsi) +; SSE41-NEXT: pextrw $2, %xmm1, 52(%rsi) +; SSE41-NEXT: movd %xmm1, 48(%rsi) +; SSE41-NEXT: pextrw $2, %xmm1, 60(%rsi) +; SSE41-NEXT: movd %xmm1, 56(%rsi) +; SSE41-NEXT: retq +; +; SSE42-LABEL: vec384_v3i16: +; SSE42: # %bb.0: +; SSE42-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE42-NEXT: pxor %xmm1, %xmm1 +; SSE42-NEXT: psubw %xmm0, %xmm1 +; SSE42-NEXT: pextrw $2, %xmm1, 4(%rsi) +; SSE42-NEXT: movd %xmm1, (%rsi) +; SSE42-NEXT: pextrw $2, %xmm1, 12(%rsi) +; SSE42-NEXT: movd %xmm1, 8(%rsi) +; SSE42-NEXT: pextrw $2, %xmm1, 20(%rsi) +; SSE42-NEXT: movd %xmm1, 16(%rsi) +; SSE42-NEXT: pextrw $2, %xmm1, 28(%rsi) +; SSE42-NEXT: movd %xmm1, 24(%rsi) +; SSE42-NEXT: pextrw $2, %xmm1, 36(%rsi) +; SSE42-NEXT: movd %xmm1, 32(%rsi) +; SSE42-NEXT: pextrw $2, %xmm1, 44(%rsi) +; SSE42-NEXT: movd %xmm1, 40(%rsi) +; SSE42-NEXT: pextrw $2, %xmm1, 52(%rsi) +; SSE42-NEXT: movd %xmm1, 48(%rsi) +; SSE42-NEXT: pextrw $2, %xmm1, 60(%rsi) +; SSE42-NEXT: movd %xmm1, 56(%rsi) +; SSE42-NEXT: retq 
+; +; AVX-LABEL: vec384_v3i16: +; AVX: # %bb.0: +; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpsubw %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpextrw $2, %xmm0, 4(%rsi) +; AVX-NEXT: vmovd %xmm0, (%rsi) +; AVX-NEXT: vpextrw $2, %xmm0, 12(%rsi) +; AVX-NEXT: vmovd %xmm0, 8(%rsi) +; AVX-NEXT: vpextrw $2, %xmm0, 20(%rsi) +; AVX-NEXT: vmovd %xmm0, 16(%rsi) +; AVX-NEXT: vpextrw $2, %xmm0, 28(%rsi) +; AVX-NEXT: vmovd %xmm0, 24(%rsi) +; AVX-NEXT: vpextrw $2, %xmm0, 36(%rsi) +; AVX-NEXT: vmovd %xmm0, 32(%rsi) +; AVX-NEXT: vpextrw $2, %xmm0, 44(%rsi) +; AVX-NEXT: vmovd %xmm0, 40(%rsi) +; AVX-NEXT: vpextrw $2, %xmm0, 52(%rsi) +; AVX-NEXT: vmovd %xmm0, 48(%rsi) +; AVX-NEXT: vpextrw $2, %xmm0, 60(%rsi) +; AVX-NEXT: vmovd %xmm0, 56(%rsi) +; AVX-NEXT: retq + %in.subvec.neg = load <3 x i16>, ptr %in.subvec.ptr, align 64 + %in.subvec = sub <3 x i16> zeroinitializer, %in.subvec.neg + %out.subvec0.ptr = getelementptr <3 x i16>, ptr %out.vec.ptr, i64 0 + store <3 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <3 x i16>, ptr %out.vec.ptr, i64 1 + store <3 x i16> %in.subvec, ptr %out.subvec1.ptr, align 2 + %out.subvec2.ptr = getelementptr <3 x i16>, ptr %out.vec.ptr, i64 2 + store <3 x i16> %in.subvec, ptr %out.subvec2.ptr, align 4 + %out.subvec3.ptr = getelementptr <3 x i16>, ptr %out.vec.ptr, i64 3 + store <3 x i16> %in.subvec, ptr %out.subvec3.ptr, align 2 + %out.subvec4.ptr = getelementptr <3 x i16>, ptr %out.vec.ptr, i64 4 + store <3 x i16> %in.subvec, ptr %out.subvec4.ptr, align 8 + %out.subvec5.ptr = getelementptr <3 x i16>, ptr %out.vec.ptr, i64 5 + store <3 x i16> %in.subvec, ptr %out.subvec5.ptr, align 2 + %out.subvec6.ptr = getelementptr <3 x i16>, ptr %out.vec.ptr, i64 6 + store <3 x i16> %in.subvec, ptr %out.subvec6.ptr, align 4 + %out.subvec7.ptr = getelementptr <3 x i16>, ptr %out.vec.ptr, i64 7 + store <3 x i16> %in.subvec, ptr %out.subvec7.ptr, align 2 + ret void +} +; vec384_v3i32: negate a loaded <3 x i32> (0 - x) and store that one subvector to slots 0..3 of %out.vec.ptr; the expected assembly writes each slot as an 8-byte store and a 4-byte store, with slots 16 bytes apart (offsets 0..48). +define void 
@vec384_v3i32(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec384_v3i32: +; SCALAR: # %bb.0: +; SCALAR-NEXT: movq (%rdi), %rax +; SCALAR-NEXT: movq %rax, %rcx +; SCALAR-NEXT: shrq $32, %rcx +; SCALAR-NEXT: negl %ecx +; SCALAR-NEXT: shlq $32, %rcx +; SCALAR-NEXT: negl %eax +; SCALAR-NEXT: orq %rcx, %rax +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: subl 8(%rdi), %ecx +; SCALAR-NEXT: movl %ecx, 8(%rsi) +; SCALAR-NEXT: movq %rax, (%rsi) +; SCALAR-NEXT: movl %ecx, 24(%rsi) +; SCALAR-NEXT: movq %rax, 16(%rsi) +; SCALAR-NEXT: movl %ecx, 40(%rsi) +; SCALAR-NEXT: movq %rax, 32(%rsi) +; SCALAR-NEXT: movl %ecx, 56(%rsi) +; SCALAR-NEXT: movq %rax, 48(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-ONLY-LABEL: vec384_v3i32: +; SSE2-ONLY: # %bb.0: +; SSE2-ONLY-NEXT: pxor %xmm0, %xmm0 +; SSE2-ONLY-NEXT: psubd (%rdi), %xmm0 +; SSE2-ONLY-NEXT: movq %xmm0, (%rsi) +; SSE2-ONLY-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE2-ONLY-NEXT: movd %xmm1, 8(%rsi) +; SSE2-ONLY-NEXT: movq %xmm0, 16(%rsi) +; SSE2-ONLY-NEXT: movd %xmm1, 24(%rsi) +; SSE2-ONLY-NEXT: movq %xmm0, 32(%rsi) +; SSE2-ONLY-NEXT: movd %xmm1, 40(%rsi) +; SSE2-ONLY-NEXT: movq %xmm0, 48(%rsi) +; SSE2-ONLY-NEXT: movd %xmm1, 56(%rsi) +; SSE2-ONLY-NEXT: retq +; +; SSE3-LABEL: vec384_v3i32: +; SSE3: # %bb.0: +; SSE3-NEXT: pxor %xmm0, %xmm0 +; SSE3-NEXT: psubd (%rdi), %xmm0 +; SSE3-NEXT: movq %xmm0, (%rsi) +; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE3-NEXT: movd %xmm1, 8(%rsi) +; SSE3-NEXT: movq %xmm0, 16(%rsi) +; SSE3-NEXT: movd %xmm1, 24(%rsi) +; SSE3-NEXT: movq %xmm0, 32(%rsi) +; SSE3-NEXT: movd %xmm1, 40(%rsi) +; SSE3-NEXT: movq %xmm0, 48(%rsi) +; SSE3-NEXT: movd %xmm1, 56(%rsi) +; SSE3-NEXT: retq +; +; SSSE3-ONLY-LABEL: vec384_v3i32: +; SSSE3-ONLY: # %bb.0: +; SSSE3-ONLY-NEXT: pxor %xmm0, %xmm0 +; SSSE3-ONLY-NEXT: psubd (%rdi), %xmm0 +; SSSE3-ONLY-NEXT: movq %xmm0, (%rsi) +; SSSE3-ONLY-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSSE3-ONLY-NEXT: movd %xmm1, 8(%rsi) +; SSSE3-ONLY-NEXT: movq 
%xmm0, 16(%rsi) +; SSSE3-ONLY-NEXT: movd %xmm1, 24(%rsi) +; SSSE3-ONLY-NEXT: movq %xmm0, 32(%rsi) +; SSSE3-ONLY-NEXT: movd %xmm1, 40(%rsi) +; SSSE3-ONLY-NEXT: movq %xmm0, 48(%rsi) +; SSSE3-ONLY-NEXT: movd %xmm1, 56(%rsi) +; SSSE3-ONLY-NEXT: retq +; +; SSE41-LABEL: vec384_v3i32: +; SSE41: # %bb.0: +; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: psubd (%rdi), %xmm0 +; SSE41-NEXT: pextrd $2, %xmm0, 8(%rsi) +; SSE41-NEXT: movq %xmm0, (%rsi) +; SSE41-NEXT: pextrd $2, %xmm0, 24(%rsi) +; SSE41-NEXT: movq %xmm0, 16(%rsi) +; SSE41-NEXT: pextrd $2, %xmm0, 40(%rsi) +; SSE41-NEXT: movq %xmm0, 32(%rsi) +; SSE41-NEXT: pextrd $2, %xmm0, 56(%rsi) +; SSE41-NEXT: movq %xmm0, 48(%rsi) +; SSE41-NEXT: retq +; +; SSE42-LABEL: vec384_v3i32: +; SSE42: # %bb.0: +; SSE42-NEXT: pxor %xmm0, %xmm0 +; SSE42-NEXT: psubd (%rdi), %xmm0 +; SSE42-NEXT: pextrd $2, %xmm0, 8(%rsi) +; SSE42-NEXT: movq %xmm0, (%rsi) +; SSE42-NEXT: pextrd $2, %xmm0, 24(%rsi) +; SSE42-NEXT: movq %xmm0, 16(%rsi) +; SSE42-NEXT: pextrd $2, %xmm0, 40(%rsi) +; SSE42-NEXT: movq %xmm0, 32(%rsi) +; SSE42-NEXT: pextrd $2, %xmm0, 56(%rsi) +; SSE42-NEXT: movq %xmm0, 48(%rsi) +; SSE42-NEXT: retq +; +; AVX-LABEL: vec384_v3i32: +; AVX: # %bb.0: +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpsubd (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vpextrd $2, %xmm0, 8(%rsi) +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: vpextrd $2, %xmm0, 24(%rsi) +; AVX-NEXT: vmovq %xmm0, 16(%rsi) +; AVX-NEXT: vpextrd $2, %xmm0, 40(%rsi) +; AVX-NEXT: vmovq %xmm0, 32(%rsi) +; AVX-NEXT: vpextrd $2, %xmm0, 56(%rsi) +; AVX-NEXT: vmovq %xmm0, 48(%rsi) +; AVX-NEXT: retq + %in.subvec.neg = load <3 x i32>, ptr %in.subvec.ptr, align 64 + %in.subvec = sub <3 x i32> zeroinitializer, %in.subvec.neg + %out.subvec0.ptr = getelementptr <3 x i32>, ptr %out.vec.ptr, i64 0 + store <3 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <3 x i32>, ptr %out.vec.ptr, i64 1 + store <3 x i32> %in.subvec, ptr %out.subvec1.ptr, align 4 + %out.subvec2.ptr = 
getelementptr <3 x i32>, ptr %out.vec.ptr, i64 2 + store <3 x i32> %in.subvec, ptr %out.subvec2.ptr, align 8 + %out.subvec3.ptr = getelementptr <3 x i32>, ptr %out.vec.ptr, i64 3 + store <3 x i32> %in.subvec, ptr %out.subvec3.ptr, align 4 + ret void +} +; vec384_v3f32: negate a loaded <3 x i32> (0 - x), bitcast it to <3 x float>, and store that one subvector to slots 0..3 of %out.vec.ptr; the expected assembly writes each slot as an 8-byte store and a 4-byte store, with slots 16 bytes apart (offsets 0..48).
+define void @vec384_v3f32(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec384_v3f32: +; SCALAR: # %bb.0: +; SCALAR-NEXT: movq (%rdi), %rax +; SCALAR-NEXT: movq %rax, %rcx +; SCALAR-NEXT: shrq $32, %rcx +; SCALAR-NEXT: negl %ecx +; SCALAR-NEXT: shlq $32, %rcx +; SCALAR-NEXT: negl %eax +; SCALAR-NEXT: orq %rcx, %rax +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: subl 8(%rdi), %ecx +; SCALAR-NEXT: movl %ecx, 8(%rsi) +; SCALAR-NEXT: movq %rax, (%rsi) +; SCALAR-NEXT: movl %ecx, 24(%rsi) +; SCALAR-NEXT: movq %rax, 16(%rsi) +; SCALAR-NEXT: movl %ecx, 40(%rsi) +; SCALAR-NEXT: movq %rax, 32(%rsi) +; SCALAR-NEXT: movl %ecx, 56(%rsi) +; SCALAR-NEXT: movq %rax, 48(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-ONLY-LABEL: vec384_v3f32: +; SSE2-ONLY: # %bb.0: +; SSE2-ONLY-NEXT: pxor %xmm0, %xmm0 +; SSE2-ONLY-NEXT: psubd (%rdi), %xmm0 +; SSE2-ONLY-NEXT: movq %xmm0, (%rsi) +; SSE2-ONLY-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE2-ONLY-NEXT: movd %xmm1, 8(%rsi) +; SSE2-ONLY-NEXT: movq %xmm0, 16(%rsi) +; SSE2-ONLY-NEXT: movd %xmm1, 24(%rsi) +; SSE2-ONLY-NEXT: movq %xmm0, 32(%rsi) +; SSE2-ONLY-NEXT: movd %xmm1, 40(%rsi) +; SSE2-ONLY-NEXT: movq %xmm0, 48(%rsi) +; SSE2-ONLY-NEXT: movd %xmm1, 56(%rsi) +; SSE2-ONLY-NEXT: retq +; +; SSE3-LABEL: vec384_v3f32: +; SSE3: # %bb.0: +; SSE3-NEXT: pxor %xmm0, %xmm0 +; SSE3-NEXT: psubd (%rdi), %xmm0 +; SSE3-NEXT: movq %xmm0, (%rsi) +; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE3-NEXT: movd %xmm1, 8(%rsi) +; SSE3-NEXT: movq %xmm0, 16(%rsi) +; SSE3-NEXT: movd %xmm1, 24(%rsi) +; SSE3-NEXT: movq %xmm0, 32(%rsi) +; SSE3-NEXT: movd %xmm1, 40(%rsi) +; SSE3-NEXT: movq %xmm0, 48(%rsi) +; SSE3-NEXT: movd %xmm1, 56(%rsi) +; SSE3-NEXT: retq +; +; SSSE3-ONLY-LABEL: vec384_v3f32: 
+; SSSE3-ONLY: # %bb.0: +; SSSE3-ONLY-NEXT: pxor %xmm0, %xmm0 +; SSSE3-ONLY-NEXT: psubd (%rdi), %xmm0 +; SSSE3-ONLY-NEXT: movq %xmm0, (%rsi) +; SSSE3-ONLY-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSSE3-ONLY-NEXT: movd %xmm1, 8(%rsi) +; SSSE3-ONLY-NEXT: movq %xmm0, 16(%rsi) +; SSSE3-ONLY-NEXT: movd %xmm1, 24(%rsi) +; SSSE3-ONLY-NEXT: movq %xmm0, 32(%rsi) +; SSSE3-ONLY-NEXT: movd %xmm1, 40(%rsi) +; SSSE3-ONLY-NEXT: movq %xmm0, 48(%rsi) +; SSSE3-ONLY-NEXT: movd %xmm1, 56(%rsi) +; SSSE3-ONLY-NEXT: retq +; +; SSE41-LABEL: vec384_v3f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: psubd (%rdi), %xmm0 +; SSE41-NEXT: pextrd $2, %xmm0, 8(%rsi) +; SSE41-NEXT: movq %xmm0, (%rsi) +; SSE41-NEXT: pextrd $2, %xmm0, 24(%rsi) +; SSE41-NEXT: movq %xmm0, 16(%rsi) +; SSE41-NEXT: pextrd $2, %xmm0, 40(%rsi) +; SSE41-NEXT: movq %xmm0, 32(%rsi) +; SSE41-NEXT: pextrd $2, %xmm0, 56(%rsi) +; SSE41-NEXT: movq %xmm0, 48(%rsi) +; SSE41-NEXT: retq +; +; SSE42-LABEL: vec384_v3f32: +; SSE42: # %bb.0: +; SSE42-NEXT: pxor %xmm0, %xmm0 +; SSE42-NEXT: psubd (%rdi), %xmm0 +; SSE42-NEXT: pextrd $2, %xmm0, 8(%rsi) +; SSE42-NEXT: movq %xmm0, (%rsi) +; SSE42-NEXT: pextrd $2, %xmm0, 24(%rsi) +; SSE42-NEXT: movq %xmm0, 16(%rsi) +; SSE42-NEXT: pextrd $2, %xmm0, 40(%rsi) +; SSE42-NEXT: movq %xmm0, 32(%rsi) +; SSE42-NEXT: pextrd $2, %xmm0, 56(%rsi) +; SSE42-NEXT: movq %xmm0, 48(%rsi) +; SSE42-NEXT: retq +; +; AVX-LABEL: vec384_v3f32: +; AVX: # %bb.0: +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpsubd (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vpextrd $2, %xmm0, 8(%rsi) +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: vpextrd $2, %xmm0, 24(%rsi) +; AVX-NEXT: vmovq %xmm0, 16(%rsi) +; AVX-NEXT: vpextrd $2, %xmm0, 40(%rsi) +; AVX-NEXT: vmovq %xmm0, 32(%rsi) +; AVX-NEXT: vpextrd $2, %xmm0, 56(%rsi) +; AVX-NEXT: vmovq %xmm0, 48(%rsi) +; AVX-NEXT: retq + %in.subvec.neg = load <3 x i32>, ptr %in.subvec.ptr, align 64 + %in.subvec.int = sub <3 x i32> zeroinitializer, %in.subvec.neg + %in.subvec = 
bitcast <3 x i32> %in.subvec.int to <3 x float> + %out.subvec0.ptr = getelementptr <3 x float>, ptr %out.vec.ptr, i64 0 + store <3 x float> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <3 x float>, ptr %out.vec.ptr, i64 1 + store <3 x float> %in.subvec, ptr %out.subvec1.ptr, align 4 + %out.subvec2.ptr = getelementptr <3 x float>, ptr %out.vec.ptr, i64 2 + store <3 x float> %in.subvec, ptr %out.subvec2.ptr, align 8 + %out.subvec3.ptr = getelementptr <3 x float>, ptr %out.vec.ptr, i64 3 + store <3 x float> %in.subvec, ptr %out.subvec3.ptr, align 4 + ret void +} + +define void @vec384_v3i64(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec384_v3i64: +; SCALAR: # %bb.0: +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: subq 16(%rdi), %rcx +; SCALAR-NEXT: xorl %edx, %edx +; SCALAR-NEXT: subq 8(%rdi), %rdx +; SCALAR-NEXT: subq (%rdi), %rax +; SCALAR-NEXT: movq %rax, (%rsi) +; SCALAR-NEXT: movq %rdx, 8(%rsi) +; SCALAR-NEXT: movq %rcx, 16(%rsi) +; SCALAR-NEXT: movq %rcx, 48(%rsi) +; SCALAR-NEXT: movq %rdx, 40(%rsi) +; SCALAR-NEXT: movq %rax, 32(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec384_v3i64: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: psubq (%rdi), %xmm0 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: subq 16(%rdi), %rax +; SSE2-NEXT: movq %rax, 16(%rsi) +; SSE2-NEXT: movdqa %xmm0, (%rsi) +; SSE2-NEXT: movq %rax, 48(%rsi) +; SSE2-NEXT: movdqu %xmm0, 32(%rsi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: vec384_v3i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq (%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, (%rsi) +; AVX1-NEXT: xorl %eax, %eax +; AVX1-NEXT: subq 16(%rdi), %rax +; AVX1-NEXT: movq %rax, 16(%rsi) +; AVX1-NEXT: movq %rax, 48(%rsi) +; AVX1-NEXT: vmovdqu %xmm0, 32(%rsi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec384_v3i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpsubq (%rdi), %ymm0, %ymm0 +; 
AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vmovq %xmm1, 16(%rsi) +; AVX2-NEXT: vmovdqa %xmm0, (%rsi) +; AVX2-NEXT: vmovq %xmm1, 48(%rsi) +; AVX2-NEXT: vmovdqu %xmm0, 32(%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq + %in.subvec.neg = load <3 x i64>, ptr %in.subvec.ptr, align 64 + %in.subvec = sub <3 x i64> zeroinitializer, %in.subvec.neg + %out.subvec0.ptr = getelementptr <3 x i64>, ptr %out.vec.ptr, i64 0 + store <3 x i64> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <3 x i64>, ptr %out.vec.ptr, i64 1 + store <3 x i64> %in.subvec, ptr %out.subvec1.ptr, align 8 + ret void +} + +define void @vec384_v3f64(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec384_v3f64: +; SCALAR: # %bb.0: +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: subq 16(%rdi), %rcx +; SCALAR-NEXT: xorl %edx, %edx +; SCALAR-NEXT: subq 8(%rdi), %rdx +; SCALAR-NEXT: subq (%rdi), %rax +; SCALAR-NEXT: movq %rax, (%rsi) +; SCALAR-NEXT: movq %rdx, 8(%rsi) +; SCALAR-NEXT: movq %rcx, 16(%rsi) +; SCALAR-NEXT: movq %rcx, 48(%rsi) +; SCALAR-NEXT: movq %rdx, 40(%rsi) +; SCALAR-NEXT: movq %rax, 32(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec384_v3f64: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: psubq (%rdi), %xmm0 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: subq 16(%rdi), %rax +; SSE2-NEXT: movq %rax, 16(%rsi) +; SSE2-NEXT: movdqa %xmm0, (%rsi) +; SSE2-NEXT: movq %rax, 48(%rsi) +; SSE2-NEXT: movdqu %xmm0, 32(%rsi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: vec384_v3f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq (%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, (%rsi) +; AVX1-NEXT: xorl %eax, %eax +; AVX1-NEXT: subq 16(%rdi), %rax +; AVX1-NEXT: movq %rax, 16(%rsi) +; AVX1-NEXT: movq %rax, 48(%rsi) +; AVX1-NEXT: vmovdqu %xmm0, 32(%rsi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec384_v3f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: 
vpsubq (%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vmovq %xmm1, 16(%rsi) +; AVX2-NEXT: vmovdqa %xmm0, (%rsi) +; AVX2-NEXT: vmovq %xmm1, 48(%rsi) +; AVX2-NEXT: vmovdqu %xmm0, 32(%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq + %in.subvec.neg = load <3 x i64>, ptr %in.subvec.ptr, align 64 + %in.subvec.int = sub <3 x i64> zeroinitializer, %in.subvec.neg + %in.subvec = bitcast <3 x i64> %in.subvec.int to <3 x double> + %out.subvec0.ptr = getelementptr <3 x double>, ptr %out.vec.ptr, i64 0 + store <3 x double> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <3 x double>, ptr %out.vec.ptr, i64 1 + store <3 x double> %in.subvec, ptr %out.subvec1.ptr, align 8 + ret void +} + +define void @vec384_v4i8(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec384_v4i8: +; SCALAR: # %bb.0: +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: subb (%rdi), %al +; SCALAR-NEXT: xorl %edx, %edx +; SCALAR-NEXT: subb 1(%rdi), %dl +; SCALAR-NEXT: xorl %r8d, %r8d +; SCALAR-NEXT: subb 2(%rdi), %r8b +; SCALAR-NEXT: subb 3(%rdi), %cl +; SCALAR-NEXT: movb %cl, 3(%rsi) +; SCALAR-NEXT: movb %r8b, 2(%rsi) +; SCALAR-NEXT: movb %dl, 1(%rsi) +; SCALAR-NEXT: movb %al, (%rsi) +; SCALAR-NEXT: movb %cl, 7(%rsi) +; SCALAR-NEXT: movb %r8b, 6(%rsi) +; SCALAR-NEXT: movb %dl, 5(%rsi) +; SCALAR-NEXT: movb %al, 4(%rsi) +; SCALAR-NEXT: movb %cl, 11(%rsi) +; SCALAR-NEXT: movb %r8b, 10(%rsi) +; SCALAR-NEXT: movb %dl, 9(%rsi) +; SCALAR-NEXT: movb %al, 8(%rsi) +; SCALAR-NEXT: movb %cl, 15(%rsi) +; SCALAR-NEXT: movb %r8b, 14(%rsi) +; SCALAR-NEXT: movb %dl, 13(%rsi) +; SCALAR-NEXT: movb %al, 12(%rsi) +; SCALAR-NEXT: movb %cl, 19(%rsi) +; SCALAR-NEXT: movb %r8b, 18(%rsi) +; SCALAR-NEXT: movb %dl, 17(%rsi) +; SCALAR-NEXT: movb %al, 16(%rsi) +; SCALAR-NEXT: movb %cl, 23(%rsi) +; SCALAR-NEXT: movb %r8b, 22(%rsi) +; SCALAR-NEXT: movb %dl, 21(%rsi) +; SCALAR-NEXT: movb %al, 20(%rsi) +; SCALAR-NEXT: movb %cl, 
27(%rsi) +; SCALAR-NEXT: movb %r8b, 26(%rsi) +; SCALAR-NEXT: movb %dl, 25(%rsi) +; SCALAR-NEXT: movb %al, 24(%rsi) +; SCALAR-NEXT: movb %cl, 31(%rsi) +; SCALAR-NEXT: movb %r8b, 30(%rsi) +; SCALAR-NEXT: movb %dl, 29(%rsi) +; SCALAR-NEXT: movb %al, 28(%rsi) +; SCALAR-NEXT: movb %cl, 35(%rsi) +; SCALAR-NEXT: movb %r8b, 34(%rsi) +; SCALAR-NEXT: movb %dl, 33(%rsi) +; SCALAR-NEXT: movb %al, 32(%rsi) +; SCALAR-NEXT: movb %cl, 39(%rsi) +; SCALAR-NEXT: movb %r8b, 38(%rsi) +; SCALAR-NEXT: movb %dl, 37(%rsi) +; SCALAR-NEXT: movb %al, 36(%rsi) +; SCALAR-NEXT: movb %cl, 43(%rsi) +; SCALAR-NEXT: movb %r8b, 42(%rsi) +; SCALAR-NEXT: movb %dl, 41(%rsi) +; SCALAR-NEXT: movb %al, 40(%rsi) +; SCALAR-NEXT: movb %cl, 47(%rsi) +; SCALAR-NEXT: movb %r8b, 46(%rsi) +; SCALAR-NEXT: movb %dl, 45(%rsi) +; SCALAR-NEXT: movb %al, 44(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec384_v4i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[0,0,0,0] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psubb %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, (%rsi) +; SSE2-NEXT: movdqa %xmm1, 16(%rsi) +; SSE2-NEXT: movdqa %xmm1, 32(%rsi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: vec384_v4i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,0,0,0] +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpsubb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 +; AVX1-NEXT: vmovaps %ymm1, (%rsi) +; AVX1-NEXT: vmovdqa %xmm0, 32(%rsi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec384_v4i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpsubb (%rdi), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-NEXT: vmovdqa %xmm0, 32(%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq + %in.subvec.neg = load <4 x i8>, ptr %in.subvec.ptr, align 64 + %in.subvec = sub <4 x i8> zeroinitializer, %in.subvec.neg + %out.subvec0.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 0 + store <4 x i8> %in.subvec, ptr 
%out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 1 + store <4 x i8> %in.subvec, ptr %out.subvec1.ptr, align 4 + %out.subvec2.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 2 + store <4 x i8> %in.subvec, ptr %out.subvec2.ptr, align 8 + %out.subvec3.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 3 + store <4 x i8> %in.subvec, ptr %out.subvec3.ptr, align 4 + %out.subvec4.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 4 + store <4 x i8> %in.subvec, ptr %out.subvec4.ptr, align 16 + %out.subvec5.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 5 + store <4 x i8> %in.subvec, ptr %out.subvec5.ptr, align 4 + %out.subvec6.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 6 + store <4 x i8> %in.subvec, ptr %out.subvec6.ptr, align 8 + %out.subvec7.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 7 + store <4 x i8> %in.subvec, ptr %out.subvec7.ptr, align 4 + %out.subvec8.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 8 + store <4 x i8> %in.subvec, ptr %out.subvec8.ptr, align 32 + %out.subvec9.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 9 + store <4 x i8> %in.subvec, ptr %out.subvec9.ptr, align 4 + %out.subvec10.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 10 + store <4 x i8> %in.subvec, ptr %out.subvec10.ptr, align 8 + %out.subvec11.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 11 + store <4 x i8> %in.subvec, ptr %out.subvec11.ptr, align 4 + ret void +} + +define void @vec384_v4i16(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec384_v4i16: +; SCALAR: # %bb.0: +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: subw (%rdi), %ax +; SCALAR-NEXT: xorl %edx, %edx +; SCALAR-NEXT: subw 2(%rdi), %dx +; SCALAR-NEXT: xorl %r8d, %r8d +; SCALAR-NEXT: subw 4(%rdi), %r8w +; SCALAR-NEXT: subw 6(%rdi), %cx +; SCALAR-NEXT: movw %cx, 6(%rsi) +; SCALAR-NEXT: movw %r8w, 4(%rsi) +; SCALAR-NEXT: movw %dx, 2(%rsi) +; SCALAR-NEXT: movw %ax, (%rsi) +; 
SCALAR-NEXT: movw %cx, 14(%rsi) +; SCALAR-NEXT: movw %r8w, 12(%rsi) +; SCALAR-NEXT: movw %dx, 10(%rsi) +; SCALAR-NEXT: movw %ax, 8(%rsi) +; SCALAR-NEXT: movw %cx, 22(%rsi) +; SCALAR-NEXT: movw %r8w, 20(%rsi) +; SCALAR-NEXT: movw %dx, 18(%rsi) +; SCALAR-NEXT: movw %ax, 16(%rsi) +; SCALAR-NEXT: movw %cx, 30(%rsi) +; SCALAR-NEXT: movw %r8w, 28(%rsi) +; SCALAR-NEXT: movw %dx, 26(%rsi) +; SCALAR-NEXT: movw %ax, 24(%rsi) +; SCALAR-NEXT: movw %cx, 38(%rsi) +; SCALAR-NEXT: movw %r8w, 36(%rsi) +; SCALAR-NEXT: movw %dx, 34(%rsi) +; SCALAR-NEXT: movw %ax, 32(%rsi) +; SCALAR-NEXT: movw %cx, 46(%rsi) +; SCALAR-NEXT: movw %r8w, 44(%rsi) +; SCALAR-NEXT: movw %dx, 42(%rsi) +; SCALAR-NEXT: movw %ax, 40(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec384_v4i16: +; SSE2: # %bb.0: +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psubw %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, (%rsi) +; SSE2-NEXT: movdqa %xmm1, 16(%rsi) +; SSE2-NEXT: movdqa %xmm1, 32(%rsi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: vec384_v4i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 +; AVX1-NEXT: vmovaps %ymm1, (%rsi) +; AVX1-NEXT: vmovdqa %xmm0, 32(%rsi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec384_v4i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsubw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-NEXT: vmovdqa %xmm0, 32(%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq + %in.subvec.neg = load <4 x i16>, ptr %in.subvec.ptr, align 64 + %in.subvec = sub <4 x i16> zeroinitializer, %in.subvec.neg + %out.subvec0.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 0 + store <4 x i16> %in.subvec, ptr %out.subvec0.ptr, align 
64 + %out.subvec1.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 1 + store <4 x i16> %in.subvec, ptr %out.subvec1.ptr, align 8 + %out.subvec2.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 2 + store <4 x i16> %in.subvec, ptr %out.subvec2.ptr, align 16 + %out.subvec3.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 3 + store <4 x i16> %in.subvec, ptr %out.subvec3.ptr, align 8 + %out.subvec4.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 4 + store <4 x i16> %in.subvec, ptr %out.subvec4.ptr, align 32 + %out.subvec5.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 5 + store <4 x i16> %in.subvec, ptr %out.subvec5.ptr, align 8 + ret void +} + +define void @vec384_v4i32(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec384_v4i32: +; SCALAR: # %bb.0: +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: subl (%rdi), %ecx +; SCALAR-NEXT: xorl %edx, %edx +; SCALAR-NEXT: subl 4(%rdi), %edx +; SCALAR-NEXT: xorl %r8d, %r8d +; SCALAR-NEXT: subl 8(%rdi), %r8d +; SCALAR-NEXT: subl 12(%rdi), %eax +; SCALAR-NEXT: movl %eax, 12(%rsi) +; SCALAR-NEXT: movl %r8d, 8(%rsi) +; SCALAR-NEXT: movl %edx, 4(%rsi) +; SCALAR-NEXT: movl %ecx, (%rsi) +; SCALAR-NEXT: movl %eax, 28(%rsi) +; SCALAR-NEXT: movl %r8d, 24(%rsi) +; SCALAR-NEXT: movl %edx, 20(%rsi) +; SCALAR-NEXT: movl %ecx, 16(%rsi) +; SCALAR-NEXT: movl %eax, 44(%rsi) +; SCALAR-NEXT: movl %r8d, 40(%rsi) +; SCALAR-NEXT: movl %edx, 36(%rsi) +; SCALAR-NEXT: movl %ecx, 32(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec384_v4i32: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: psubd (%rdi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, (%rsi) +; SSE2-NEXT: movdqa %xmm0, 16(%rsi) +; SSE2-NEXT: movdqa %xmm0, 32(%rsi) +; SSE2-NEXT: retq +; +; AVX-LABEL: vec384_v4i32: +; AVX: # %bb.0: +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpsubd (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rsi) +; AVX-NEXT: vmovdqa %xmm0, 16(%rsi) +; AVX-NEXT: vmovdqa %xmm0, 32(%rsi) +; 
AVX-NEXT: retq + %in.subvec.neg = load <4 x i32>, ptr %in.subvec.ptr, align 64 + %in.subvec = sub <4 x i32> zeroinitializer, %in.subvec.neg + %out.subvec0.ptr = getelementptr <4 x i32>, ptr %out.vec.ptr, i64 0 + store <4 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <4 x i32>, ptr %out.vec.ptr, i64 1 + store <4 x i32> %in.subvec, ptr %out.subvec1.ptr, align 16 + %out.subvec2.ptr = getelementptr <4 x i32>, ptr %out.vec.ptr, i64 2 + store <4 x i32> %in.subvec, ptr %out.subvec2.ptr, align 32 + ret void +} + +define void @vec384_v4f32(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec384_v4f32: +; SCALAR: # %bb.0: +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: subl (%rdi), %ecx +; SCALAR-NEXT: xorl %edx, %edx +; SCALAR-NEXT: subl 4(%rdi), %edx +; SCALAR-NEXT: xorl %r8d, %r8d +; SCALAR-NEXT: subl 8(%rdi), %r8d +; SCALAR-NEXT: subl 12(%rdi), %eax +; SCALAR-NEXT: movl %eax, 12(%rsi) +; SCALAR-NEXT: movl %r8d, 8(%rsi) +; SCALAR-NEXT: movl %edx, 4(%rsi) +; SCALAR-NEXT: movl %ecx, (%rsi) +; SCALAR-NEXT: movl %eax, 28(%rsi) +; SCALAR-NEXT: movl %r8d, 24(%rsi) +; SCALAR-NEXT: movl %edx, 20(%rsi) +; SCALAR-NEXT: movl %ecx, 16(%rsi) +; SCALAR-NEXT: movl %eax, 44(%rsi) +; SCALAR-NEXT: movl %r8d, 40(%rsi) +; SCALAR-NEXT: movl %edx, 36(%rsi) +; SCALAR-NEXT: movl %ecx, 32(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec384_v4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: psubd (%rdi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, (%rsi) +; SSE2-NEXT: movdqa %xmm0, 16(%rsi) +; SSE2-NEXT: movdqa %xmm0, 32(%rsi) +; SSE2-NEXT: retq +; +; AVX-LABEL: vec384_v4f32: +; AVX: # %bb.0: +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpsubd (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rsi) +; AVX-NEXT: vmovdqa %xmm0, 16(%rsi) +; AVX-NEXT: vmovdqa %xmm0, 32(%rsi) +; AVX-NEXT: retq + %in.subvec.neg = load <4 x i32>, ptr %in.subvec.ptr, align 64 + %in.subvec.int = sub <4 x i32> 
zeroinitializer, %in.subvec.neg + %in.subvec = bitcast <4 x i32> %in.subvec.int to <4 x float> + %out.subvec0.ptr = getelementptr <4 x float>, ptr %out.vec.ptr, i64 0 + store <4 x float> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <4 x float>, ptr %out.vec.ptr, i64 1 + store <4 x float> %in.subvec, ptr %out.subvec1.ptr, align 16 + %out.subvec2.ptr = getelementptr <4 x float>, ptr %out.vec.ptr, i64 2 + store <4 x float> %in.subvec, ptr %out.subvec2.ptr, align 32 + ret void +} + +define void @vec384_v6i8(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec384_v6i8: +; SCALAR: # %bb.0: +; SCALAR-NEXT: movq (%rdi), %rdx +; SCALAR-NEXT: movq %rdx, %rax +; SCALAR-NEXT: shrq $40, %rax +; SCALAR-NEXT: movq %rdx, %rcx +; SCALAR-NEXT: shrq $32, %rcx +; SCALAR-NEXT: movl %edx, %edi +; SCALAR-NEXT: shrl $24, %edi +; SCALAR-NEXT: movl %edx, %r8d +; SCALAR-NEXT: shrl $16, %r8d +; SCALAR-NEXT: movl %edx, %r9d +; SCALAR-NEXT: shrl $8, %r9d +; SCALAR-NEXT: negb %dl +; SCALAR-NEXT: movzbl %dl, %edx +; SCALAR-NEXT: negb %r9b +; SCALAR-NEXT: movzbl %r9b, %r9d +; SCALAR-NEXT: shll $8, %r9d +; SCALAR-NEXT: orl %edx, %r9d +; SCALAR-NEXT: negb %r8b +; SCALAR-NEXT: movzbl %r8b, %edx +; SCALAR-NEXT: negb %dil +; SCALAR-NEXT: movzbl %dil, %edi +; SCALAR-NEXT: shll $8, %edi +; SCALAR-NEXT: orl %edx, %edi +; SCALAR-NEXT: negb %cl +; SCALAR-NEXT: movzbl %cl, %ecx +; SCALAR-NEXT: negb %al +; SCALAR-NEXT: movzbl %al, %eax +; SCALAR-NEXT: shll $8, %eax +; SCALAR-NEXT: orl %ecx, %eax +; SCALAR-NEXT: movw %ax, 4(%rsi) +; SCALAR-NEXT: shll $16, %edi +; SCALAR-NEXT: movzwl %r9w, %ecx +; SCALAR-NEXT: orl %edi, %ecx +; SCALAR-NEXT: movl %ecx, (%rsi) +; SCALAR-NEXT: movw %ax, 12(%rsi) +; SCALAR-NEXT: movl %ecx, 8(%rsi) +; SCALAR-NEXT: movw %ax, 20(%rsi) +; SCALAR-NEXT: movl %ecx, 16(%rsi) +; SCALAR-NEXT: movw %ax, 28(%rsi) +; SCALAR-NEXT: movl %ecx, 24(%rsi) +; SCALAR-NEXT: movw %ax, 36(%rsi) +; SCALAR-NEXT: movl %ecx, 32(%rsi) +; SCALAR-NEXT: movw %ax, 
44(%rsi) +; SCALAR-NEXT: movl %ecx, 40(%rsi) +; SCALAR-NEXT: movw %ax, 52(%rsi) +; SCALAR-NEXT: movl %ecx, 48(%rsi) +; SCALAR-NEXT: movw %ax, 60(%rsi) +; SCALAR-NEXT: movl %ecx, 56(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-ONLY-LABEL: vec384_v6i8: +; SSE2-ONLY: # %bb.0: +; SSE2-ONLY-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-ONLY-NEXT: pxor %xmm1, %xmm1 +; SSE2-ONLY-NEXT: psubb %xmm0, %xmm1 +; SSE2-ONLY-NEXT: movd %xmm1, (%rsi) +; SSE2-ONLY-NEXT: pextrw $2, %xmm1, %eax +; SSE2-ONLY-NEXT: movw %ax, 4(%rsi) +; SSE2-ONLY-NEXT: movd %xmm1, 8(%rsi) +; SSE2-ONLY-NEXT: movw %ax, 12(%rsi) +; SSE2-ONLY-NEXT: movd %xmm1, 16(%rsi) +; SSE2-ONLY-NEXT: movw %ax, 20(%rsi) +; SSE2-ONLY-NEXT: movd %xmm1, 24(%rsi) +; SSE2-ONLY-NEXT: movw %ax, 28(%rsi) +; SSE2-ONLY-NEXT: movd %xmm1, 32(%rsi) +; SSE2-ONLY-NEXT: movw %ax, 36(%rsi) +; SSE2-ONLY-NEXT: movd %xmm1, 40(%rsi) +; SSE2-ONLY-NEXT: movw %ax, 44(%rsi) +; SSE2-ONLY-NEXT: movd %xmm1, 48(%rsi) +; SSE2-ONLY-NEXT: movw %ax, 52(%rsi) +; SSE2-ONLY-NEXT: movd %xmm1, 56(%rsi) +; SSE2-ONLY-NEXT: movw %ax, 60(%rsi) +; SSE2-ONLY-NEXT: retq +; +; SSE3-LABEL: vec384_v6i8: +; SSE3: # %bb.0: +; SSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE3-NEXT: pxor %xmm1, %xmm1 +; SSE3-NEXT: psubb %xmm0, %xmm1 +; SSE3-NEXT: movd %xmm1, (%rsi) +; SSE3-NEXT: pextrw $2, %xmm1, %eax +; SSE3-NEXT: movw %ax, 4(%rsi) +; SSE3-NEXT: movd %xmm1, 8(%rsi) +; SSE3-NEXT: movw %ax, 12(%rsi) +; SSE3-NEXT: movd %xmm1, 16(%rsi) +; SSE3-NEXT: movw %ax, 20(%rsi) +; SSE3-NEXT: movd %xmm1, 24(%rsi) +; SSE3-NEXT: movw %ax, 28(%rsi) +; SSE3-NEXT: movd %xmm1, 32(%rsi) +; SSE3-NEXT: movw %ax, 36(%rsi) +; SSE3-NEXT: movd %xmm1, 40(%rsi) +; SSE3-NEXT: movw %ax, 44(%rsi) +; SSE3-NEXT: movd %xmm1, 48(%rsi) +; SSE3-NEXT: movw %ax, 52(%rsi) +; SSE3-NEXT: movd %xmm1, 56(%rsi) +; SSE3-NEXT: movw %ax, 60(%rsi) +; SSE3-NEXT: retq +; +; SSSE3-ONLY-LABEL: vec384_v6i8: +; SSSE3-ONLY: # %bb.0: +; SSSE3-ONLY-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSSE3-ONLY-NEXT: pxor %xmm1, %xmm1 +; 
SSSE3-ONLY-NEXT: psubb %xmm0, %xmm1 +; SSSE3-ONLY-NEXT: movd %xmm1, (%rsi) +; SSSE3-ONLY-NEXT: pextrw $2, %xmm1, %eax +; SSSE3-ONLY-NEXT: movw %ax, 4(%rsi) +; SSSE3-ONLY-NEXT: movd %xmm1, 8(%rsi) +; SSSE3-ONLY-NEXT: movw %ax, 12(%rsi) +; SSSE3-ONLY-NEXT: movd %xmm1, 16(%rsi) +; SSSE3-ONLY-NEXT: movw %ax, 20(%rsi) +; SSSE3-ONLY-NEXT: movd %xmm1, 24(%rsi) +; SSSE3-ONLY-NEXT: movw %ax, 28(%rsi) +; SSSE3-ONLY-NEXT: movd %xmm1, 32(%rsi) +; SSSE3-ONLY-NEXT: movw %ax, 36(%rsi) +; SSSE3-ONLY-NEXT: movd %xmm1, 40(%rsi) +; SSSE3-ONLY-NEXT: movw %ax, 44(%rsi) +; SSSE3-ONLY-NEXT: movd %xmm1, 48(%rsi) +; SSSE3-ONLY-NEXT: movw %ax, 52(%rsi) +; SSSE3-ONLY-NEXT: movd %xmm1, 56(%rsi) +; SSSE3-ONLY-NEXT: movw %ax, 60(%rsi) +; SSSE3-ONLY-NEXT: retq +; +; SSE41-LABEL: vec384_v6i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: psubb %xmm0, %xmm1 +; SSE41-NEXT: pextrw $2, %xmm1, 4(%rsi) +; SSE41-NEXT: movd %xmm1, (%rsi) +; SSE41-NEXT: pextrw $2, %xmm1, 12(%rsi) +; SSE41-NEXT: movd %xmm1, 8(%rsi) +; SSE41-NEXT: pextrw $2, %xmm1, 20(%rsi) +; SSE41-NEXT: movd %xmm1, 16(%rsi) +; SSE41-NEXT: pextrw $2, %xmm1, 28(%rsi) +; SSE41-NEXT: movd %xmm1, 24(%rsi) +; SSE41-NEXT: pextrw $2, %xmm1, 36(%rsi) +; SSE41-NEXT: movd %xmm1, 32(%rsi) +; SSE41-NEXT: pextrw $2, %xmm1, 44(%rsi) +; SSE41-NEXT: movd %xmm1, 40(%rsi) +; SSE41-NEXT: pextrw $2, %xmm1, 52(%rsi) +; SSE41-NEXT: movd %xmm1, 48(%rsi) +; SSE41-NEXT: pextrw $2, %xmm1, 60(%rsi) +; SSE41-NEXT: movd %xmm1, 56(%rsi) +; SSE41-NEXT: retq +; +; SSE42-LABEL: vec384_v6i8: +; SSE42: # %bb.0: +; SSE42-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE42-NEXT: pxor %xmm1, %xmm1 +; SSE42-NEXT: psubb %xmm0, %xmm1 +; SSE42-NEXT: pextrw $2, %xmm1, 4(%rsi) +; SSE42-NEXT: movd %xmm1, (%rsi) +; SSE42-NEXT: pextrw $2, %xmm1, 12(%rsi) +; SSE42-NEXT: movd %xmm1, 8(%rsi) +; SSE42-NEXT: pextrw $2, %xmm1, 20(%rsi) +; SSE42-NEXT: movd %xmm1, 16(%rsi) +; SSE42-NEXT: pextrw $2, %xmm1, 28(%rsi) +; SSE42-NEXT: 
movd %xmm1, 24(%rsi) +; SSE42-NEXT: pextrw $2, %xmm1, 36(%rsi) +; SSE42-NEXT: movd %xmm1, 32(%rsi) +; SSE42-NEXT: pextrw $2, %xmm1, 44(%rsi) +; SSE42-NEXT: movd %xmm1, 40(%rsi) +; SSE42-NEXT: pextrw $2, %xmm1, 52(%rsi) +; SSE42-NEXT: movd %xmm1, 48(%rsi) +; SSE42-NEXT: pextrw $2, %xmm1, 60(%rsi) +; SSE42-NEXT: movd %xmm1, 56(%rsi) +; SSE42-NEXT: retq +; +; AVX-LABEL: vec384_v6i8: +; AVX: # %bb.0: +; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpsubb %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpextrw $2, %xmm0, 4(%rsi) +; AVX-NEXT: vmovd %xmm0, (%rsi) +; AVX-NEXT: vpextrw $2, %xmm0, 12(%rsi) +; AVX-NEXT: vmovd %xmm0, 8(%rsi) +; AVX-NEXT: vpextrw $2, %xmm0, 20(%rsi) +; AVX-NEXT: vmovd %xmm0, 16(%rsi) +; AVX-NEXT: vpextrw $2, %xmm0, 28(%rsi) +; AVX-NEXT: vmovd %xmm0, 24(%rsi) +; AVX-NEXT: vpextrw $2, %xmm0, 36(%rsi) +; AVX-NEXT: vmovd %xmm0, 32(%rsi) +; AVX-NEXT: vpextrw $2, %xmm0, 44(%rsi) +; AVX-NEXT: vmovd %xmm0, 40(%rsi) +; AVX-NEXT: vpextrw $2, %xmm0, 52(%rsi) +; AVX-NEXT: vmovd %xmm0, 48(%rsi) +; AVX-NEXT: vpextrw $2, %xmm0, 60(%rsi) +; AVX-NEXT: vmovd %xmm0, 56(%rsi) +; AVX-NEXT: retq + %in.subvec.neg = load <6 x i8>, ptr %in.subvec.ptr, align 64 + %in.subvec = sub <6 x i8> zeroinitializer, %in.subvec.neg + %out.subvec0.ptr = getelementptr <6 x i8>, ptr %out.vec.ptr, i64 0 + store <6 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <6 x i8>, ptr %out.vec.ptr, i64 1 + store <6 x i8> %in.subvec, ptr %out.subvec1.ptr, align 2 + %out.subvec2.ptr = getelementptr <6 x i8>, ptr %out.vec.ptr, i64 2 + store <6 x i8> %in.subvec, ptr %out.subvec2.ptr, align 4 + %out.subvec3.ptr = getelementptr <6 x i8>, ptr %out.vec.ptr, i64 3 + store <6 x i8> %in.subvec, ptr %out.subvec3.ptr, align 2 + %out.subvec4.ptr = getelementptr <6 x i8>, ptr %out.vec.ptr, i64 4 + store <6 x i8> %in.subvec, ptr %out.subvec4.ptr, align 8 + %out.subvec5.ptr = getelementptr <6 x i8>, ptr %out.vec.ptr, i64 5 + store <6 x i8> 
%in.subvec, ptr %out.subvec5.ptr, align 2 + %out.subvec6.ptr = getelementptr <6 x i8>, ptr %out.vec.ptr, i64 6 + store <6 x i8> %in.subvec, ptr %out.subvec6.ptr, align 4 + %out.subvec7.ptr = getelementptr <6 x i8>, ptr %out.vec.ptr, i64 7 + store <6 x i8> %in.subvec, ptr %out.subvec7.ptr, align 2 + ret void +} + +define void @vec384_v6i16(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec384_v6i16: +; SCALAR: # %bb.0: +; SCALAR-NEXT: movl 8(%rdi), %eax +; SCALAR-NEXT: movq (%rdi), %rdx +; SCALAR-NEXT: movq %rdx, %rcx +; SCALAR-NEXT: shrq $32, %rcx +; SCALAR-NEXT: movq %rdx, %rdi +; SCALAR-NEXT: shrq $48, %rdi +; SCALAR-NEXT: negl %edi +; SCALAR-NEXT: shll $16, %edi +; SCALAR-NEXT: negl %ecx +; SCALAR-NEXT: movzwl %cx, %ecx +; SCALAR-NEXT: orl %edi, %ecx +; SCALAR-NEXT: movl %edx, %edi +; SCALAR-NEXT: andl $-65536, %edi # imm = 0xFFFF0000 +; SCALAR-NEXT: negl %edx +; SCALAR-NEXT: movzwl %dx, %edx +; SCALAR-NEXT: subl %edi, %edx +; SCALAR-NEXT: movl %eax, %edi +; SCALAR-NEXT: andl $-65536, %edi # imm = 0xFFFF0000 +; SCALAR-NEXT: negl %eax +; SCALAR-NEXT: movzwl %ax, %eax +; SCALAR-NEXT: subl %edi, %eax +; SCALAR-NEXT: movl %eax, 8(%rsi) +; SCALAR-NEXT: shlq $32, %rcx +; SCALAR-NEXT: orq %rdx, %rcx +; SCALAR-NEXT: movq %rcx, (%rsi) +; SCALAR-NEXT: movl %eax, 24(%rsi) +; SCALAR-NEXT: movq %rcx, 16(%rsi) +; SCALAR-NEXT: movl %eax, 40(%rsi) +; SCALAR-NEXT: movq %rcx, 32(%rsi) +; SCALAR-NEXT: movl %eax, 56(%rsi) +; SCALAR-NEXT: movq %rcx, 48(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-ONLY-LABEL: vec384_v6i16: +; SSE2-ONLY: # %bb.0: +; SSE2-ONLY-NEXT: pxor %xmm0, %xmm0 +; SSE2-ONLY-NEXT: psubw (%rdi), %xmm0 +; SSE2-ONLY-NEXT: movq %xmm0, (%rsi) +; SSE2-ONLY-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE2-ONLY-NEXT: movd %xmm1, 8(%rsi) +; SSE2-ONLY-NEXT: movq %xmm0, 16(%rsi) +; SSE2-ONLY-NEXT: movd %xmm1, 24(%rsi) +; SSE2-ONLY-NEXT: movq %xmm0, 32(%rsi) +; SSE2-ONLY-NEXT: movd %xmm1, 40(%rsi) +; SSE2-ONLY-NEXT: movq %xmm0, 48(%rsi) +; SSE2-ONLY-NEXT: movd 
%xmm1, 56(%rsi) +; SSE2-ONLY-NEXT: retq +; +; SSE3-LABEL: vec384_v6i16: +; SSE3: # %bb.0: +; SSE3-NEXT: pxor %xmm0, %xmm0 +; SSE3-NEXT: psubw (%rdi), %xmm0 +; SSE3-NEXT: movq %xmm0, (%rsi) +; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE3-NEXT: movd %xmm1, 8(%rsi) +; SSE3-NEXT: movq %xmm0, 16(%rsi) +; SSE3-NEXT: movd %xmm1, 24(%rsi) +; SSE3-NEXT: movq %xmm0, 32(%rsi) +; SSE3-NEXT: movd %xmm1, 40(%rsi) +; SSE3-NEXT: movq %xmm0, 48(%rsi) +; SSE3-NEXT: movd %xmm1, 56(%rsi) +; SSE3-NEXT: retq +; +; SSSE3-ONLY-LABEL: vec384_v6i16: +; SSSE3-ONLY: # %bb.0: +; SSSE3-ONLY-NEXT: pxor %xmm0, %xmm0 +; SSSE3-ONLY-NEXT: psubw (%rdi), %xmm0 +; SSSE3-ONLY-NEXT: movq %xmm0, (%rsi) +; SSSE3-ONLY-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSSE3-ONLY-NEXT: movd %xmm1, 8(%rsi) +; SSSE3-ONLY-NEXT: movq %xmm0, 16(%rsi) +; SSSE3-ONLY-NEXT: movd %xmm1, 24(%rsi) +; SSSE3-ONLY-NEXT: movq %xmm0, 32(%rsi) +; SSSE3-ONLY-NEXT: movd %xmm1, 40(%rsi) +; SSSE3-ONLY-NEXT: movq %xmm0, 48(%rsi) +; SSSE3-ONLY-NEXT: movd %xmm1, 56(%rsi) +; SSSE3-ONLY-NEXT: retq +; +; SSE41-LABEL: vec384_v6i16: +; SSE41: # %bb.0: +; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: psubw (%rdi), %xmm0 +; SSE41-NEXT: pextrd $2, %xmm0, 8(%rsi) +; SSE41-NEXT: movq %xmm0, (%rsi) +; SSE41-NEXT: pextrd $2, %xmm0, 24(%rsi) +; SSE41-NEXT: movq %xmm0, 16(%rsi) +; SSE41-NEXT: pextrd $2, %xmm0, 40(%rsi) +; SSE41-NEXT: movq %xmm0, 32(%rsi) +; SSE41-NEXT: pextrd $2, %xmm0, 56(%rsi) +; SSE41-NEXT: movq %xmm0, 48(%rsi) +; SSE41-NEXT: retq +; +; SSE42-LABEL: vec384_v6i16: +; SSE42: # %bb.0: +; SSE42-NEXT: pxor %xmm0, %xmm0 +; SSE42-NEXT: psubw (%rdi), %xmm0 +; SSE42-NEXT: pextrd $2, %xmm0, 8(%rsi) +; SSE42-NEXT: movq %xmm0, (%rsi) +; SSE42-NEXT: pextrd $2, %xmm0, 24(%rsi) +; SSE42-NEXT: movq %xmm0, 16(%rsi) +; SSE42-NEXT: pextrd $2, %xmm0, 40(%rsi) +; SSE42-NEXT: movq %xmm0, 32(%rsi) +; SSE42-NEXT: pextrd $2, %xmm0, 56(%rsi) +; SSE42-NEXT: movq %xmm0, 48(%rsi) +; SSE42-NEXT: retq +; +; AVX-LABEL: vec384_v6i16: +; AVX: # %bb.0: +; 
AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpsubw (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vpextrd $2, %xmm0, 8(%rsi) +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: vpextrd $2, %xmm0, 24(%rsi) +; AVX-NEXT: vmovq %xmm0, 16(%rsi) +; AVX-NEXT: vpextrd $2, %xmm0, 40(%rsi) +; AVX-NEXT: vmovq %xmm0, 32(%rsi) +; AVX-NEXT: vpextrd $2, %xmm0, 56(%rsi) +; AVX-NEXT: vmovq %xmm0, 48(%rsi) +; AVX-NEXT: retq + %in.subvec.neg = load <6 x i16>, ptr %in.subvec.ptr, align 64 + %in.subvec = sub <6 x i16> zeroinitializer, %in.subvec.neg + %out.subvec0.ptr = getelementptr <6 x i16>, ptr %out.vec.ptr, i64 0 + store <6 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <6 x i16>, ptr %out.vec.ptr, i64 1 + store <6 x i16> %in.subvec, ptr %out.subvec1.ptr, align 4 + %out.subvec2.ptr = getelementptr <6 x i16>, ptr %out.vec.ptr, i64 2 + store <6 x i16> %in.subvec, ptr %out.subvec2.ptr, align 8 + %out.subvec3.ptr = getelementptr <6 x i16>, ptr %out.vec.ptr, i64 3 + store <6 x i16> %in.subvec, ptr %out.subvec3.ptr, align 4 + ret void +} + +define void @vec384_v6i32(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec384_v6i32: +; SCALAR: # %bb.0: +; SCALAR-NEXT: movq (%rdi), %rax +; SCALAR-NEXT: movq 8(%rdi), %rcx +; SCALAR-NEXT: movq %rax, %rdx +; SCALAR-NEXT: shrq $32, %rdx +; SCALAR-NEXT: movq %rcx, %r8 +; SCALAR-NEXT: shrq $32, %r8 +; SCALAR-NEXT: movq 16(%rdi), %rdi +; SCALAR-NEXT: movq %rdi, %r9 +; SCALAR-NEXT: shrq $32, %r9 +; SCALAR-NEXT: negl %r9d +; SCALAR-NEXT: shlq $32, %r9 +; SCALAR-NEXT: negl %edi +; SCALAR-NEXT: orq %r9, %rdi +; SCALAR-NEXT: negl %r8d +; SCALAR-NEXT: shlq $32, %r8 +; SCALAR-NEXT: negl %ecx +; SCALAR-NEXT: orq %r8, %rcx +; SCALAR-NEXT: negl %edx +; SCALAR-NEXT: shlq $32, %rdx +; SCALAR-NEXT: negl %eax +; SCALAR-NEXT: orq %rdx, %rax +; SCALAR-NEXT: movq %rax, (%rsi) +; SCALAR-NEXT: movq %rcx, 8(%rsi) +; SCALAR-NEXT: movq %rdi, 16(%rsi) +; SCALAR-NEXT: movq %rdi, 48(%rsi) +; SCALAR-NEXT: movq %rcx, 40(%rsi) +; 
SCALAR-NEXT: movq %rax, 32(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec384_v6i32: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psubd 16(%rdi), %xmm1 +; SSE2-NEXT: psubd (%rdi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, (%rsi) +; SSE2-NEXT: movq %xmm1, 16(%rsi) +; SSE2-NEXT: movq %xmm1, 48(%rsi) +; SSE2-NEXT: movdqu %xmm0, 32(%rsi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: vec384_v6i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpsubd 16(%rdi), %xmm0, %xmm1 +; AVX1-NEXT: vpsubd (%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, (%rsi) +; AVX1-NEXT: vmovq %xmm1, 16(%rsi) +; AVX1-NEXT: vmovq %xmm1, 48(%rsi) +; AVX1-NEXT: vmovdqu %xmm0, 32(%rsi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec384_v6i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpsubd (%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vmovq %xmm1, 16(%rsi) +; AVX2-NEXT: vmovdqa %xmm0, (%rsi) +; AVX2-NEXT: vmovq %xmm1, 48(%rsi) +; AVX2-NEXT: vmovdqu %xmm0, 32(%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq + %in.subvec.neg = load <6 x i32>, ptr %in.subvec.ptr, align 64 + %in.subvec = sub <6 x i32> zeroinitializer, %in.subvec.neg + %out.subvec0.ptr = getelementptr <6 x i32>, ptr %out.vec.ptr, i64 0 + store <6 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <6 x i32>, ptr %out.vec.ptr, i64 1 + store <6 x i32> %in.subvec, ptr %out.subvec1.ptr, align 8 + ret void +} + +define void @vec384_v6f32(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec384_v6f32: +; SCALAR: # %bb.0: +; SCALAR-NEXT: movq (%rdi), %rax +; SCALAR-NEXT: movq 8(%rdi), %rcx +; SCALAR-NEXT: movq %rax, %rdx +; SCALAR-NEXT: shrq $32, %rdx +; SCALAR-NEXT: movq %rcx, %r8 +; SCALAR-NEXT: shrq $32, %r8 +; SCALAR-NEXT: movq 16(%rdi), %rdi +; SCALAR-NEXT: movq %rdi, %r9 +; SCALAR-NEXT: shrq $32, %r9 +; SCALAR-NEXT: negl %r9d +; SCALAR-NEXT: shlq $32, %r9 +; SCALAR-NEXT: 
negl %edi +; SCALAR-NEXT: orq %r9, %rdi +; SCALAR-NEXT: negl %r8d +; SCALAR-NEXT: shlq $32, %r8 +; SCALAR-NEXT: negl %ecx +; SCALAR-NEXT: orq %r8, %rcx +; SCALAR-NEXT: negl %edx +; SCALAR-NEXT: shlq $32, %rdx +; SCALAR-NEXT: negl %eax +; SCALAR-NEXT: orq %rdx, %rax +; SCALAR-NEXT: movq %rax, (%rsi) +; SCALAR-NEXT: movq %rcx, 8(%rsi) +; SCALAR-NEXT: movq %rdi, 16(%rsi) +; SCALAR-NEXT: movq %rdi, 48(%rsi) +; SCALAR-NEXT: movq %rcx, 40(%rsi) +; SCALAR-NEXT: movq %rax, 32(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec384_v6f32: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psubd 16(%rdi), %xmm1 +; SSE2-NEXT: psubd (%rdi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, (%rsi) +; SSE2-NEXT: movq %xmm1, 16(%rsi) +; SSE2-NEXT: movq %xmm1, 48(%rsi) +; SSE2-NEXT: movdqu %xmm0, 32(%rsi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: vec384_v6f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpsubd 16(%rdi), %xmm0, %xmm1 +; AVX1-NEXT: vpsubd (%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, (%rsi) +; AVX1-NEXT: vmovq %xmm1, 16(%rsi) +; AVX1-NEXT: vmovq %xmm1, 48(%rsi) +; AVX1-NEXT: vmovdqu %xmm0, 32(%rsi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec384_v6f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpsubd (%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vmovq %xmm1, 16(%rsi) +; AVX2-NEXT: vmovdqa %xmm0, (%rsi) +; AVX2-NEXT: vmovq %xmm1, 48(%rsi) +; AVX2-NEXT: vmovdqu %xmm0, 32(%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq + %in.subvec.neg = load <6 x i32>, ptr %in.subvec.ptr, align 64 + %in.subvec.int = sub <6 x i32> zeroinitializer, %in.subvec.neg + %in.subvec = bitcast <6 x i32> %in.subvec.int to <6 x float> + %out.subvec0.ptr = getelementptr <6 x float>, ptr %out.vec.ptr, i64 0 + store <6 x float> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <6 x float>, ptr %out.vec.ptr, i64 1 + store <6 x float> %in.subvec, ptr 
%out.subvec1.ptr, align 8 + ret void +} + +define void @vec384_v8i8(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec384_v8i8: +; SCALAR: # %bb.0: +; SCALAR-NEXT: pushq %rbx +; SCALAR-NEXT: xorl %edx, %edx +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: subb (%rdi), %al +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: subb 1(%rdi), %cl +; SCALAR-NEXT: xorl %r8d, %r8d +; SCALAR-NEXT: subb 2(%rdi), %r8b +; SCALAR-NEXT: xorl %r9d, %r9d +; SCALAR-NEXT: subb 3(%rdi), %r9b +; SCALAR-NEXT: xorl %r10d, %r10d +; SCALAR-NEXT: subb 4(%rdi), %r10b +; SCALAR-NEXT: xorl %r11d, %r11d +; SCALAR-NEXT: subb 5(%rdi), %r11b +; SCALAR-NEXT: xorl %ebx, %ebx +; SCALAR-NEXT: subb 6(%rdi), %bl +; SCALAR-NEXT: subb 7(%rdi), %dl +; SCALAR-NEXT: movb %dl, 7(%rsi) +; SCALAR-NEXT: movb %bl, 6(%rsi) +; SCALAR-NEXT: movb %r11b, 5(%rsi) +; SCALAR-NEXT: movb %r10b, 4(%rsi) +; SCALAR-NEXT: movb %r9b, 3(%rsi) +; SCALAR-NEXT: movb %r8b, 2(%rsi) +; SCALAR-NEXT: movb %cl, 1(%rsi) +; SCALAR-NEXT: movb %al, (%rsi) +; SCALAR-NEXT: movb %dl, 15(%rsi) +; SCALAR-NEXT: movb %bl, 14(%rsi) +; SCALAR-NEXT: movb %r11b, 13(%rsi) +; SCALAR-NEXT: movb %r10b, 12(%rsi) +; SCALAR-NEXT: movb %r9b, 11(%rsi) +; SCALAR-NEXT: movb %r8b, 10(%rsi) +; SCALAR-NEXT: movb %cl, 9(%rsi) +; SCALAR-NEXT: movb %al, 8(%rsi) +; SCALAR-NEXT: movb %dl, 23(%rsi) +; SCALAR-NEXT: movb %bl, 22(%rsi) +; SCALAR-NEXT: movb %r11b, 21(%rsi) +; SCALAR-NEXT: movb %r10b, 20(%rsi) +; SCALAR-NEXT: movb %r9b, 19(%rsi) +; SCALAR-NEXT: movb %r8b, 18(%rsi) +; SCALAR-NEXT: movb %cl, 17(%rsi) +; SCALAR-NEXT: movb %al, 16(%rsi) +; SCALAR-NEXT: movb %dl, 31(%rsi) +; SCALAR-NEXT: movb %bl, 30(%rsi) +; SCALAR-NEXT: movb %r11b, 29(%rsi) +; SCALAR-NEXT: movb %r10b, 28(%rsi) +; SCALAR-NEXT: movb %r9b, 27(%rsi) +; SCALAR-NEXT: movb %r8b, 26(%rsi) +; SCALAR-NEXT: movb %cl, 25(%rsi) +; SCALAR-NEXT: movb %al, 24(%rsi) +; SCALAR-NEXT: movb %dl, 39(%rsi) +; SCALAR-NEXT: movb %bl, 38(%rsi) +; SCALAR-NEXT: movb %r11b, 37(%rsi) +; SCALAR-NEXT: movb %r10b, 
36(%rsi) +; SCALAR-NEXT: movb %r9b, 35(%rsi) +; SCALAR-NEXT: movb %r8b, 34(%rsi) +; SCALAR-NEXT: movb %cl, 33(%rsi) +; SCALAR-NEXT: movb %al, 32(%rsi) +; SCALAR-NEXT: movb %dl, 47(%rsi) +; SCALAR-NEXT: movb %bl, 46(%rsi) +; SCALAR-NEXT: movb %r11b, 45(%rsi) +; SCALAR-NEXT: movb %r10b, 44(%rsi) +; SCALAR-NEXT: movb %r9b, 43(%rsi) +; SCALAR-NEXT: movb %r8b, 42(%rsi) +; SCALAR-NEXT: movb %cl, 41(%rsi) +; SCALAR-NEXT: movb %al, 40(%rsi) +; SCALAR-NEXT: popq %rbx +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec384_v8i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psubb %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, (%rsi) +; SSE2-NEXT: movdqa %xmm1, 16(%rsi) +; SSE2-NEXT: movdqa %xmm1, 32(%rsi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: vec384_v8i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpsubb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 +; AVX1-NEXT: vmovaps %ymm1, (%rsi) +; AVX1-NEXT: vmovdqa %xmm0, 32(%rsi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec384_v8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsubb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-NEXT: vmovdqa %xmm0, 32(%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq + %in.subvec.neg = load <8 x i8>, ptr %in.subvec.ptr, align 64 + %in.subvec = sub <8 x i8> zeroinitializer, %in.subvec.neg + %out.subvec0.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 0 + store <8 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 1 + store <8 x i8> %in.subvec, ptr %out.subvec1.ptr, align 8 + %out.subvec2.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 2 + store <8 x i8> %in.subvec, ptr 
%out.subvec2.ptr, align 16 + %out.subvec3.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 3 + store <8 x i8> %in.subvec, ptr %out.subvec3.ptr, align 8 + %out.subvec4.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 4 + store <8 x i8> %in.subvec, ptr %out.subvec4.ptr, align 32 + %out.subvec5.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 5 + store <8 x i8> %in.subvec, ptr %out.subvec5.ptr, align 8 + ret void +} + +define void @vec384_v8i16(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec384_v8i16: +; SCALAR: # %bb.0: +; SCALAR-NEXT: pushq %rbx +; SCALAR-NEXT: xorl %edx, %edx +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: subw (%rdi), %ax +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: subw 2(%rdi), %cx +; SCALAR-NEXT: xorl %r8d, %r8d +; SCALAR-NEXT: subw 4(%rdi), %r8w +; SCALAR-NEXT: xorl %r9d, %r9d +; SCALAR-NEXT: subw 6(%rdi), %r9w +; SCALAR-NEXT: xorl %r10d, %r10d +; SCALAR-NEXT: subw 8(%rdi), %r10w +; SCALAR-NEXT: xorl %r11d, %r11d +; SCALAR-NEXT: subw 10(%rdi), %r11w +; SCALAR-NEXT: xorl %ebx, %ebx +; SCALAR-NEXT: subw 12(%rdi), %bx +; SCALAR-NEXT: subw 14(%rdi), %dx +; SCALAR-NEXT: movw %dx, 14(%rsi) +; SCALAR-NEXT: movw %bx, 12(%rsi) +; SCALAR-NEXT: movw %r11w, 10(%rsi) +; SCALAR-NEXT: movw %r10w, 8(%rsi) +; SCALAR-NEXT: movw %r9w, 6(%rsi) +; SCALAR-NEXT: movw %r8w, 4(%rsi) +; SCALAR-NEXT: movw %cx, 2(%rsi) +; SCALAR-NEXT: movw %ax, (%rsi) +; SCALAR-NEXT: movw %dx, 30(%rsi) +; SCALAR-NEXT: movw %bx, 28(%rsi) +; SCALAR-NEXT: movw %r11w, 26(%rsi) +; SCALAR-NEXT: movw %r10w, 24(%rsi) +; SCALAR-NEXT: movw %r9w, 22(%rsi) +; SCALAR-NEXT: movw %r8w, 20(%rsi) +; SCALAR-NEXT: movw %cx, 18(%rsi) +; SCALAR-NEXT: movw %ax, 16(%rsi) +; SCALAR-NEXT: movw %dx, 46(%rsi) +; SCALAR-NEXT: movw %bx, 44(%rsi) +; SCALAR-NEXT: movw %r11w, 42(%rsi) +; SCALAR-NEXT: movw %r10w, 40(%rsi) +; SCALAR-NEXT: movw %r9w, 38(%rsi) +; SCALAR-NEXT: movw %r8w, 36(%rsi) +; SCALAR-NEXT: movw %cx, 34(%rsi) +; SCALAR-NEXT: movw %ax, 32(%rsi) +; SCALAR-NEXT: popq %rbx 
+; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec384_v8i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: psubw (%rdi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, (%rsi) +; SSE2-NEXT: movdqa %xmm0, 16(%rsi) +; SSE2-NEXT: movdqa %xmm0, 32(%rsi) +; SSE2-NEXT: retq +; +; AVX-LABEL: vec384_v8i16: +; AVX: # %bb.0: +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpsubw (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rsi) +; AVX-NEXT: vmovdqa %xmm0, 16(%rsi) +; AVX-NEXT: vmovdqa %xmm0, 32(%rsi) +; AVX-NEXT: retq + %in.subvec.neg = load <8 x i16>, ptr %in.subvec.ptr, align 64 + %in.subvec = sub <8 x i16> zeroinitializer, %in.subvec.neg + %out.subvec0.ptr = getelementptr <8 x i16>, ptr %out.vec.ptr, i64 0 + store <8 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <8 x i16>, ptr %out.vec.ptr, i64 1 + store <8 x i16> %in.subvec, ptr %out.subvec1.ptr, align 16 + %out.subvec2.ptr = getelementptr <8 x i16>, ptr %out.vec.ptr, i64 2 + store <8 x i16> %in.subvec, ptr %out.subvec2.ptr, align 32 + ret void +} + +define void @vec384_v12i8(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec384_v12i8: +; SCALAR: # %bb.0: +; SCALAR-NEXT: pushq %rbp +; SCALAR-NEXT: pushq %r15 +; SCALAR-NEXT: pushq %r14 +; SCALAR-NEXT: pushq %rbx +; SCALAR-NEXT: movq (%rdi), %r8 +; SCALAR-NEXT: movq 8(%rdi), %rcx +; SCALAR-NEXT: movl %ecx, %eax +; SCALAR-NEXT: shrl $8, %eax +; SCALAR-NEXT: movl %ecx, %edx +; SCALAR-NEXT: shrl $24, %edx +; SCALAR-NEXT: movl %ecx, %edi +; SCALAR-NEXT: shrl $16, %edi +; SCALAR-NEXT: movq %r8, %r9 +; SCALAR-NEXT: shrq $40, %r9 +; SCALAR-NEXT: movq %r8, %r10 +; SCALAR-NEXT: shrq $32, %r10 +; SCALAR-NEXT: movq %r8, %r11 +; SCALAR-NEXT: shrq $56, %r11 +; SCALAR-NEXT: movq %r8, %rbx +; SCALAR-NEXT: shrq $48, %rbx +; SCALAR-NEXT: movl %r8d, %ebp +; SCALAR-NEXT: shrl $8, %ebp +; SCALAR-NEXT: movl %r8d, %r14d +; SCALAR-NEXT: shrl $24, %r14d +; SCALAR-NEXT: movl %r8d, %r15d +; SCALAR-NEXT: shrl $16, %r15d +; SCALAR-NEXT: 
negb %r15b +; SCALAR-NEXT: movzbl %r15b, %r15d +; SCALAR-NEXT: negb %r14b +; SCALAR-NEXT: movzbl %r14b, %r14d +; SCALAR-NEXT: shll $8, %r14d +; SCALAR-NEXT: orl %r15d, %r14d +; SCALAR-NEXT: shll $16, %r14d +; SCALAR-NEXT: negb %r8b +; SCALAR-NEXT: movzbl %r8b, %r8d +; SCALAR-NEXT: negb %bpl +; SCALAR-NEXT: movzbl %bpl, %ebp +; SCALAR-NEXT: shll $8, %ebp +; SCALAR-NEXT: orl %r8d, %ebp +; SCALAR-NEXT: movzwl %bp, %r8d +; SCALAR-NEXT: orl %r14d, %r8d +; SCALAR-NEXT: negb %bl +; SCALAR-NEXT: movzbl %bl, %ebx +; SCALAR-NEXT: negb %r11b +; SCALAR-NEXT: movzbl %r11b, %r11d +; SCALAR-NEXT: shll $8, %r11d +; SCALAR-NEXT: orl %ebx, %r11d +; SCALAR-NEXT: shll $16, %r11d +; SCALAR-NEXT: negb %r10b +; SCALAR-NEXT: movzbl %r10b, %r10d +; SCALAR-NEXT: negb %r9b +; SCALAR-NEXT: movzbl %r9b, %r9d +; SCALAR-NEXT: shll $8, %r9d +; SCALAR-NEXT: orl %r10d, %r9d +; SCALAR-NEXT: movzwl %r9w, %r9d +; SCALAR-NEXT: orl %r11d, %r9d +; SCALAR-NEXT: negb %dil +; SCALAR-NEXT: movzbl %dil, %edi +; SCALAR-NEXT: negb %dl +; SCALAR-NEXT: movzbl %dl, %edx +; SCALAR-NEXT: shll $8, %edx +; SCALAR-NEXT: orl %edi, %edx +; SCALAR-NEXT: shll $16, %edx +; SCALAR-NEXT: negb %cl +; SCALAR-NEXT: movzbl %cl, %ecx +; SCALAR-NEXT: negb %al +; SCALAR-NEXT: movzbl %al, %eax +; SCALAR-NEXT: shll $8, %eax +; SCALAR-NEXT: orl %ecx, %eax +; SCALAR-NEXT: movzwl %ax, %eax +; SCALAR-NEXT: orl %edx, %eax +; SCALAR-NEXT: movl %eax, 8(%rsi) +; SCALAR-NEXT: shlq $32, %r9 +; SCALAR-NEXT: orq %r9, %r8 +; SCALAR-NEXT: movq %r8, (%rsi) +; SCALAR-NEXT: movl %eax, 24(%rsi) +; SCALAR-NEXT: movq %r8, 16(%rsi) +; SCALAR-NEXT: movl %eax, 40(%rsi) +; SCALAR-NEXT: movq %r8, 32(%rsi) +; SCALAR-NEXT: movl %eax, 56(%rsi) +; SCALAR-NEXT: movq %r8, 48(%rsi) +; SCALAR-NEXT: popq %rbx +; SCALAR-NEXT: popq %r14 +; SCALAR-NEXT: popq %r15 +; SCALAR-NEXT: popq %rbp +; SCALAR-NEXT: retq +; +; SSE2-ONLY-LABEL: vec384_v12i8: +; SSE2-ONLY: # %bb.0: +; SSE2-ONLY-NEXT: pxor %xmm0, %xmm0 +; SSE2-ONLY-NEXT: psubb (%rdi), %xmm0 +; SSE2-ONLY-NEXT: movq 
%xmm0, (%rsi) +; SSE2-ONLY-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE2-ONLY-NEXT: movd %xmm1, 8(%rsi) +; SSE2-ONLY-NEXT: movq %xmm0, 16(%rsi) +; SSE2-ONLY-NEXT: movd %xmm1, 24(%rsi) +; SSE2-ONLY-NEXT: movq %xmm0, 32(%rsi) +; SSE2-ONLY-NEXT: movd %xmm1, 40(%rsi) +; SSE2-ONLY-NEXT: movq %xmm0, 48(%rsi) +; SSE2-ONLY-NEXT: movd %xmm1, 56(%rsi) +; SSE2-ONLY-NEXT: retq +; +; SSE3-LABEL: vec384_v12i8: +; SSE3: # %bb.0: +; SSE3-NEXT: pxor %xmm0, %xmm0 +; SSE3-NEXT: psubb (%rdi), %xmm0 +; SSE3-NEXT: movq %xmm0, (%rsi) +; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE3-NEXT: movd %xmm1, 8(%rsi) +; SSE3-NEXT: movq %xmm0, 16(%rsi) +; SSE3-NEXT: movd %xmm1, 24(%rsi) +; SSE3-NEXT: movq %xmm0, 32(%rsi) +; SSE3-NEXT: movd %xmm1, 40(%rsi) +; SSE3-NEXT: movq %xmm0, 48(%rsi) +; SSE3-NEXT: movd %xmm1, 56(%rsi) +; SSE3-NEXT: retq +; +; SSSE3-ONLY-LABEL: vec384_v12i8: +; SSSE3-ONLY: # %bb.0: +; SSSE3-ONLY-NEXT: pxor %xmm0, %xmm0 +; SSSE3-ONLY-NEXT: psubb (%rdi), %xmm0 +; SSSE3-ONLY-NEXT: movq %xmm0, (%rsi) +; SSSE3-ONLY-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSSE3-ONLY-NEXT: movd %xmm1, 8(%rsi) +; SSSE3-ONLY-NEXT: movq %xmm0, 16(%rsi) +; SSSE3-ONLY-NEXT: movd %xmm1, 24(%rsi) +; SSSE3-ONLY-NEXT: movq %xmm0, 32(%rsi) +; SSSE3-ONLY-NEXT: movd %xmm1, 40(%rsi) +; SSSE3-ONLY-NEXT: movq %xmm0, 48(%rsi) +; SSSE3-ONLY-NEXT: movd %xmm1, 56(%rsi) +; SSSE3-ONLY-NEXT: retq +; +; SSE41-LABEL: vec384_v12i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: psubb (%rdi), %xmm0 +; SSE41-NEXT: pextrd $2, %xmm0, 8(%rsi) +; SSE41-NEXT: movq %xmm0, (%rsi) +; SSE41-NEXT: pextrd $2, %xmm0, 24(%rsi) +; SSE41-NEXT: movq %xmm0, 16(%rsi) +; SSE41-NEXT: pextrd $2, %xmm0, 40(%rsi) +; SSE41-NEXT: movq %xmm0, 32(%rsi) +; SSE41-NEXT: pextrd $2, %xmm0, 56(%rsi) +; SSE41-NEXT: movq %xmm0, 48(%rsi) +; SSE41-NEXT: retq +; +; SSE42-LABEL: vec384_v12i8: +; SSE42: # %bb.0: +; SSE42-NEXT: pxor %xmm0, %xmm0 +; SSE42-NEXT: psubb (%rdi), %xmm0 +; SSE42-NEXT: pextrd $2, %xmm0, 8(%rsi) +; 
SSE42-NEXT: movq %xmm0, (%rsi) +; SSE42-NEXT: pextrd $2, %xmm0, 24(%rsi) +; SSE42-NEXT: movq %xmm0, 16(%rsi) +; SSE42-NEXT: pextrd $2, %xmm0, 40(%rsi) +; SSE42-NEXT: movq %xmm0, 32(%rsi) +; SSE42-NEXT: pextrd $2, %xmm0, 56(%rsi) +; SSE42-NEXT: movq %xmm0, 48(%rsi) +; SSE42-NEXT: retq +; +; AVX-LABEL: vec384_v12i8: +; AVX: # %bb.0: +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpsubb (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vpextrd $2, %xmm0, 8(%rsi) +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: vpextrd $2, %xmm0, 24(%rsi) +; AVX-NEXT: vmovq %xmm0, 16(%rsi) +; AVX-NEXT: vpextrd $2, %xmm0, 40(%rsi) +; AVX-NEXT: vmovq %xmm0, 32(%rsi) +; AVX-NEXT: vpextrd $2, %xmm0, 56(%rsi) +; AVX-NEXT: vmovq %xmm0, 48(%rsi) +; AVX-NEXT: retq + %in.subvec.neg = load <12 x i8>, ptr %in.subvec.ptr, align 64 + %in.subvec = sub <12 x i8> zeroinitializer, %in.subvec.neg + %out.subvec0.ptr = getelementptr <12 x i8>, ptr %out.vec.ptr, i64 0 + store <12 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <12 x i8>, ptr %out.vec.ptr, i64 1 + store <12 x i8> %in.subvec, ptr %out.subvec1.ptr, align 4 + %out.subvec2.ptr = getelementptr <12 x i8>, ptr %out.vec.ptr, i64 2 + store <12 x i8> %in.subvec, ptr %out.subvec2.ptr, align 8 + %out.subvec3.ptr = getelementptr <12 x i8>, ptr %out.vec.ptr, i64 3 + store <12 x i8> %in.subvec, ptr %out.subvec3.ptr, align 4 + ret void +} + +define void @vec384_v12i16(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec384_v12i16: +; SCALAR: # %bb.0: +; SCALAR-NEXT: pushq %rbp +; SCALAR-NEXT: pushq %r14 +; SCALAR-NEXT: pushq %rbx +; SCALAR-NEXT: movq (%rdi), %rdx +; SCALAR-NEXT: movq 8(%rdi), %r10 +; SCALAR-NEXT: movq %rdx, %rax +; SCALAR-NEXT: shrq $32, %rax +; SCALAR-NEXT: movq %rdx, %rcx +; SCALAR-NEXT: shrq $48, %rcx +; SCALAR-NEXT: movq %r10, %r8 +; SCALAR-NEXT: shrq $32, %r8 +; SCALAR-NEXT: movq %r10, %r9 +; SCALAR-NEXT: shrq $48, %r9 +; SCALAR-NEXT: movq 16(%rdi), %rdi +; SCALAR-NEXT: movq %rdi, %r11 +; 
SCALAR-NEXT: shrq $32, %r11 +; SCALAR-NEXT: movq %rdi, %rbx +; SCALAR-NEXT: shrq $48, %rbx +; SCALAR-NEXT: movl %edi, %ebp +; SCALAR-NEXT: andl $-65536, %ebp # imm = 0xFFFF0000 +; SCALAR-NEXT: negl %edi +; SCALAR-NEXT: movzwl %di, %r14d +; SCALAR-NEXT: subl %ebp, %r14d +; SCALAR-NEXT: negl %ebx +; SCALAR-NEXT: shll $16, %ebx +; SCALAR-NEXT: negl %r11d +; SCALAR-NEXT: movzwl %r11w, %edi +; SCALAR-NEXT: orl %ebx, %edi +; SCALAR-NEXT: shlq $32, %rdi +; SCALAR-NEXT: orq %r14, %rdi +; SCALAR-NEXT: movl %r10d, %r11d +; SCALAR-NEXT: andl $-65536, %r11d # imm = 0xFFFF0000 +; SCALAR-NEXT: negl %r10d +; SCALAR-NEXT: movzwl %r10w, %r10d +; SCALAR-NEXT: subl %r11d, %r10d +; SCALAR-NEXT: negl %r9d +; SCALAR-NEXT: shll $16, %r9d +; SCALAR-NEXT: negl %r8d +; SCALAR-NEXT: movzwl %r8w, %r8d +; SCALAR-NEXT: orl %r9d, %r8d +; SCALAR-NEXT: shlq $32, %r8 +; SCALAR-NEXT: orq %r10, %r8 +; SCALAR-NEXT: movl %edx, %r9d +; SCALAR-NEXT: andl $-65536, %r9d # imm = 0xFFFF0000 +; SCALAR-NEXT: negl %edx +; SCALAR-NEXT: movzwl %dx, %edx +; SCALAR-NEXT: subl %r9d, %edx +; SCALAR-NEXT: negl %ecx +; SCALAR-NEXT: shll $16, %ecx +; SCALAR-NEXT: negl %eax +; SCALAR-NEXT: movzwl %ax, %eax +; SCALAR-NEXT: orl %ecx, %eax +; SCALAR-NEXT: shlq $32, %rax +; SCALAR-NEXT: orq %rdx, %rax +; SCALAR-NEXT: movq %rax, (%rsi) +; SCALAR-NEXT: movq %r8, 8(%rsi) +; SCALAR-NEXT: movq %rdi, 16(%rsi) +; SCALAR-NEXT: movq %rdi, 48(%rsi) +; SCALAR-NEXT: movq %r8, 40(%rsi) +; SCALAR-NEXT: movq %rax, 32(%rsi) +; SCALAR-NEXT: popq %rbx +; SCALAR-NEXT: popq %r14 +; SCALAR-NEXT: popq %rbp +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec384_v12i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psubw 16(%rdi), %xmm1 +; SSE2-NEXT: psubw (%rdi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, (%rsi) +; SSE2-NEXT: movq %xmm1, 16(%rsi) +; SSE2-NEXT: movq %xmm1, 48(%rsi) +; SSE2-NEXT: movdqu %xmm0, 32(%rsi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: vec384_v12i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm0, 
%xmm0, %xmm0 +; AVX1-NEXT: vpsubw 16(%rdi), %xmm0, %xmm1 +; AVX1-NEXT: vpsubw (%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, (%rsi) +; AVX1-NEXT: vmovq %xmm1, 16(%rsi) +; AVX1-NEXT: vmovq %xmm1, 48(%rsi) +; AVX1-NEXT: vmovdqu %xmm0, 32(%rsi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec384_v12i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpsubw (%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vmovq %xmm1, 16(%rsi) +; AVX2-NEXT: vmovdqa %xmm0, (%rsi) +; AVX2-NEXT: vmovq %xmm1, 48(%rsi) +; AVX2-NEXT: vmovdqu %xmm0, 32(%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq + %in.subvec.neg = load <12 x i16>, ptr %in.subvec.ptr, align 64 + %in.subvec = sub <12 x i16> zeroinitializer, %in.subvec.neg + %out.subvec0.ptr = getelementptr <12 x i16>, ptr %out.vec.ptr, i64 0 + store <12 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <12 x i16>, ptr %out.vec.ptr, i64 1 + store <12 x i16> %in.subvec, ptr %out.subvec1.ptr, align 8 + ret void +} + +define void @vec384_v16i8(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec384_v16i8: +; SCALAR: # %bb.0: +; SCALAR-NEXT: pushq %rbp +; SCALAR-NEXT: pushq %r15 +; SCALAR-NEXT: pushq %r14 +; SCALAR-NEXT: pushq %r13 +; SCALAR-NEXT: pushq %r12 +; SCALAR-NEXT: pushq %rbx +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: subb (%rdi), %al +; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: subb 1(%rdi), %al +; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: subb 2(%rdi), %al +; SCALAR-NEXT: movl %eax, %edx +; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: subb 3(%rdi), %al +; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: subb 4(%rdi), %al +; SCALAR-NEXT: movl %eax, %r8d +; 
SCALAR-NEXT: xorl %r10d, %r10d +; SCALAR-NEXT: subb 5(%rdi), %r10b +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: subb 6(%rdi), %al +; SCALAR-NEXT: movl %eax, %r11d +; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: subb 7(%rdi), %al +; SCALAR-NEXT: movl %eax, %ebx +; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SCALAR-NEXT: xorl %ebp, %ebp +; SCALAR-NEXT: subb 8(%rdi), %bpl +; SCALAR-NEXT: xorl %r14d, %r14d +; SCALAR-NEXT: subb 9(%rdi), %r14b +; SCALAR-NEXT: xorl %r15d, %r15d +; SCALAR-NEXT: subb 10(%rdi), %r15b +; SCALAR-NEXT: xorl %r12d, %r12d +; SCALAR-NEXT: subb 11(%rdi), %r12b +; SCALAR-NEXT: xorl %r13d, %r13d +; SCALAR-NEXT: subb 12(%rdi), %r13b +; SCALAR-NEXT: xorl %r9d, %r9d +; SCALAR-NEXT: subb 13(%rdi), %r9b +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: subb 14(%rdi), %cl +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: subb 15(%rdi), %al +; SCALAR-NEXT: movb %al, 15(%rsi) +; SCALAR-NEXT: movb %cl, 14(%rsi) +; SCALAR-NEXT: movb %r9b, 13(%rsi) +; SCALAR-NEXT: movb %r13b, 12(%rsi) +; SCALAR-NEXT: movb %r12b, 11(%rsi) +; SCALAR-NEXT: movb %r15b, 10(%rsi) +; SCALAR-NEXT: movb %r14b, 9(%rsi) +; SCALAR-NEXT: movb %bpl, 8(%rsi) +; SCALAR-NEXT: movb %bl, 7(%rsi) +; SCALAR-NEXT: movb %r11b, 6(%rsi) +; SCALAR-NEXT: movb %r10b, 5(%rsi) +; SCALAR-NEXT: movb %r8b, 4(%rsi) +; SCALAR-NEXT: movl %r8d, %r11d +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Reload +; SCALAR-NEXT: movb %r8b, 3(%rsi) +; SCALAR-NEXT: movb %dl, 2(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload +; SCALAR-NEXT: movb %dl, 1(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; SCALAR-NEXT: movb %dil, (%rsi) +; SCALAR-NEXT: movb %al, 31(%rsi) +; SCALAR-NEXT: movb %cl, 30(%rsi) +; SCALAR-NEXT: movb %r9b, 29(%rsi) +; SCALAR-NEXT: movb %r13b, 28(%rsi) +; SCALAR-NEXT: movb %r12b, 27(%rsi) +; SCALAR-NEXT: movb %r15b, 26(%rsi) +; SCALAR-NEXT: movb %r14b, 
25(%rsi) +; SCALAR-NEXT: movb %bpl, 24(%rsi) +; SCALAR-NEXT: movb %bl, 23(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 4-byte Reload +; SCALAR-NEXT: movb %bl, 22(%rsi) +; SCALAR-NEXT: movb %r10b, 21(%rsi) +; SCALAR-NEXT: movl %r10d, %ebx +; SCALAR-NEXT: movb %r11b, 20(%rsi) +; SCALAR-NEXT: movl %r11d, %r10d +; SCALAR-NEXT: movb %r8b, 19(%rsi) +; SCALAR-NEXT: movl %r8d, %r11d +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Reload +; SCALAR-NEXT: movb %r8b, 18(%rsi) +; SCALAR-NEXT: movb %dl, 17(%rsi) +; SCALAR-NEXT: movb %dil, 16(%rsi) +; SCALAR-NEXT: movb %al, 47(%rsi) +; SCALAR-NEXT: movb %cl, 46(%rsi) +; SCALAR-NEXT: movb %r9b, 45(%rsi) +; SCALAR-NEXT: movb %r13b, 44(%rsi) +; SCALAR-NEXT: movb %r12b, 43(%rsi) +; SCALAR-NEXT: movb %r15b, 42(%rsi) +; SCALAR-NEXT: movb %r14b, 41(%rsi) +; SCALAR-NEXT: movb %bpl, 40(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; SCALAR-NEXT: movb %al, 39(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; SCALAR-NEXT: movb %al, 38(%rsi) +; SCALAR-NEXT: movb %bl, 37(%rsi) +; SCALAR-NEXT: movb %r10b, 36(%rsi) +; SCALAR-NEXT: movb %r11b, 35(%rsi) +; SCALAR-NEXT: movb %r8b, 34(%rsi) +; SCALAR-NEXT: movb %dl, 33(%rsi) +; SCALAR-NEXT: movb %dil, 32(%rsi) +; SCALAR-NEXT: popq %rbx +; SCALAR-NEXT: popq %r12 +; SCALAR-NEXT: popq %r13 +; SCALAR-NEXT: popq %r14 +; SCALAR-NEXT: popq %r15 +; SCALAR-NEXT: popq %rbp +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec384_v16i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: psubb (%rdi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, (%rsi) +; SSE2-NEXT: movdqa %xmm0, 16(%rsi) +; SSE2-NEXT: movdqa %xmm0, 32(%rsi) +; SSE2-NEXT: retq +; +; AVX-LABEL: vec384_v16i8: +; AVX: # %bb.0: +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpsubb (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rsi) +; AVX-NEXT: vmovdqa %xmm0, 16(%rsi) +; AVX-NEXT: vmovdqa %xmm0, 32(%rsi) +; AVX-NEXT: retq + %in.subvec.neg = load <16 x i8>, ptr 
%in.subvec.ptr, align 64 + %in.subvec = sub <16 x i8> zeroinitializer, %in.subvec.neg + %out.subvec0.ptr = getelementptr <16 x i8>, ptr %out.vec.ptr, i64 0 + store <16 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <16 x i8>, ptr %out.vec.ptr, i64 1 + store <16 x i8> %in.subvec, ptr %out.subvec1.ptr, align 16 + %out.subvec2.ptr = getelementptr <16 x i8>, ptr %out.vec.ptr, i64 2 + store <16 x i8> %in.subvec, ptr %out.subvec2.ptr, align 32 + ret void +} + +define void @vec384_v24i8(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec384_v24i8: +; SCALAR: # %bb.0: +; SCALAR-NEXT: movq (%rdi), %rax +; SCALAR-NEXT: movq 8(%rdi), %rcx +; SCALAR-NEXT: movq 16(%rdi), %rdx +; SCALAR-NEXT: movq %rdx, %rdi +; SCALAR-NEXT: shrq $40, %rdi +; SCALAR-NEXT: movq %rdx, %r8 +; SCALAR-NEXT: shrq $56, %r8 +; SCALAR-NEXT: movq %rdx, %r9 +; SCALAR-NEXT: shrq $48, %r9 +; SCALAR-NEXT: negb %r9b +; SCALAR-NEXT: movzbl %r9b, %r9d +; SCALAR-NEXT: negb %r8b +; SCALAR-NEXT: movzbl %r8b, %r8d +; SCALAR-NEXT: shll $8, %r8d +; SCALAR-NEXT: orl %r9d, %r8d +; SCALAR-NEXT: movq %rdx, %r9 +; SCALAR-NEXT: shrq $32, %r9 +; SCALAR-NEXT: negb %r9b +; SCALAR-NEXT: movzbl %r9b, %r9d +; SCALAR-NEXT: negb %dil +; SCALAR-NEXT: movzbl %dil, %edi +; SCALAR-NEXT: shll $8, %edi +; SCALAR-NEXT: orl %r9d, %edi +; SCALAR-NEXT: movl %edx, %r9d +; SCALAR-NEXT: shrl $24, %r9d +; SCALAR-NEXT: shll $16, %r8d +; SCALAR-NEXT: movzwl %di, %edi +; SCALAR-NEXT: orl %r8d, %edi +; SCALAR-NEXT: movl %edx, %r8d +; SCALAR-NEXT: shrl $16, %r8d +; SCALAR-NEXT: negb %r8b +; SCALAR-NEXT: movzbl %r8b, %r8d +; SCALAR-NEXT: negb %r9b +; SCALAR-NEXT: movzbl %r9b, %r9d +; SCALAR-NEXT: shll $8, %r9d +; SCALAR-NEXT: orl %r8d, %r9d +; SCALAR-NEXT: movl %edx, %r8d +; SCALAR-NEXT: shrl $8, %r8d +; SCALAR-NEXT: negb %dl +; SCALAR-NEXT: movzbl %dl, %edx +; SCALAR-NEXT: negb %r8b +; SCALAR-NEXT: movzbl %r8b, %r8d +; SCALAR-NEXT: shll $8, %r8d +; SCALAR-NEXT: orl %edx, %r8d +; SCALAR-NEXT: movq 
%rcx, %r10 +; SCALAR-NEXT: shrq $40, %r10 +; SCALAR-NEXT: shll $16, %r9d +; SCALAR-NEXT: movzwl %r8w, %edx +; SCALAR-NEXT: orl %r9d, %edx +; SCALAR-NEXT: movq %rcx, %r8 +; SCALAR-NEXT: shrq $56, %r8 +; SCALAR-NEXT: shlq $32, %rdi +; SCALAR-NEXT: orq %rdi, %rdx +; SCALAR-NEXT: movq %rcx, %rdi +; SCALAR-NEXT: shrq $48, %rdi +; SCALAR-NEXT: negb %dil +; SCALAR-NEXT: movzbl %dil, %edi +; SCALAR-NEXT: negb %r8b +; SCALAR-NEXT: movzbl %r8b, %r8d +; SCALAR-NEXT: shll $8, %r8d +; SCALAR-NEXT: orl %edi, %r8d +; SCALAR-NEXT: movq %rcx, %rdi +; SCALAR-NEXT: shrq $32, %rdi +; SCALAR-NEXT: negb %dil +; SCALAR-NEXT: movzbl %dil, %edi +; SCALAR-NEXT: negb %r10b +; SCALAR-NEXT: movzbl %r10b, %r9d +; SCALAR-NEXT: shll $8, %r9d +; SCALAR-NEXT: orl %edi, %r9d +; SCALAR-NEXT: movl %ecx, %r10d +; SCALAR-NEXT: shrl $24, %r10d +; SCALAR-NEXT: shll $16, %r8d +; SCALAR-NEXT: movzwl %r9w, %edi +; SCALAR-NEXT: orl %r8d, %edi +; SCALAR-NEXT: movl %ecx, %r8d +; SCALAR-NEXT: shrl $16, %r8d +; SCALAR-NEXT: negb %r8b +; SCALAR-NEXT: movzbl %r8b, %r8d +; SCALAR-NEXT: negb %r10b +; SCALAR-NEXT: movzbl %r10b, %r9d +; SCALAR-NEXT: shll $8, %r9d +; SCALAR-NEXT: orl %r8d, %r9d +; SCALAR-NEXT: movl %ecx, %r8d +; SCALAR-NEXT: shrl $8, %r8d +; SCALAR-NEXT: negb %cl +; SCALAR-NEXT: movzbl %cl, %ecx +; SCALAR-NEXT: negb %r8b +; SCALAR-NEXT: movzbl %r8b, %r8d +; SCALAR-NEXT: shll $8, %r8d +; SCALAR-NEXT: orl %ecx, %r8d +; SCALAR-NEXT: movq %rax, %r10 +; SCALAR-NEXT: shrq $40, %r10 +; SCALAR-NEXT: shll $16, %r9d +; SCALAR-NEXT: movzwl %r8w, %ecx +; SCALAR-NEXT: orl %r9d, %ecx +; SCALAR-NEXT: movq %rax, %r8 +; SCALAR-NEXT: shrq $56, %r8 +; SCALAR-NEXT: shlq $32, %rdi +; SCALAR-NEXT: orq %rdi, %rcx +; SCALAR-NEXT: movq %rax, %rdi +; SCALAR-NEXT: shrq $48, %rdi +; SCALAR-NEXT: negb %dil +; SCALAR-NEXT: movzbl %dil, %edi +; SCALAR-NEXT: negb %r8b +; SCALAR-NEXT: movzbl %r8b, %r8d +; SCALAR-NEXT: shll $8, %r8d +; SCALAR-NEXT: orl %edi, %r8d +; SCALAR-NEXT: movq %rax, %rdi +; SCALAR-NEXT: shrq $32, %rdi +; 
SCALAR-NEXT: negb %dil +; SCALAR-NEXT: movzbl %dil, %edi +; SCALAR-NEXT: negb %r10b +; SCALAR-NEXT: movzbl %r10b, %r9d +; SCALAR-NEXT: shll $8, %r9d +; SCALAR-NEXT: orl %edi, %r9d +; SCALAR-NEXT: movl %eax, %r10d +; SCALAR-NEXT: shrl $24, %r10d +; SCALAR-NEXT: shll $16, %r8d +; SCALAR-NEXT: movzwl %r9w, %edi +; SCALAR-NEXT: orl %r8d, %edi +; SCALAR-NEXT: movl %eax, %r8d +; SCALAR-NEXT: shrl $16, %r8d +; SCALAR-NEXT: negb %r8b +; SCALAR-NEXT: movzbl %r8b, %r8d +; SCALAR-NEXT: negb %r10b +; SCALAR-NEXT: movzbl %r10b, %r9d +; SCALAR-NEXT: shll $8, %r9d +; SCALAR-NEXT: orl %r8d, %r9d +; SCALAR-NEXT: movl %eax, %r8d +; SCALAR-NEXT: shrl $8, %r8d +; SCALAR-NEXT: negb %al +; SCALAR-NEXT: movzbl %al, %eax +; SCALAR-NEXT: negb %r8b +; SCALAR-NEXT: movzbl %r8b, %r8d +; SCALAR-NEXT: shll $8, %r8d +; SCALAR-NEXT: orl %eax, %r8d +; SCALAR-NEXT: shll $16, %r9d +; SCALAR-NEXT: movzwl %r8w, %eax +; SCALAR-NEXT: orl %r9d, %eax +; SCALAR-NEXT: shlq $32, %rdi +; SCALAR-NEXT: orq %rdi, %rax +; SCALAR-NEXT: movq %rax, (%rsi) +; SCALAR-NEXT: movq %rcx, 8(%rsi) +; SCALAR-NEXT: movq %rdx, 16(%rsi) +; SCALAR-NEXT: movq %rdx, 48(%rsi) +; SCALAR-NEXT: movq %rcx, 40(%rsi) +; SCALAR-NEXT: movq %rax, 32(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec384_v24i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psubb 16(%rdi), %xmm1 +; SSE2-NEXT: psubb (%rdi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, (%rsi) +; SSE2-NEXT: movq %xmm1, 16(%rsi) +; SSE2-NEXT: movq %xmm1, 48(%rsi) +; SSE2-NEXT: movdqu %xmm0, 32(%rsi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: vec384_v24i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpsubb 16(%rdi), %xmm0, %xmm1 +; AVX1-NEXT: vpsubb (%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, (%rsi) +; AVX1-NEXT: vmovq %xmm1, 16(%rsi) +; AVX1-NEXT: vmovq %xmm1, 48(%rsi) +; AVX1-NEXT: vmovdqu %xmm0, 32(%rsi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec384_v24i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; 
AVX2-NEXT: vpsubb (%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vmovq %xmm1, 16(%rsi) +; AVX2-NEXT: vmovdqa %xmm0, (%rsi) +; AVX2-NEXT: vmovq %xmm1, 48(%rsi) +; AVX2-NEXT: vmovdqu %xmm0, 32(%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq + %in.subvec.neg = load <24 x i8>, ptr %in.subvec.ptr, align 64 + %in.subvec = sub <24 x i8> zeroinitializer, %in.subvec.neg + %out.subvec0.ptr = getelementptr <24 x i8>, ptr %out.vec.ptr, i64 0 + store <24 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <24 x i8>, ptr %out.vec.ptr, i64 1 + store <24 x i8> %in.subvec, ptr %out.subvec1.ptr, align 8 + ret void +} + +define void @vec512_v2i8(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec512_v2i8: +; SCALAR: # %bb.0: +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: subb (%rdi), %cl +; SCALAR-NEXT: subb 1(%rdi), %al +; SCALAR-NEXT: movb %al, 1(%rsi) +; SCALAR-NEXT: movb %cl, (%rsi) +; SCALAR-NEXT: movb %al, 3(%rsi) +; SCALAR-NEXT: movb %cl, 2(%rsi) +; SCALAR-NEXT: movb %al, 5(%rsi) +; SCALAR-NEXT: movb %cl, 4(%rsi) +; SCALAR-NEXT: movb %al, 7(%rsi) +; SCALAR-NEXT: movb %cl, 6(%rsi) +; SCALAR-NEXT: movb %al, 9(%rsi) +; SCALAR-NEXT: movb %cl, 8(%rsi) +; SCALAR-NEXT: movb %al, 11(%rsi) +; SCALAR-NEXT: movb %cl, 10(%rsi) +; SCALAR-NEXT: movb %al, 13(%rsi) +; SCALAR-NEXT: movb %cl, 12(%rsi) +; SCALAR-NEXT: movb %al, 15(%rsi) +; SCALAR-NEXT: movb %cl, 14(%rsi) +; SCALAR-NEXT: movb %al, 17(%rsi) +; SCALAR-NEXT: movb %cl, 16(%rsi) +; SCALAR-NEXT: movb %al, 19(%rsi) +; SCALAR-NEXT: movb %cl, 18(%rsi) +; SCALAR-NEXT: movb %al, 21(%rsi) +; SCALAR-NEXT: movb %cl, 20(%rsi) +; SCALAR-NEXT: movb %al, 23(%rsi) +; SCALAR-NEXT: movb %cl, 22(%rsi) +; SCALAR-NEXT: movb %al, 25(%rsi) +; SCALAR-NEXT: movb %cl, 24(%rsi) +; SCALAR-NEXT: movb %al, 27(%rsi) +; SCALAR-NEXT: movb %cl, 26(%rsi) +; SCALAR-NEXT: movb %al, 29(%rsi) +; SCALAR-NEXT: movb %cl, 28(%rsi) +; SCALAR-NEXT: movb %al, 
31(%rsi) +; SCALAR-NEXT: movb %cl, 30(%rsi) +; SCALAR-NEXT: movb %al, 33(%rsi) +; SCALAR-NEXT: movb %cl, 32(%rsi) +; SCALAR-NEXT: movb %al, 35(%rsi) +; SCALAR-NEXT: movb %cl, 34(%rsi) +; SCALAR-NEXT: movb %al, 37(%rsi) +; SCALAR-NEXT: movb %cl, 36(%rsi) +; SCALAR-NEXT: movb %al, 39(%rsi) +; SCALAR-NEXT: movb %cl, 38(%rsi) +; SCALAR-NEXT: movb %al, 41(%rsi) +; SCALAR-NEXT: movb %cl, 40(%rsi) +; SCALAR-NEXT: movb %al, 43(%rsi) +; SCALAR-NEXT: movb %cl, 42(%rsi) +; SCALAR-NEXT: movb %al, 45(%rsi) +; SCALAR-NEXT: movb %cl, 44(%rsi) +; SCALAR-NEXT: movb %al, 47(%rsi) +; SCALAR-NEXT: movb %cl, 46(%rsi) +; SCALAR-NEXT: movb %al, 49(%rsi) +; SCALAR-NEXT: movb %cl, 48(%rsi) +; SCALAR-NEXT: movb %al, 51(%rsi) +; SCALAR-NEXT: movb %cl, 50(%rsi) +; SCALAR-NEXT: movb %al, 53(%rsi) +; SCALAR-NEXT: movb %cl, 52(%rsi) +; SCALAR-NEXT: movb %al, 55(%rsi) +; SCALAR-NEXT: movb %cl, 54(%rsi) +; SCALAR-NEXT: movb %al, 57(%rsi) +; SCALAR-NEXT: movb %cl, 56(%rsi) +; SCALAR-NEXT: movb %al, 59(%rsi) +; SCALAR-NEXT: movb %cl, 58(%rsi) +; SCALAR-NEXT: movb %al, 61(%rsi) +; SCALAR-NEXT: movb %cl, 60(%rsi) +; SCALAR-NEXT: movb %al, 63(%rsi) +; SCALAR-NEXT: movb %cl, 62(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec512_v2i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psubb %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, (%rsi) +; SSE2-NEXT: movdqa %xmm1, 16(%rsi) +; SSE2-NEXT: movdqa %xmm1, 32(%rsi) +; SSE2-NEXT: movdqa %xmm1, 48(%rsi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: vec512_v2i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpsubb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vmovaps %ymm0, (%rsi) +; AVX1-NEXT: vmovaps %ymm0, 32(%rsi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; 
AVX2-ONLY-LABEL: vec512_v2i8: +; AVX2-ONLY: # %bb.0: +; AVX2-ONLY-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vpsubb (%rdi), %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rsi) +; AVX2-ONLY-NEXT: vzeroupper +; AVX2-ONLY-NEXT: retq +; +; AVX512F-LABEL: vec512_v2i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpsubb (%rdi), %xmm0, %xmm0 +; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, (%rsi) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec512_v2i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512BW-NEXT: vpsubb (%rdi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpbroadcastw %xmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rsi) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %in.subvec.neg = load <2 x i8>, ptr %in.subvec.ptr, align 64 + %in.subvec = sub <2 x i8> zeroinitializer, %in.subvec.neg + %out.subvec0.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 0 + store <2 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 1 + store <2 x i8> %in.subvec, ptr %out.subvec1.ptr, align 2 + %out.subvec2.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 2 + store <2 x i8> %in.subvec, ptr %out.subvec2.ptr, align 4 + %out.subvec3.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 3 + store <2 x i8> %in.subvec, ptr %out.subvec3.ptr, align 2 + %out.subvec4.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 4 + store <2 x i8> %in.subvec, ptr %out.subvec4.ptr, align 8 + %out.subvec5.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 5 + store <2 x i8> %in.subvec, ptr %out.subvec5.ptr, align 2 + %out.subvec6.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 6 + store <2 x i8> %in.subvec, ptr %out.subvec6.ptr, align 4 + %out.subvec7.ptr = getelementptr 
<2 x i8>, ptr %out.vec.ptr, i64 7 + store <2 x i8> %in.subvec, ptr %out.subvec7.ptr, align 2 + %out.subvec8.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 8 + store <2 x i8> %in.subvec, ptr %out.subvec8.ptr, align 16 + %out.subvec9.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 9 + store <2 x i8> %in.subvec, ptr %out.subvec9.ptr, align 2 + %out.subvec10.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 10 + store <2 x i8> %in.subvec, ptr %out.subvec10.ptr, align 4 + %out.subvec11.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 11 + store <2 x i8> %in.subvec, ptr %out.subvec11.ptr, align 2 + %out.subvec12.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 12 + store <2 x i8> %in.subvec, ptr %out.subvec12.ptr, align 8 + %out.subvec13.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 13 + store <2 x i8> %in.subvec, ptr %out.subvec13.ptr, align 2 + %out.subvec14.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 14 + store <2 x i8> %in.subvec, ptr %out.subvec14.ptr, align 4 + %out.subvec15.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 15 + store <2 x i8> %in.subvec, ptr %out.subvec15.ptr, align 2 + %out.subvec16.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 16 + store <2 x i8> %in.subvec, ptr %out.subvec16.ptr, align 32 + %out.subvec17.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 17 + store <2 x i8> %in.subvec, ptr %out.subvec17.ptr, align 2 + %out.subvec18.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 18 + store <2 x i8> %in.subvec, ptr %out.subvec18.ptr, align 4 + %out.subvec19.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 19 + store <2 x i8> %in.subvec, ptr %out.subvec19.ptr, align 2 + %out.subvec20.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 20 + store <2 x i8> %in.subvec, ptr %out.subvec20.ptr, align 8 + %out.subvec21.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 21 + store <2 x i8> %in.subvec, ptr %out.subvec21.ptr, align 2 + %out.subvec22.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 22 + 
store <2 x i8> %in.subvec, ptr %out.subvec22.ptr, align 4 + %out.subvec23.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 23 + store <2 x i8> %in.subvec, ptr %out.subvec23.ptr, align 2 + %out.subvec24.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 24 + store <2 x i8> %in.subvec, ptr %out.subvec24.ptr, align 16 + %out.subvec25.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 25 + store <2 x i8> %in.subvec, ptr %out.subvec25.ptr, align 2 + %out.subvec26.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 26 + store <2 x i8> %in.subvec, ptr %out.subvec26.ptr, align 4 + %out.subvec27.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 27 + store <2 x i8> %in.subvec, ptr %out.subvec27.ptr, align 2 + %out.subvec28.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 28 + store <2 x i8> %in.subvec, ptr %out.subvec28.ptr, align 8 + %out.subvec29.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 29 + store <2 x i8> %in.subvec, ptr %out.subvec29.ptr, align 2 + %out.subvec30.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 30 + store <2 x i8> %in.subvec, ptr %out.subvec30.ptr, align 4 + %out.subvec31.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 31 + store <2 x i8> %in.subvec, ptr %out.subvec31.ptr, align 2 + ret void +} + +define void @vec512_v2i16(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec512_v2i16: +; SCALAR: # %bb.0: +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: subw (%rdi), %cx +; SCALAR-NEXT: subw 2(%rdi), %ax +; SCALAR-NEXT: movw %ax, 2(%rsi) +; SCALAR-NEXT: movw %cx, (%rsi) +; SCALAR-NEXT: movw %ax, 6(%rsi) +; SCALAR-NEXT: movw %cx, 4(%rsi) +; SCALAR-NEXT: movw %ax, 10(%rsi) +; SCALAR-NEXT: movw %cx, 8(%rsi) +; SCALAR-NEXT: movw %ax, 14(%rsi) +; SCALAR-NEXT: movw %cx, 12(%rsi) +; SCALAR-NEXT: movw %ax, 18(%rsi) +; SCALAR-NEXT: movw %cx, 16(%rsi) +; SCALAR-NEXT: movw %ax, 22(%rsi) +; SCALAR-NEXT: movw %cx, 20(%rsi) +; SCALAR-NEXT: movw %ax, 26(%rsi) +; SCALAR-NEXT: movw %cx, 24(%rsi) +; 
SCALAR-NEXT: movw %ax, 30(%rsi) +; SCALAR-NEXT: movw %cx, 28(%rsi) +; SCALAR-NEXT: movw %ax, 34(%rsi) +; SCALAR-NEXT: movw %cx, 32(%rsi) +; SCALAR-NEXT: movw %ax, 38(%rsi) +; SCALAR-NEXT: movw %cx, 36(%rsi) +; SCALAR-NEXT: movw %ax, 42(%rsi) +; SCALAR-NEXT: movw %cx, 40(%rsi) +; SCALAR-NEXT: movw %ax, 46(%rsi) +; SCALAR-NEXT: movw %cx, 44(%rsi) +; SCALAR-NEXT: movw %ax, 50(%rsi) +; SCALAR-NEXT: movw %cx, 48(%rsi) +; SCALAR-NEXT: movw %ax, 54(%rsi) +; SCALAR-NEXT: movw %cx, 52(%rsi) +; SCALAR-NEXT: movw %ax, 58(%rsi) +; SCALAR-NEXT: movw %cx, 56(%rsi) +; SCALAR-NEXT: movw %ax, 62(%rsi) +; SCALAR-NEXT: movw %cx, 60(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec512_v2i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[0,0,0,0] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psubw %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, (%rsi) +; SSE2-NEXT: movdqa %xmm1, 16(%rsi) +; SSE2-NEXT: movdqa %xmm1, 32(%rsi) +; SSE2-NEXT: movdqa %xmm1, 48(%rsi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: vec512_v2i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,0,0,0] +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vmovaps %ymm0, (%rsi) +; AVX1-NEXT: vmovaps %ymm0, 32(%rsi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-ONLY-LABEL: vec512_v2i16: +; AVX2-ONLY: # %bb.0: +; AVX2-ONLY-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vpsubw (%rdi), %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vpbroadcastd %xmm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rsi) +; AVX2-ONLY-NEXT: vzeroupper +; AVX2-ONLY-NEXT: retq +; +; AVX512-LABEL: vec512_v2i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpsubw (%rdi), %xmm0, %xmm0 +; AVX512-NEXT: vpbroadcastd %xmm0, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm0, (%rsi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq +; IR under test - loads one <2 x i16> subvector, negates it (sub from zeroinitializer), and stores the result to 16 consecutive <2 x i16> slots of %out.vec.ptr, 64 bytes in total. + %in.subvec.neg = load <2 x i16>, ptr %in.subvec.ptr, align 64 + 
%in.subvec = sub <2 x i16> zeroinitializer, %in.subvec.neg + %out.subvec0.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 0 + store <2 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 1 + store <2 x i16> %in.subvec, ptr %out.subvec1.ptr, align 4 + %out.subvec2.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 2 + store <2 x i16> %in.subvec, ptr %out.subvec2.ptr, align 8 + %out.subvec3.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 3 + store <2 x i16> %in.subvec, ptr %out.subvec3.ptr, align 4 + %out.subvec4.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 4 + store <2 x i16> %in.subvec, ptr %out.subvec4.ptr, align 16 + %out.subvec5.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 5 + store <2 x i16> %in.subvec, ptr %out.subvec5.ptr, align 4 + %out.subvec6.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 6 + store <2 x i16> %in.subvec, ptr %out.subvec6.ptr, align 8 + %out.subvec7.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 7 + store <2 x i16> %in.subvec, ptr %out.subvec7.ptr, align 4 + %out.subvec8.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 8 + store <2 x i16> %in.subvec, ptr %out.subvec8.ptr, align 32 + %out.subvec9.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 9 + store <2 x i16> %in.subvec, ptr %out.subvec9.ptr, align 4 + %out.subvec10.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 10 + store <2 x i16> %in.subvec, ptr %out.subvec10.ptr, align 8 + %out.subvec11.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 11 + store <2 x i16> %in.subvec, ptr %out.subvec11.ptr, align 4 + %out.subvec12.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 12 + store <2 x i16> %in.subvec, ptr %out.subvec12.ptr, align 16 + %out.subvec13.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 13 + store <2 x i16> %in.subvec, ptr %out.subvec13.ptr, align 4 + %out.subvec14.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 14 + store <2 x i16> %in.subvec, 
ptr %out.subvec14.ptr, align 8 + %out.subvec15.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 15 + store <2 x i16> %in.subvec, ptr %out.subvec15.ptr, align 4 + ret void +} + +define void @vec512_v2i32(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec512_v2i32: +; SCALAR: # %bb.0: +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: subl (%rdi), %ecx +; SCALAR-NEXT: subl 4(%rdi), %eax +; SCALAR-NEXT: movl %eax, 4(%rsi) +; SCALAR-NEXT: movl %ecx, (%rsi) +; SCALAR-NEXT: movl %eax, 12(%rsi) +; SCALAR-NEXT: movl %ecx, 8(%rsi) +; SCALAR-NEXT: movl %eax, 20(%rsi) +; SCALAR-NEXT: movl %ecx, 16(%rsi) +; SCALAR-NEXT: movl %eax, 28(%rsi) +; SCALAR-NEXT: movl %ecx, 24(%rsi) +; SCALAR-NEXT: movl %eax, 36(%rsi) +; SCALAR-NEXT: movl %ecx, 32(%rsi) +; SCALAR-NEXT: movl %eax, 44(%rsi) +; SCALAR-NEXT: movl %ecx, 40(%rsi) +; SCALAR-NEXT: movl %eax, 52(%rsi) +; SCALAR-NEXT: movl %ecx, 48(%rsi) +; SCALAR-NEXT: movl %eax, 60(%rsi) +; SCALAR-NEXT: movl %ecx, 56(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec512_v2i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psubd %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, (%rsi) +; SSE2-NEXT: movdqa %xmm1, 16(%rsi) +; SSE2-NEXT: movdqa %xmm1, 32(%rsi) +; SSE2-NEXT: movdqa %xmm1, 48(%rsi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: vec512_v2i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vmovaps %ymm0, (%rsi) +; AVX1-NEXT: vmovaps %ymm0, 32(%rsi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-ONLY-LABEL: vec512_v2i32: +; AVX2-ONLY: # %bb.0: +; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-ONLY-NEXT: vpsubd %xmm0, %xmm1, %xmm0 +; AVX2-ONLY-NEXT: vpbroadcastq 
%xmm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rsi) +; AVX2-ONLY-NEXT: vzeroupper +; AVX2-ONLY-NEXT: retq +; +; AVX512-LABEL: vec512_v2i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpsubd %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm0, (%rsi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq +; IR under test - loads one <2 x i32> subvector, negates it (sub from zeroinitializer), and stores the result to 8 consecutive <2 x i32> slots of %out.vec.ptr, 64 bytes in total. + %in.subvec.neg = load <2 x i32>, ptr %in.subvec.ptr, align 64 + %in.subvec = sub <2 x i32> zeroinitializer, %in.subvec.neg + %out.subvec0.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 0 + store <2 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 1 + store <2 x i32> %in.subvec, ptr %out.subvec1.ptr, align 8 + %out.subvec2.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 2 + store <2 x i32> %in.subvec, ptr %out.subvec2.ptr, align 16 + %out.subvec3.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 3 + store <2 x i32> %in.subvec, ptr %out.subvec3.ptr, align 8 + %out.subvec4.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 4 + store <2 x i32> %in.subvec, ptr %out.subvec4.ptr, align 32 + %out.subvec5.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 5 + store <2 x i32> %in.subvec, ptr %out.subvec5.ptr, align 8 + %out.subvec6.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 6 + store <2 x i32> %in.subvec, ptr %out.subvec6.ptr, align 16 + %out.subvec7.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 7 + store <2 x i32> %in.subvec, ptr %out.subvec7.ptr, align 8 + ret void +} + +define void @vec512_v2f32(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec512_v2f32: +; SCALAR: # %bb.0: +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: subl (%rdi), %ecx +; SCALAR-NEXT: subl 4(%rdi), %eax +; SCALAR-NEXT: movl %eax, 4(%rsi) +; SCALAR-NEXT: movl %ecx, (%rsi) +; SCALAR-NEXT: 
movl %eax, 12(%rsi) +; SCALAR-NEXT: movl %ecx, 8(%rsi) +; SCALAR-NEXT: movl %eax, 20(%rsi) +; SCALAR-NEXT: movl %ecx, 16(%rsi) +; SCALAR-NEXT: movl %eax, 28(%rsi) +; SCALAR-NEXT: movl %ecx, 24(%rsi) +; SCALAR-NEXT: movl %eax, 36(%rsi) +; SCALAR-NEXT: movl %ecx, 32(%rsi) +; SCALAR-NEXT: movl %eax, 44(%rsi) +; SCALAR-NEXT: movl %ecx, 40(%rsi) +; SCALAR-NEXT: movl %eax, 52(%rsi) +; SCALAR-NEXT: movl %ecx, 48(%rsi) +; SCALAR-NEXT: movl %eax, 60(%rsi) +; SCALAR-NEXT: movl %ecx, 56(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec512_v2f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psubd %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, (%rsi) +; SSE2-NEXT: movdqa %xmm1, 16(%rsi) +; SSE2-NEXT: movdqa %xmm1, 32(%rsi) +; SSE2-NEXT: movdqa %xmm1, 48(%rsi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: vec512_v2f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vmovaps %ymm0, (%rsi) +; AVX1-NEXT: vmovaps %ymm0, 32(%rsi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-ONLY-LABEL: vec512_v2f32: +; AVX2-ONLY: # %bb.0: +; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-ONLY-NEXT: vpsubd %xmm0, %xmm1, %xmm0 +; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rsi) +; AVX2-ONLY-NEXT: vzeroupper +; AVX2-ONLY-NEXT: retq +; +; AVX512-LABEL: vec512_v2f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpsubd %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm0, (%rsi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq +; IR under test - loads one <2 x i32> subvector, negates it as integers (sub from zeroinitializer), bitcasts the result to <2 x float>, and stores it to 8 consecutive <2 x float> slots of %out.vec.ptr, 64 bytes in total. + %in.subvec.neg = load <2 x i32>, ptr %in.subvec.ptr, align 
64 + %in.subvec.int = sub <2 x i32> zeroinitializer, %in.subvec.neg + %in.subvec = bitcast <2 x i32> %in.subvec.int to <2 x float> + %out.subvec0.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 0 + store <2 x float> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 1 + store <2 x float> %in.subvec, ptr %out.subvec1.ptr, align 8 + %out.subvec2.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 2 + store <2 x float> %in.subvec, ptr %out.subvec2.ptr, align 16 + %out.subvec3.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 3 + store <2 x float> %in.subvec, ptr %out.subvec3.ptr, align 8 + %out.subvec4.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 4 + store <2 x float> %in.subvec, ptr %out.subvec4.ptr, align 32 + %out.subvec5.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 5 + store <2 x float> %in.subvec, ptr %out.subvec5.ptr, align 8 + %out.subvec6.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 6 + store <2 x float> %in.subvec, ptr %out.subvec6.ptr, align 16 + %out.subvec7.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 7 + store <2 x float> %in.subvec, ptr %out.subvec7.ptr, align 8 + ret void +} + +define void @vec512_v2i64(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec512_v2i64: +; SCALAR: # %bb.0: +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: subq (%rdi), %rcx +; SCALAR-NEXT: subq 8(%rdi), %rax +; SCALAR-NEXT: movq %rax, 8(%rsi) +; SCALAR-NEXT: movq %rcx, (%rsi) +; SCALAR-NEXT: movq %rax, 24(%rsi) +; SCALAR-NEXT: movq %rcx, 16(%rsi) +; SCALAR-NEXT: movq %rax, 40(%rsi) +; SCALAR-NEXT: movq %rcx, 32(%rsi) +; SCALAR-NEXT: movq %rax, 56(%rsi) +; SCALAR-NEXT: movq %rcx, 48(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec512_v2i64: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: psubq (%rdi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, (%rsi) +; SSE2-NEXT: movdqa %xmm0, 16(%rsi) +; SSE2-NEXT: movdqa 
%xmm0, 32(%rsi) +; SSE2-NEXT: movdqa %xmm0, 48(%rsi) +; SSE2-NEXT: retq +; +; AVX-LABEL: vec512_v2i64: +; AVX: # %bb.0: +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpsubq (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rsi) +; AVX-NEXT: vmovdqa %xmm0, 16(%rsi) +; AVX-NEXT: vmovdqa %xmm0, 32(%rsi) +; AVX-NEXT: vmovdqa %xmm0, 48(%rsi) +; AVX-NEXT: retq +; IR under test - loads one <2 x i64> subvector, negates it (sub from zeroinitializer), and stores the result to 4 consecutive <2 x i64> slots of %out.vec.ptr, 64 bytes in total. + %in.subvec.neg = load <2 x i64>, ptr %in.subvec.ptr, align 64 + %in.subvec = sub <2 x i64> zeroinitializer, %in.subvec.neg + %out.subvec0.ptr = getelementptr <2 x i64>, ptr %out.vec.ptr, i64 0 + store <2 x i64> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <2 x i64>, ptr %out.vec.ptr, i64 1 + store <2 x i64> %in.subvec, ptr %out.subvec1.ptr, align 16 + %out.subvec2.ptr = getelementptr <2 x i64>, ptr %out.vec.ptr, i64 2 + store <2 x i64> %in.subvec, ptr %out.subvec2.ptr, align 32 + %out.subvec3.ptr = getelementptr <2 x i64>, ptr %out.vec.ptr, i64 3 + store <2 x i64> %in.subvec, ptr %out.subvec3.ptr, align 16 + ret void +} + +define void @vec512_v2f64(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec512_v2f64: +; SCALAR: # %bb.0: +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: subq (%rdi), %rcx +; SCALAR-NEXT: subq 8(%rdi), %rax +; SCALAR-NEXT: movq %rax, 8(%rsi) +; SCALAR-NEXT: movq %rcx, (%rsi) +; SCALAR-NEXT: movq %rax, 24(%rsi) +; SCALAR-NEXT: movq %rcx, 16(%rsi) +; SCALAR-NEXT: movq %rax, 40(%rsi) +; SCALAR-NEXT: movq %rcx, 32(%rsi) +; SCALAR-NEXT: movq %rax, 56(%rsi) +; SCALAR-NEXT: movq %rcx, 48(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec512_v2f64: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: psubq (%rdi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, (%rsi) +; SSE2-NEXT: movdqa %xmm0, 16(%rsi) +; SSE2-NEXT: movdqa %xmm0, 32(%rsi) +; SSE2-NEXT: movdqa %xmm0, 48(%rsi) +; SSE2-NEXT: retq +; +; AVX-LABEL: vec512_v2f64: +; AVX: # %bb.0: +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpsubq (%rdi), %xmm0, 
%xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rsi) +; AVX-NEXT: vmovdqa %xmm0, 16(%rsi) +; AVX-NEXT: vmovdqa %xmm0, 32(%rsi) +; AVX-NEXT: vmovdqa %xmm0, 48(%rsi) +; AVX-NEXT: retq +; IR under test - loads one <2 x i64> subvector, negates it as integers (sub from zeroinitializer), bitcasts the result to <2 x double>, and stores it to 4 consecutive <2 x double> slots of %out.vec.ptr, 64 bytes in total. + %in.subvec.neg = load <2 x i64>, ptr %in.subvec.ptr, align 64 + %in.subvec.int = sub <2 x i64> zeroinitializer, %in.subvec.neg + %in.subvec = bitcast <2 x i64> %in.subvec.int to <2 x double> + %out.subvec0.ptr = getelementptr <2 x double>, ptr %out.vec.ptr, i64 0 + store <2 x double> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <2 x double>, ptr %out.vec.ptr, i64 1 + store <2 x double> %in.subvec, ptr %out.subvec1.ptr, align 16 + %out.subvec2.ptr = getelementptr <2 x double>, ptr %out.vec.ptr, i64 2 + store <2 x double> %in.subvec, ptr %out.subvec2.ptr, align 32 + %out.subvec3.ptr = getelementptr <2 x double>, ptr %out.vec.ptr, i64 3 + store <2 x double> %in.subvec, ptr %out.subvec3.ptr, align 16 + ret void +} + +define void @vec512_v2i128(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; ALL-LABEL: vec512_v2i128: +; ALL: # %bb.0: +; ALL-NEXT: xorl %eax, %eax +; ALL-NEXT: xorl %ecx, %ecx +; ALL-NEXT: subq (%rdi), %rcx +; ALL-NEXT: movl $0, %edx +; ALL-NEXT: sbbq 8(%rdi), %rdx +; ALL-NEXT: xorl %r8d, %r8d +; ALL-NEXT: subq 16(%rdi), %r8 +; ALL-NEXT: sbbq 24(%rdi), %rax +; ALL-NEXT: movq %r8, 16(%rsi) +; ALL-NEXT: movq %rcx, (%rsi) +; ALL-NEXT: movq %rax, 24(%rsi) +; ALL-NEXT: movq %rdx, 8(%rsi) +; ALL-NEXT: movq %r8, 48(%rsi) +; ALL-NEXT: movq %rax, 56(%rsi) +; ALL-NEXT: movq %rcx, 32(%rsi) +; ALL-NEXT: movq %rdx, 40(%rsi) +; ALL-NEXT: retq +; IR under test - loads one <2 x i128> subvector, negates it (sub from zeroinitializer), and stores the result to 2 consecutive <2 x i128> slots of %out.vec.ptr, 64 bytes in total. + %in.subvec.neg = load <2 x i128>, ptr %in.subvec.ptr, align 64 + %in.subvec = sub <2 x i128> zeroinitializer, %in.subvec.neg + %out.subvec0.ptr = getelementptr <2 x i128>, ptr %out.vec.ptr, i64 0 + store <2 x i128> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <2 x i128>, ptr %out.vec.ptr, i64 1 + store <2 x i128> %in.subvec, ptr %out.subvec1.ptr, align 32 + ret void +} + +define void 
@vec512_v4i8(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec512_v4i8: +; SCALAR: # %bb.0: +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: subb (%rdi), %al +; SCALAR-NEXT: xorl %edx, %edx +; SCALAR-NEXT: subb 1(%rdi), %dl +; SCALAR-NEXT: xorl %r8d, %r8d +; SCALAR-NEXT: subb 2(%rdi), %r8b +; SCALAR-NEXT: subb 3(%rdi), %cl +; SCALAR-NEXT: movb %cl, 3(%rsi) +; SCALAR-NEXT: movb %r8b, 2(%rsi) +; SCALAR-NEXT: movb %dl, 1(%rsi) +; SCALAR-NEXT: movb %al, (%rsi) +; SCALAR-NEXT: movb %cl, 7(%rsi) +; SCALAR-NEXT: movb %r8b, 6(%rsi) +; SCALAR-NEXT: movb %dl, 5(%rsi) +; SCALAR-NEXT: movb %al, 4(%rsi) +; SCALAR-NEXT: movb %cl, 11(%rsi) +; SCALAR-NEXT: movb %r8b, 10(%rsi) +; SCALAR-NEXT: movb %dl, 9(%rsi) +; SCALAR-NEXT: movb %al, 8(%rsi) +; SCALAR-NEXT: movb %cl, 15(%rsi) +; SCALAR-NEXT: movb %r8b, 14(%rsi) +; SCALAR-NEXT: movb %dl, 13(%rsi) +; SCALAR-NEXT: movb %al, 12(%rsi) +; SCALAR-NEXT: movb %cl, 19(%rsi) +; SCALAR-NEXT: movb %r8b, 18(%rsi) +; SCALAR-NEXT: movb %dl, 17(%rsi) +; SCALAR-NEXT: movb %al, 16(%rsi) +; SCALAR-NEXT: movb %cl, 23(%rsi) +; SCALAR-NEXT: movb %r8b, 22(%rsi) +; SCALAR-NEXT: movb %dl, 21(%rsi) +; SCALAR-NEXT: movb %al, 20(%rsi) +; SCALAR-NEXT: movb %cl, 27(%rsi) +; SCALAR-NEXT: movb %r8b, 26(%rsi) +; SCALAR-NEXT: movb %dl, 25(%rsi) +; SCALAR-NEXT: movb %al, 24(%rsi) +; SCALAR-NEXT: movb %cl, 31(%rsi) +; SCALAR-NEXT: movb %r8b, 30(%rsi) +; SCALAR-NEXT: movb %dl, 29(%rsi) +; SCALAR-NEXT: movb %al, 28(%rsi) +; SCALAR-NEXT: movb %cl, 35(%rsi) +; SCALAR-NEXT: movb %r8b, 34(%rsi) +; SCALAR-NEXT: movb %dl, 33(%rsi) +; SCALAR-NEXT: movb %al, 32(%rsi) +; SCALAR-NEXT: movb %cl, 39(%rsi) +; SCALAR-NEXT: movb %r8b, 38(%rsi) +; SCALAR-NEXT: movb %dl, 37(%rsi) +; SCALAR-NEXT: movb %al, 36(%rsi) +; SCALAR-NEXT: movb %cl, 43(%rsi) +; SCALAR-NEXT: movb %r8b, 42(%rsi) +; SCALAR-NEXT: movb %dl, 41(%rsi) +; SCALAR-NEXT: movb %al, 40(%rsi) +; SCALAR-NEXT: movb %cl, 47(%rsi) +; SCALAR-NEXT: movb %r8b, 46(%rsi) +; 
SCALAR-NEXT: movb %dl, 45(%rsi) +; SCALAR-NEXT: movb %al, 44(%rsi) +; SCALAR-NEXT: movb %cl, 51(%rsi) +; SCALAR-NEXT: movb %r8b, 50(%rsi) +; SCALAR-NEXT: movb %dl, 49(%rsi) +; SCALAR-NEXT: movb %al, 48(%rsi) +; SCALAR-NEXT: movb %cl, 55(%rsi) +; SCALAR-NEXT: movb %r8b, 54(%rsi) +; SCALAR-NEXT: movb %dl, 53(%rsi) +; SCALAR-NEXT: movb %al, 52(%rsi) +; SCALAR-NEXT: movb %cl, 59(%rsi) +; SCALAR-NEXT: movb %r8b, 58(%rsi) +; SCALAR-NEXT: movb %dl, 57(%rsi) +; SCALAR-NEXT: movb %al, 56(%rsi) +; SCALAR-NEXT: movb %cl, 63(%rsi) +; SCALAR-NEXT: movb %r8b, 62(%rsi) +; SCALAR-NEXT: movb %dl, 61(%rsi) +; SCALAR-NEXT: movb %al, 60(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec512_v4i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[0,0,0,0] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psubb %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, (%rsi) +; SSE2-NEXT: movdqa %xmm1, 16(%rsi) +; SSE2-NEXT: movdqa %xmm1, 32(%rsi) +; SSE2-NEXT: movdqa %xmm1, 48(%rsi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: vec512_v4i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,0,0,0] +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpsubb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vmovaps %ymm0, (%rsi) +; AVX1-NEXT: vmovaps %ymm0, 32(%rsi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-ONLY-LABEL: vec512_v4i8: +; AVX2-ONLY: # %bb.0: +; AVX2-ONLY-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vpsubb (%rdi), %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vpbroadcastd %xmm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rsi) +; AVX2-ONLY-NEXT: vzeroupper +; AVX2-ONLY-NEXT: retq +; +; AVX512-LABEL: vec512_v4i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpsubb (%rdi), %xmm0, %xmm0 +; AVX512-NEXT: vpbroadcastd %xmm0, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm0, (%rsi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq +; IR under test - loads one <4 x i8> subvector, negates it (sub from zeroinitializer), and stores the result to 16 consecutive <4 x i8> slots of %out.vec.ptr, 64 bytes in total. + %in.subvec.neg = load <4 x i8>, ptr %in.subvec.ptr, align 64 + 
%in.subvec = sub <4 x i8> zeroinitializer, %in.subvec.neg + %out.subvec0.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 0 + store <4 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 1 + store <4 x i8> %in.subvec, ptr %out.subvec1.ptr, align 4 + %out.subvec2.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 2 + store <4 x i8> %in.subvec, ptr %out.subvec2.ptr, align 8 + %out.subvec3.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 3 + store <4 x i8> %in.subvec, ptr %out.subvec3.ptr, align 4 + %out.subvec4.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 4 + store <4 x i8> %in.subvec, ptr %out.subvec4.ptr, align 16 + %out.subvec5.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 5 + store <4 x i8> %in.subvec, ptr %out.subvec5.ptr, align 4 + %out.subvec6.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 6 + store <4 x i8> %in.subvec, ptr %out.subvec6.ptr, align 8 + %out.subvec7.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 7 + store <4 x i8> %in.subvec, ptr %out.subvec7.ptr, align 4 + %out.subvec8.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 8 + store <4 x i8> %in.subvec, ptr %out.subvec8.ptr, align 32 + %out.subvec9.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 9 + store <4 x i8> %in.subvec, ptr %out.subvec9.ptr, align 4 + %out.subvec10.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 10 + store <4 x i8> %in.subvec, ptr %out.subvec10.ptr, align 8 + %out.subvec11.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 11 + store <4 x i8> %in.subvec, ptr %out.subvec11.ptr, align 4 + %out.subvec12.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 12 + store <4 x i8> %in.subvec, ptr %out.subvec12.ptr, align 16 + %out.subvec13.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 13 + store <4 x i8> %in.subvec, ptr %out.subvec13.ptr, align 4 + %out.subvec14.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 14 + store <4 x i8> %in.subvec, ptr %out.subvec14.ptr, align 8 + 
%out.subvec15.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 15 + store <4 x i8> %in.subvec, ptr %out.subvec15.ptr, align 4 + ret void +} + +define void @vec512_v4i16(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec512_v4i16: +; SCALAR: # %bb.0: +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: subw (%rdi), %ax +; SCALAR-NEXT: xorl %edx, %edx +; SCALAR-NEXT: subw 2(%rdi), %dx +; SCALAR-NEXT: xorl %r8d, %r8d +; SCALAR-NEXT: subw 4(%rdi), %r8w +; SCALAR-NEXT: subw 6(%rdi), %cx +; SCALAR-NEXT: movw %cx, 6(%rsi) +; SCALAR-NEXT: movw %r8w, 4(%rsi) +; SCALAR-NEXT: movw %dx, 2(%rsi) +; SCALAR-NEXT: movw %ax, (%rsi) +; SCALAR-NEXT: movw %cx, 14(%rsi) +; SCALAR-NEXT: movw %r8w, 12(%rsi) +; SCALAR-NEXT: movw %dx, 10(%rsi) +; SCALAR-NEXT: movw %ax, 8(%rsi) +; SCALAR-NEXT: movw %cx, 22(%rsi) +; SCALAR-NEXT: movw %r8w, 20(%rsi) +; SCALAR-NEXT: movw %dx, 18(%rsi) +; SCALAR-NEXT: movw %ax, 16(%rsi) +; SCALAR-NEXT: movw %cx, 30(%rsi) +; SCALAR-NEXT: movw %r8w, 28(%rsi) +; SCALAR-NEXT: movw %dx, 26(%rsi) +; SCALAR-NEXT: movw %ax, 24(%rsi) +; SCALAR-NEXT: movw %cx, 38(%rsi) +; SCALAR-NEXT: movw %r8w, 36(%rsi) +; SCALAR-NEXT: movw %dx, 34(%rsi) +; SCALAR-NEXT: movw %ax, 32(%rsi) +; SCALAR-NEXT: movw %cx, 46(%rsi) +; SCALAR-NEXT: movw %r8w, 44(%rsi) +; SCALAR-NEXT: movw %dx, 42(%rsi) +; SCALAR-NEXT: movw %ax, 40(%rsi) +; SCALAR-NEXT: movw %cx, 54(%rsi) +; SCALAR-NEXT: movw %r8w, 52(%rsi) +; SCALAR-NEXT: movw %dx, 50(%rsi) +; SCALAR-NEXT: movw %ax, 48(%rsi) +; SCALAR-NEXT: movw %cx, 62(%rsi) +; SCALAR-NEXT: movw %r8w, 60(%rsi) +; SCALAR-NEXT: movw %dx, 58(%rsi) +; SCALAR-NEXT: movw %ax, 56(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec512_v4i16: +; SSE2: # %bb.0: +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psubw %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, (%rsi) +; SSE2-NEXT: movdqa %xmm1, 16(%rsi) +; SSE2-NEXT: movdqa %xmm1, 
32(%rsi) +; SSE2-NEXT: movdqa %xmm1, 48(%rsi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: vec512_v4i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vmovaps %ymm0, (%rsi) +; AVX1-NEXT: vmovaps %ymm0, 32(%rsi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-ONLY-LABEL: vec512_v4i16: +; AVX2-ONLY: # %bb.0: +; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-ONLY-NEXT: vpsubw %xmm0, %xmm1, %xmm0 +; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rsi) +; AVX2-ONLY-NEXT: vzeroupper +; AVX2-ONLY-NEXT: retq +; +; AVX512-LABEL: vec512_v4i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpsubw %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm0, (%rsi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq +; IR under test - loads one <4 x i16> subvector, negates it (sub from zeroinitializer), and stores the result to 8 consecutive <4 x i16> slots of %out.vec.ptr, 64 bytes in total. + %in.subvec.neg = load <4 x i16>, ptr %in.subvec.ptr, align 64 + %in.subvec = sub <4 x i16> zeroinitializer, %in.subvec.neg + %out.subvec0.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 0 + store <4 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 1 + store <4 x i16> %in.subvec, ptr %out.subvec1.ptr, align 8 + %out.subvec2.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 2 + store <4 x i16> %in.subvec, ptr %out.subvec2.ptr, align 16 + %out.subvec3.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 3 + store <4 x i16> %in.subvec, ptr %out.subvec3.ptr, align 8 + %out.subvec4.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 4 + store <4 x i16> %in.subvec, ptr %out.subvec4.ptr, align 32 + %out.subvec5.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 5 + store <4 x i16> %in.subvec, ptr 
%out.subvec5.ptr, align 8 + %out.subvec6.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 6 + store <4 x i16> %in.subvec, ptr %out.subvec6.ptr, align 16 + %out.subvec7.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 7 + store <4 x i16> %in.subvec, ptr %out.subvec7.ptr, align 8 + ret void +} + +define void @vec512_v4i32(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec512_v4i32: +; SCALAR: # %bb.0: +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: subl (%rdi), %ecx +; SCALAR-NEXT: xorl %edx, %edx +; SCALAR-NEXT: subl 4(%rdi), %edx +; SCALAR-NEXT: xorl %r8d, %r8d +; SCALAR-NEXT: subl 8(%rdi), %r8d +; SCALAR-NEXT: subl 12(%rdi), %eax +; SCALAR-NEXT: movl %eax, 12(%rsi) +; SCALAR-NEXT: movl %r8d, 8(%rsi) +; SCALAR-NEXT: movl %edx, 4(%rsi) +; SCALAR-NEXT: movl %ecx, (%rsi) +; SCALAR-NEXT: movl %eax, 28(%rsi) +; SCALAR-NEXT: movl %r8d, 24(%rsi) +; SCALAR-NEXT: movl %edx, 20(%rsi) +; SCALAR-NEXT: movl %ecx, 16(%rsi) +; SCALAR-NEXT: movl %eax, 44(%rsi) +; SCALAR-NEXT: movl %r8d, 40(%rsi) +; SCALAR-NEXT: movl %edx, 36(%rsi) +; SCALAR-NEXT: movl %ecx, 32(%rsi) +; SCALAR-NEXT: movl %eax, 60(%rsi) +; SCALAR-NEXT: movl %r8d, 56(%rsi) +; SCALAR-NEXT: movl %edx, 52(%rsi) +; SCALAR-NEXT: movl %ecx, 48(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec512_v4i32: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: psubd (%rdi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, (%rsi) +; SSE2-NEXT: movdqa %xmm0, 16(%rsi) +; SSE2-NEXT: movdqa %xmm0, 32(%rsi) +; SSE2-NEXT: movdqa %xmm0, 48(%rsi) +; SSE2-NEXT: retq +; +; AVX-LABEL: vec512_v4i32: +; AVX: # %bb.0: +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpsubd (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rsi) +; AVX-NEXT: vmovdqa %xmm0, 16(%rsi) +; AVX-NEXT: vmovdqa %xmm0, 32(%rsi) +; AVX-NEXT: vmovdqa %xmm0, 48(%rsi) +; AVX-NEXT: retq +; IR under test - loads one <4 x i32> subvector, negates it (sub from zeroinitializer), and stores the result to 4 consecutive <4 x i32> slots of %out.vec.ptr, 64 bytes in total. + %in.subvec.neg = load <4 x i32>, ptr %in.subvec.ptr, align 64 + %in.subvec = sub <4 x i32> zeroinitializer, %in.subvec.neg + 
%out.subvec0.ptr = getelementptr <4 x i32>, ptr %out.vec.ptr, i64 0 + store <4 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <4 x i32>, ptr %out.vec.ptr, i64 1 + store <4 x i32> %in.subvec, ptr %out.subvec1.ptr, align 16 + %out.subvec2.ptr = getelementptr <4 x i32>, ptr %out.vec.ptr, i64 2 + store <4 x i32> %in.subvec, ptr %out.subvec2.ptr, align 32 + %out.subvec3.ptr = getelementptr <4 x i32>, ptr %out.vec.ptr, i64 3 + store <4 x i32> %in.subvec, ptr %out.subvec3.ptr, align 16 + ret void +} + +define void @vec512_v4f32(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec512_v4f32: +; SCALAR: # %bb.0: +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: subl (%rdi), %ecx +; SCALAR-NEXT: xorl %edx, %edx +; SCALAR-NEXT: subl 4(%rdi), %edx +; SCALAR-NEXT: xorl %r8d, %r8d +; SCALAR-NEXT: subl 8(%rdi), %r8d +; SCALAR-NEXT: subl 12(%rdi), %eax +; SCALAR-NEXT: movl %eax, 12(%rsi) +; SCALAR-NEXT: movl %r8d, 8(%rsi) +; SCALAR-NEXT: movl %edx, 4(%rsi) +; SCALAR-NEXT: movl %ecx, (%rsi) +; SCALAR-NEXT: movl %eax, 28(%rsi) +; SCALAR-NEXT: movl %r8d, 24(%rsi) +; SCALAR-NEXT: movl %edx, 20(%rsi) +; SCALAR-NEXT: movl %ecx, 16(%rsi) +; SCALAR-NEXT: movl %eax, 44(%rsi) +; SCALAR-NEXT: movl %r8d, 40(%rsi) +; SCALAR-NEXT: movl %edx, 36(%rsi) +; SCALAR-NEXT: movl %ecx, 32(%rsi) +; SCALAR-NEXT: movl %eax, 60(%rsi) +; SCALAR-NEXT: movl %r8d, 56(%rsi) +; SCALAR-NEXT: movl %edx, 52(%rsi) +; SCALAR-NEXT: movl %ecx, 48(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec512_v4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: psubd (%rdi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, (%rsi) +; SSE2-NEXT: movdqa %xmm0, 16(%rsi) +; SSE2-NEXT: movdqa %xmm0, 32(%rsi) +; SSE2-NEXT: movdqa %xmm0, 48(%rsi) +; SSE2-NEXT: retq +; +; AVX-LABEL: vec512_v4f32: +; AVX: # %bb.0: +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpsubd (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rsi) +; AVX-NEXT: vmovdqa %xmm0, 
16(%rsi) +; AVX-NEXT: vmovdqa %xmm0, 32(%rsi) +; AVX-NEXT: vmovdqa %xmm0, 48(%rsi) +; AVX-NEXT: retq +; IR under test - loads one <4 x i32> subvector, negates it as integers (sub from zeroinitializer), bitcasts the result to <4 x float>, and stores it to 4 consecutive <4 x float> slots of %out.vec.ptr, 64 bytes in total. + %in.subvec.neg = load <4 x i32>, ptr %in.subvec.ptr, align 64 + %in.subvec.int = sub <4 x i32> zeroinitializer, %in.subvec.neg + %in.subvec = bitcast <4 x i32> %in.subvec.int to <4 x float> + %out.subvec0.ptr = getelementptr <4 x float>, ptr %out.vec.ptr, i64 0 + store <4 x float> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <4 x float>, ptr %out.vec.ptr, i64 1 + store <4 x float> %in.subvec, ptr %out.subvec1.ptr, align 16 + %out.subvec2.ptr = getelementptr <4 x float>, ptr %out.vec.ptr, i64 2 + store <4 x float> %in.subvec, ptr %out.subvec2.ptr, align 32 + %out.subvec3.ptr = getelementptr <4 x float>, ptr %out.vec.ptr, i64 3 + store <4 x float> %in.subvec, ptr %out.subvec3.ptr, align 16 + ret void +} + +define void @vec512_v4i64(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec512_v4i64: +; SCALAR: # %bb.0: +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: subq (%rdi), %rcx +; SCALAR-NEXT: xorl %edx, %edx +; SCALAR-NEXT: subq 8(%rdi), %rdx +; SCALAR-NEXT: xorl %r8d, %r8d +; SCALAR-NEXT: subq 16(%rdi), %r8 +; SCALAR-NEXT: subq 24(%rdi), %rax +; SCALAR-NEXT: movq %rax, 24(%rsi) +; SCALAR-NEXT: movq %r8, 16(%rsi) +; SCALAR-NEXT: movq %rdx, 8(%rsi) +; SCALAR-NEXT: movq %rcx, (%rsi) +; SCALAR-NEXT: movq %rax, 56(%rsi) +; SCALAR-NEXT: movq %r8, 48(%rsi) +; SCALAR-NEXT: movq %rdx, 40(%rsi) +; SCALAR-NEXT: movq %rcx, 32(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec512_v4i64: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psubq 16(%rdi), %xmm1 +; SSE2-NEXT: psubq (%rdi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, (%rsi) +; SSE2-NEXT: movdqa %xmm1, 16(%rsi) +; SSE2-NEXT: movdqa %xmm1, 48(%rsi) +; SSE2-NEXT: movdqa %xmm0, 32(%rsi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: vec512_v4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; 
AVX1-NEXT: vpsubq 16(%rdi), %xmm0, %xmm1 +; AVX1-NEXT: vpsubq (%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vmovaps %ymm0, (%rsi) +; AVX1-NEXT: vmovaps %ymm0, 32(%rsi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec512_v4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpsubq (%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-NEXT: vmovdqa %ymm0, 32(%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq + %in.subvec.neg = load <4 x i64>, ptr %in.subvec.ptr, align 64 + %in.subvec = sub <4 x i64> zeroinitializer, %in.subvec.neg + %out.subvec0.ptr = getelementptr <4 x i64>, ptr %out.vec.ptr, i64 0 + store <4 x i64> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <4 x i64>, ptr %out.vec.ptr, i64 1 + store <4 x i64> %in.subvec, ptr %out.subvec1.ptr, align 32 + ret void +} + +define void @vec512_v4f64(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec512_v4f64: +; SCALAR: # %bb.0: +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: subq (%rdi), %rcx +; SCALAR-NEXT: xorl %edx, %edx +; SCALAR-NEXT: subq 8(%rdi), %rdx +; SCALAR-NEXT: xorl %r8d, %r8d +; SCALAR-NEXT: subq 16(%rdi), %r8 +; SCALAR-NEXT: subq 24(%rdi), %rax +; SCALAR-NEXT: movq %rax, 24(%rsi) +; SCALAR-NEXT: movq %r8, 16(%rsi) +; SCALAR-NEXT: movq %rdx, 8(%rsi) +; SCALAR-NEXT: movq %rcx, (%rsi) +; SCALAR-NEXT: movq %rax, 56(%rsi) +; SCALAR-NEXT: movq %r8, 48(%rsi) +; SCALAR-NEXT: movq %rdx, 40(%rsi) +; SCALAR-NEXT: movq %rcx, 32(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec512_v4f64: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psubq 16(%rdi), %xmm1 +; SSE2-NEXT: psubq (%rdi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, (%rsi) +; SSE2-NEXT: movdqa %xmm1, 16(%rsi) +; SSE2-NEXT: movdqa %xmm1, 48(%rsi) +; SSE2-NEXT: movdqa %xmm0, 32(%rsi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: vec512_v4f64: +; AVX1: # 
%bb.0: +; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq 16(%rdi), %xmm0, %xmm1 +; AVX1-NEXT: vpsubq (%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vmovaps %ymm0, (%rsi) +; AVX1-NEXT: vmovaps %ymm0, 32(%rsi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec512_v4f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpsubq (%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-NEXT: vmovdqa %ymm0, 32(%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq + %in.subvec.neg = load <4 x i64>, ptr %in.subvec.ptr, align 64 + %in.subvec.int = sub <4 x i64> zeroinitializer, %in.subvec.neg + %in.subvec = bitcast <4 x i64> %in.subvec.int to <4 x double> + %out.subvec0.ptr = getelementptr <4 x double>, ptr %out.vec.ptr, i64 0 + store <4 x double> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <4 x double>, ptr %out.vec.ptr, i64 1 + store <4 x double> %in.subvec, ptr %out.subvec1.ptr, align 32 + ret void +} + +define void @vec512_v8i8(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec512_v8i8: +; SCALAR: # %bb.0: +; SCALAR-NEXT: pushq %rbx +; SCALAR-NEXT: xorl %edx, %edx +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: subb (%rdi), %al +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: subb 1(%rdi), %cl +; SCALAR-NEXT: xorl %r8d, %r8d +; SCALAR-NEXT: subb 2(%rdi), %r8b +; SCALAR-NEXT: xorl %r9d, %r9d +; SCALAR-NEXT: subb 3(%rdi), %r9b +; SCALAR-NEXT: xorl %r10d, %r10d +; SCALAR-NEXT: subb 4(%rdi), %r10b +; SCALAR-NEXT: xorl %r11d, %r11d +; SCALAR-NEXT: subb 5(%rdi), %r11b +; SCALAR-NEXT: xorl %ebx, %ebx +; SCALAR-NEXT: subb 6(%rdi), %bl +; SCALAR-NEXT: subb 7(%rdi), %dl +; SCALAR-NEXT: movb %dl, 7(%rsi) +; SCALAR-NEXT: movb %bl, 6(%rsi) +; SCALAR-NEXT: movb %r11b, 5(%rsi) +; SCALAR-NEXT: movb %r10b, 4(%rsi) +; SCALAR-NEXT: movb %r9b, 3(%rsi) +; SCALAR-NEXT: movb %r8b, 2(%rsi) +; SCALAR-NEXT: movb %cl, 1(%rsi) +; SCALAR-NEXT: movb %al, (%rsi) 
+; SCALAR-NEXT: movb %dl, 15(%rsi) +; SCALAR-NEXT: movb %bl, 14(%rsi) +; SCALAR-NEXT: movb %r11b, 13(%rsi) +; SCALAR-NEXT: movb %r10b, 12(%rsi) +; SCALAR-NEXT: movb %r9b, 11(%rsi) +; SCALAR-NEXT: movb %r8b, 10(%rsi) +; SCALAR-NEXT: movb %cl, 9(%rsi) +; SCALAR-NEXT: movb %al, 8(%rsi) +; SCALAR-NEXT: movb %dl, 23(%rsi) +; SCALAR-NEXT: movb %bl, 22(%rsi) +; SCALAR-NEXT: movb %r11b, 21(%rsi) +; SCALAR-NEXT: movb %r10b, 20(%rsi) +; SCALAR-NEXT: movb %r9b, 19(%rsi) +; SCALAR-NEXT: movb %r8b, 18(%rsi) +; SCALAR-NEXT: movb %cl, 17(%rsi) +; SCALAR-NEXT: movb %al, 16(%rsi) +; SCALAR-NEXT: movb %dl, 31(%rsi) +; SCALAR-NEXT: movb %bl, 30(%rsi) +; SCALAR-NEXT: movb %r11b, 29(%rsi) +; SCALAR-NEXT: movb %r10b, 28(%rsi) +; SCALAR-NEXT: movb %r9b, 27(%rsi) +; SCALAR-NEXT: movb %r8b, 26(%rsi) +; SCALAR-NEXT: movb %cl, 25(%rsi) +; SCALAR-NEXT: movb %al, 24(%rsi) +; SCALAR-NEXT: movb %dl, 39(%rsi) +; SCALAR-NEXT: movb %bl, 38(%rsi) +; SCALAR-NEXT: movb %r11b, 37(%rsi) +; SCALAR-NEXT: movb %r10b, 36(%rsi) +; SCALAR-NEXT: movb %r9b, 35(%rsi) +; SCALAR-NEXT: movb %r8b, 34(%rsi) +; SCALAR-NEXT: movb %cl, 33(%rsi) +; SCALAR-NEXT: movb %al, 32(%rsi) +; SCALAR-NEXT: movb %dl, 47(%rsi) +; SCALAR-NEXT: movb %bl, 46(%rsi) +; SCALAR-NEXT: movb %r11b, 45(%rsi) +; SCALAR-NEXT: movb %r10b, 44(%rsi) +; SCALAR-NEXT: movb %r9b, 43(%rsi) +; SCALAR-NEXT: movb %r8b, 42(%rsi) +; SCALAR-NEXT: movb %cl, 41(%rsi) +; SCALAR-NEXT: movb %al, 40(%rsi) +; SCALAR-NEXT: movb %dl, 55(%rsi) +; SCALAR-NEXT: movb %bl, 54(%rsi) +; SCALAR-NEXT: movb %r11b, 53(%rsi) +; SCALAR-NEXT: movb %r10b, 52(%rsi) +; SCALAR-NEXT: movb %r9b, 51(%rsi) +; SCALAR-NEXT: movb %r8b, 50(%rsi) +; SCALAR-NEXT: movb %cl, 49(%rsi) +; SCALAR-NEXT: movb %al, 48(%rsi) +; SCALAR-NEXT: movb %dl, 63(%rsi) +; SCALAR-NEXT: movb %bl, 62(%rsi) +; SCALAR-NEXT: movb %r11b, 61(%rsi) +; SCALAR-NEXT: movb %r10b, 60(%rsi) +; SCALAR-NEXT: movb %r9b, 59(%rsi) +; SCALAR-NEXT: movb %r8b, 58(%rsi) +; SCALAR-NEXT: movb %cl, 57(%rsi) +; SCALAR-NEXT: movb %al, 56(%rsi) 
+; SCALAR-NEXT: popq %rbx +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec512_v8i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psubb %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, (%rsi) +; SSE2-NEXT: movdqa %xmm1, 16(%rsi) +; SSE2-NEXT: movdqa %xmm1, 32(%rsi) +; SSE2-NEXT: movdqa %xmm1, 48(%rsi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: vec512_v8i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpsubb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vmovaps %ymm0, (%rsi) +; AVX1-NEXT: vmovaps %ymm0, 32(%rsi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-ONLY-LABEL: vec512_v8i8: +; AVX2-ONLY: # %bb.0: +; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-ONLY-NEXT: vpsubb %xmm0, %xmm1, %xmm0 +; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rsi) +; AVX2-ONLY-NEXT: vzeroupper +; AVX2-ONLY-NEXT: retq +; +; AVX512-LABEL: vec512_v8i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpsubb %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm0, (%rsi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %in.subvec.neg = load <8 x i8>, ptr %in.subvec.ptr, align 64 + %in.subvec = sub <8 x i8> zeroinitializer, %in.subvec.neg + %out.subvec0.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 0 + store <8 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 1 + store <8 x i8> %in.subvec, ptr %out.subvec1.ptr, align 8 + %out.subvec2.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 2 + store <8 x i8> %in.subvec, ptr %out.subvec2.ptr, align 16 + 
%out.subvec3.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 3 + store <8 x i8> %in.subvec, ptr %out.subvec3.ptr, align 8 + %out.subvec4.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 4 + store <8 x i8> %in.subvec, ptr %out.subvec4.ptr, align 32 + %out.subvec5.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 5 + store <8 x i8> %in.subvec, ptr %out.subvec5.ptr, align 8 + %out.subvec6.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 6 + store <8 x i8> %in.subvec, ptr %out.subvec6.ptr, align 16 + %out.subvec7.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 7 + store <8 x i8> %in.subvec, ptr %out.subvec7.ptr, align 8 + ret void +} + +define void @vec512_v8i16(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec512_v8i16: +; SCALAR: # %bb.0: +; SCALAR-NEXT: pushq %rbx +; SCALAR-NEXT: xorl %edx, %edx +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: subw (%rdi), %ax +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: subw 2(%rdi), %cx +; SCALAR-NEXT: xorl %r8d, %r8d +; SCALAR-NEXT: subw 4(%rdi), %r8w +; SCALAR-NEXT: xorl %r9d, %r9d +; SCALAR-NEXT: subw 6(%rdi), %r9w +; SCALAR-NEXT: xorl %r10d, %r10d +; SCALAR-NEXT: subw 8(%rdi), %r10w +; SCALAR-NEXT: xorl %r11d, %r11d +; SCALAR-NEXT: subw 10(%rdi), %r11w +; SCALAR-NEXT: xorl %ebx, %ebx +; SCALAR-NEXT: subw 12(%rdi), %bx +; SCALAR-NEXT: subw 14(%rdi), %dx +; SCALAR-NEXT: movw %dx, 14(%rsi) +; SCALAR-NEXT: movw %bx, 12(%rsi) +; SCALAR-NEXT: movw %r11w, 10(%rsi) +; SCALAR-NEXT: movw %r10w, 8(%rsi) +; SCALAR-NEXT: movw %r9w, 6(%rsi) +; SCALAR-NEXT: movw %r8w, 4(%rsi) +; SCALAR-NEXT: movw %cx, 2(%rsi) +; SCALAR-NEXT: movw %ax, (%rsi) +; SCALAR-NEXT: movw %dx, 30(%rsi) +; SCALAR-NEXT: movw %bx, 28(%rsi) +; SCALAR-NEXT: movw %r11w, 26(%rsi) +; SCALAR-NEXT: movw %r10w, 24(%rsi) +; SCALAR-NEXT: movw %r9w, 22(%rsi) +; SCALAR-NEXT: movw %r8w, 20(%rsi) +; SCALAR-NEXT: movw %cx, 18(%rsi) +; SCALAR-NEXT: movw %ax, 16(%rsi) +; SCALAR-NEXT: movw %dx, 46(%rsi) +; SCALAR-NEXT: movw %bx, 44(%rsi) +; 
SCALAR-NEXT: movw %r11w, 42(%rsi) +; SCALAR-NEXT: movw %r10w, 40(%rsi) +; SCALAR-NEXT: movw %r9w, 38(%rsi) +; SCALAR-NEXT: movw %r8w, 36(%rsi) +; SCALAR-NEXT: movw %cx, 34(%rsi) +; SCALAR-NEXT: movw %ax, 32(%rsi) +; SCALAR-NEXT: movw %dx, 62(%rsi) +; SCALAR-NEXT: movw %bx, 60(%rsi) +; SCALAR-NEXT: movw %r11w, 58(%rsi) +; SCALAR-NEXT: movw %r10w, 56(%rsi) +; SCALAR-NEXT: movw %r9w, 54(%rsi) +; SCALAR-NEXT: movw %r8w, 52(%rsi) +; SCALAR-NEXT: movw %cx, 50(%rsi) +; SCALAR-NEXT: movw %ax, 48(%rsi) +; SCALAR-NEXT: popq %rbx +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec512_v8i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: psubw (%rdi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, (%rsi) +; SSE2-NEXT: movdqa %xmm0, 16(%rsi) +; SSE2-NEXT: movdqa %xmm0, 32(%rsi) +; SSE2-NEXT: movdqa %xmm0, 48(%rsi) +; SSE2-NEXT: retq +; +; AVX-LABEL: vec512_v8i16: +; AVX: # %bb.0: +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpsubw (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rsi) +; AVX-NEXT: vmovdqa %xmm0, 16(%rsi) +; AVX-NEXT: vmovdqa %xmm0, 32(%rsi) +; AVX-NEXT: vmovdqa %xmm0, 48(%rsi) +; AVX-NEXT: retq + %in.subvec.neg = load <8 x i16>, ptr %in.subvec.ptr, align 64 + %in.subvec = sub <8 x i16> zeroinitializer, %in.subvec.neg + %out.subvec0.ptr = getelementptr <8 x i16>, ptr %out.vec.ptr, i64 0 + store <8 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <8 x i16>, ptr %out.vec.ptr, i64 1 + store <8 x i16> %in.subvec, ptr %out.subvec1.ptr, align 16 + %out.subvec2.ptr = getelementptr <8 x i16>, ptr %out.vec.ptr, i64 2 + store <8 x i16> %in.subvec, ptr %out.subvec2.ptr, align 32 + %out.subvec3.ptr = getelementptr <8 x i16>, ptr %out.vec.ptr, i64 3 + store <8 x i16> %in.subvec, ptr %out.subvec3.ptr, align 16 + ret void +} + +define void @vec512_v8i32(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec512_v8i32: +; SCALAR: # %bb.0: +; SCALAR-NEXT: pushq %rbx +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: xorl %eax, 
%eax +; SCALAR-NEXT: subl (%rdi), %eax +; SCALAR-NEXT: xorl %edx, %edx +; SCALAR-NEXT: subl 4(%rdi), %edx +; SCALAR-NEXT: xorl %r8d, %r8d +; SCALAR-NEXT: subl 8(%rdi), %r8d +; SCALAR-NEXT: xorl %r9d, %r9d +; SCALAR-NEXT: subl 12(%rdi), %r9d +; SCALAR-NEXT: xorl %r10d, %r10d +; SCALAR-NEXT: subl 16(%rdi), %r10d +; SCALAR-NEXT: xorl %r11d, %r11d +; SCALAR-NEXT: subl 20(%rdi), %r11d +; SCALAR-NEXT: xorl %ebx, %ebx +; SCALAR-NEXT: subl 24(%rdi), %ebx +; SCALAR-NEXT: subl 28(%rdi), %ecx +; SCALAR-NEXT: movl %ecx, 28(%rsi) +; SCALAR-NEXT: movl %ebx, 24(%rsi) +; SCALAR-NEXT: movl %r11d, 20(%rsi) +; SCALAR-NEXT: movl %r10d, 16(%rsi) +; SCALAR-NEXT: movl %r9d, 12(%rsi) +; SCALAR-NEXT: movl %r8d, 8(%rsi) +; SCALAR-NEXT: movl %edx, 4(%rsi) +; SCALAR-NEXT: movl %eax, (%rsi) +; SCALAR-NEXT: movl %ecx, 60(%rsi) +; SCALAR-NEXT: movl %ebx, 56(%rsi) +; SCALAR-NEXT: movl %r11d, 52(%rsi) +; SCALAR-NEXT: movl %r10d, 48(%rsi) +; SCALAR-NEXT: movl %r9d, 44(%rsi) +; SCALAR-NEXT: movl %r8d, 40(%rsi) +; SCALAR-NEXT: movl %edx, 36(%rsi) +; SCALAR-NEXT: movl %eax, 32(%rsi) +; SCALAR-NEXT: popq %rbx +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec512_v8i32: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psubd 16(%rdi), %xmm1 +; SSE2-NEXT: psubd (%rdi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, (%rsi) +; SSE2-NEXT: movdqa %xmm1, 16(%rsi) +; SSE2-NEXT: movdqa %xmm1, 48(%rsi) +; SSE2-NEXT: movdqa %xmm0, 32(%rsi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: vec512_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpsubd 16(%rdi), %xmm0, %xmm1 +; AVX1-NEXT: vpsubd (%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vmovaps %ymm0, (%rsi) +; AVX1-NEXT: vmovaps %ymm0, 32(%rsi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec512_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpsubd (%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-NEXT: vmovdqa %ymm0, 
32(%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq + %in.subvec.neg = load <8 x i32>, ptr %in.subvec.ptr, align 64 + %in.subvec = sub <8 x i32> zeroinitializer, %in.subvec.neg + %out.subvec0.ptr = getelementptr <8 x i32>, ptr %out.vec.ptr, i64 0 + store <8 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <8 x i32>, ptr %out.vec.ptr, i64 1 + store <8 x i32> %in.subvec, ptr %out.subvec1.ptr, align 32 + ret void +} + +define void @vec512_v8f32(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec512_v8f32: +; SCALAR: # %bb.0: +; SCALAR-NEXT: pushq %rbx +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: subl (%rdi), %eax +; SCALAR-NEXT: xorl %edx, %edx +; SCALAR-NEXT: subl 4(%rdi), %edx +; SCALAR-NEXT: xorl %r8d, %r8d +; SCALAR-NEXT: subl 8(%rdi), %r8d +; SCALAR-NEXT: xorl %r9d, %r9d +; SCALAR-NEXT: subl 12(%rdi), %r9d +; SCALAR-NEXT: xorl %r10d, %r10d +; SCALAR-NEXT: subl 16(%rdi), %r10d +; SCALAR-NEXT: xorl %r11d, %r11d +; SCALAR-NEXT: subl 20(%rdi), %r11d +; SCALAR-NEXT: xorl %ebx, %ebx +; SCALAR-NEXT: subl 24(%rdi), %ebx +; SCALAR-NEXT: subl 28(%rdi), %ecx +; SCALAR-NEXT: movl %ecx, 28(%rsi) +; SCALAR-NEXT: movl %ebx, 24(%rsi) +; SCALAR-NEXT: movl %r11d, 20(%rsi) +; SCALAR-NEXT: movl %r10d, 16(%rsi) +; SCALAR-NEXT: movl %r9d, 12(%rsi) +; SCALAR-NEXT: movl %r8d, 8(%rsi) +; SCALAR-NEXT: movl %edx, 4(%rsi) +; SCALAR-NEXT: movl %eax, (%rsi) +; SCALAR-NEXT: movl %ecx, 60(%rsi) +; SCALAR-NEXT: movl %ebx, 56(%rsi) +; SCALAR-NEXT: movl %r11d, 52(%rsi) +; SCALAR-NEXT: movl %r10d, 48(%rsi) +; SCALAR-NEXT: movl %r9d, 44(%rsi) +; SCALAR-NEXT: movl %r8d, 40(%rsi) +; SCALAR-NEXT: movl %edx, 36(%rsi) +; SCALAR-NEXT: movl %eax, 32(%rsi) +; SCALAR-NEXT: popq %rbx +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec512_v8f32: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psubd 16(%rdi), %xmm1 +; SSE2-NEXT: psubd (%rdi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, (%rsi) 
+; SSE2-NEXT: movdqa %xmm1, 16(%rsi) +; SSE2-NEXT: movdqa %xmm1, 48(%rsi) +; SSE2-NEXT: movdqa %xmm0, 32(%rsi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: vec512_v8f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpsubd 16(%rdi), %xmm0, %xmm1 +; AVX1-NEXT: vpsubd (%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vmovaps %ymm0, (%rsi) +; AVX1-NEXT: vmovaps %ymm0, 32(%rsi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec512_v8f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpsubd (%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-NEXT: vmovdqa %ymm0, 32(%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq + %in.subvec.neg = load <8 x i32>, ptr %in.subvec.ptr, align 64 + %in.subvec.int = sub <8 x i32> zeroinitializer, %in.subvec.neg + %in.subvec = bitcast <8 x i32> %in.subvec.int to <8 x float> + %out.subvec0.ptr = getelementptr <8 x float>, ptr %out.vec.ptr, i64 0 + store <8 x float> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <8 x float>, ptr %out.vec.ptr, i64 1 + store <8 x float> %in.subvec, ptr %out.subvec1.ptr, align 32 + ret void +} + +define void @vec512_v16i8(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec512_v16i8: +; SCALAR: # %bb.0: +; SCALAR-NEXT: pushq %rbp +; SCALAR-NEXT: pushq %r15 +; SCALAR-NEXT: pushq %r14 +; SCALAR-NEXT: pushq %r13 +; SCALAR-NEXT: pushq %r12 +; SCALAR-NEXT: pushq %rbx +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: subb (%rdi), %al +; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: subb 1(%rdi), %al +; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: subb 2(%rdi), %al +; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: subb 3(%rdi), %al +; SCALAR-NEXT: movl %eax, 
{{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: subb 4(%rdi), %cl +; SCALAR-NEXT: xorl %edx, %edx +; SCALAR-NEXT: subb 5(%rdi), %dl +; SCALAR-NEXT: xorl %r8d, %r8d +; SCALAR-NEXT: subb 6(%rdi), %r8b +; SCALAR-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SCALAR-NEXT: xorl %r9d, %r9d +; SCALAR-NEXT: subb 7(%rdi), %r9b +; SCALAR-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SCALAR-NEXT: xorl %ebp, %ebp +; SCALAR-NEXT: subb 8(%rdi), %bpl +; SCALAR-NEXT: xorl %r14d, %r14d +; SCALAR-NEXT: subb 9(%rdi), %r14b +; SCALAR-NEXT: xorl %r15d, %r15d +; SCALAR-NEXT: subb 10(%rdi), %r15b +; SCALAR-NEXT: xorl %r12d, %r12d +; SCALAR-NEXT: subb 11(%rdi), %r12b +; SCALAR-NEXT: xorl %r13d, %r13d +; SCALAR-NEXT: subb 12(%rdi), %r13b +; SCALAR-NEXT: xorl %ebx, %ebx +; SCALAR-NEXT: subb 13(%rdi), %bl +; SCALAR-NEXT: xorl %r11d, %r11d +; SCALAR-NEXT: subb 14(%rdi), %r11b +; SCALAR-NEXT: xorl %r10d, %r10d +; SCALAR-NEXT: subb 15(%rdi), %r10b +; SCALAR-NEXT: movb %r10b, 15(%rsi) +; SCALAR-NEXT: movb %r11b, 14(%rsi) +; SCALAR-NEXT: movb %bl, 13(%rsi) +; SCALAR-NEXT: movb %r13b, 12(%rsi) +; SCALAR-NEXT: movb %r12b, 11(%rsi) +; SCALAR-NEXT: movb %r15b, 10(%rsi) +; SCALAR-NEXT: movb %r14b, 9(%rsi) +; SCALAR-NEXT: movb %bpl, 8(%rsi) +; SCALAR-NEXT: movb %r9b, 7(%rsi) +; SCALAR-NEXT: movb %r8b, 6(%rsi) +; SCALAR-NEXT: movb %dl, 5(%rsi) +; SCALAR-NEXT: movb %cl, 4(%rsi) +; SCALAR-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SCALAR-NEXT: movb %al, 3(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; SCALAR-NEXT: movb %al, 2(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Reload +; SCALAR-NEXT: movb %r8b, 1(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; SCALAR-NEXT: movb %dil, (%rsi) +; SCALAR-NEXT: movb %r10b, 31(%rsi) +; SCALAR-NEXT: movb %r11b, 30(%rsi) +; SCALAR-NEXT: movb %bl, 29(%rsi) +; SCALAR-NEXT: movb %r13b, 28(%rsi) +; SCALAR-NEXT: 
movb %r12b, 27(%rsi) +; SCALAR-NEXT: movb %r15b, 26(%rsi) +; SCALAR-NEXT: movb %r14b, 25(%rsi) +; SCALAR-NEXT: movb %bpl, 24(%rsi) +; SCALAR-NEXT: movb %r9b, 23(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Reload +; SCALAR-NEXT: movb %r9b, 22(%rsi) +; SCALAR-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SCALAR-NEXT: movb %dl, 21(%rsi) +; SCALAR-NEXT: movb %cl, 20(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload +; SCALAR-NEXT: movb %cl, 19(%rsi) +; SCALAR-NEXT: movb %al, 18(%rsi) +; SCALAR-NEXT: movb %r8b, 17(%rsi) +; SCALAR-NEXT: movb %dil, 16(%rsi) +; SCALAR-NEXT: movl %edi, %r8d +; SCALAR-NEXT: movb %r10b, 47(%rsi) +; SCALAR-NEXT: movb %r11b, 46(%rsi) +; SCALAR-NEXT: movb %bl, 45(%rsi) +; SCALAR-NEXT: movb %r13b, 44(%rsi) +; SCALAR-NEXT: movb %r12b, 43(%rsi) +; SCALAR-NEXT: movb %r15b, 42(%rsi) +; SCALAR-NEXT: movb %r14b, 41(%rsi) +; SCALAR-NEXT: movb %bpl, 40(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; SCALAR-NEXT: movb %dil, 39(%rsi) +; SCALAR-NEXT: movb %r9b, 38(%rsi) +; SCALAR-NEXT: movb %dl, 37(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload +; SCALAR-NEXT: movb %dl, 36(%rsi) +; SCALAR-NEXT: movb %cl, 35(%rsi) +; SCALAR-NEXT: movb %al, 34(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload +; SCALAR-NEXT: movb %cl, 33(%rsi) +; SCALAR-NEXT: movb %r8b, 32(%rsi) +; SCALAR-NEXT: movb %r10b, 63(%rsi) +; SCALAR-NEXT: movb %r11b, 62(%rsi) +; SCALAR-NEXT: movb %bl, 61(%rsi) +; SCALAR-NEXT: movb %r13b, 60(%rsi) +; SCALAR-NEXT: movb %r12b, 59(%rsi) +; SCALAR-NEXT: movb %r15b, 58(%rsi) +; SCALAR-NEXT: movb %r14b, 57(%rsi) +; SCALAR-NEXT: movb %bpl, 56(%rsi) +; SCALAR-NEXT: movb %dil, 55(%rsi) +; SCALAR-NEXT: movb %r9b, 54(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; SCALAR-NEXT: movb %dil, 53(%rsi) +; SCALAR-NEXT: movb %dl, 52(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte 
Reload +; SCALAR-NEXT: movb %dl, 51(%rsi) +; SCALAR-NEXT: movb %al, 50(%rsi) +; SCALAR-NEXT: movb %cl, 49(%rsi) +; SCALAR-NEXT: movb %r8b, 48(%rsi) +; SCALAR-NEXT: popq %rbx +; SCALAR-NEXT: popq %r12 +; SCALAR-NEXT: popq %r13 +; SCALAR-NEXT: popq %r14 +; SCALAR-NEXT: popq %r15 +; SCALAR-NEXT: popq %rbp +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec512_v16i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: psubb (%rdi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, (%rsi) +; SSE2-NEXT: movdqa %xmm0, 16(%rsi) +; SSE2-NEXT: movdqa %xmm0, 32(%rsi) +; SSE2-NEXT: movdqa %xmm0, 48(%rsi) +; SSE2-NEXT: retq +; +; AVX-LABEL: vec512_v16i8: +; AVX: # %bb.0: +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpsubb (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rsi) +; AVX-NEXT: vmovdqa %xmm0, 16(%rsi) +; AVX-NEXT: vmovdqa %xmm0, 32(%rsi) +; AVX-NEXT: vmovdqa %xmm0, 48(%rsi) +; AVX-NEXT: retq + %in.subvec.neg = load <16 x i8>, ptr %in.subvec.ptr, align 64 + %in.subvec = sub <16 x i8> zeroinitializer, %in.subvec.neg + %out.subvec0.ptr = getelementptr <16 x i8>, ptr %out.vec.ptr, i64 0 + store <16 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <16 x i8>, ptr %out.vec.ptr, i64 1 + store <16 x i8> %in.subvec, ptr %out.subvec1.ptr, align 16 + %out.subvec2.ptr = getelementptr <16 x i8>, ptr %out.vec.ptr, i64 2 + store <16 x i8> %in.subvec, ptr %out.subvec2.ptr, align 32 + %out.subvec3.ptr = getelementptr <16 x i8>, ptr %out.vec.ptr, i64 3 + store <16 x i8> %in.subvec, ptr %out.subvec3.ptr, align 16 + ret void +} + +define void @vec512_v16i16(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec512_v16i16: +; SCALAR: # %bb.0: +; SCALAR-NEXT: pushq %rbp +; SCALAR-NEXT: pushq %r15 +; SCALAR-NEXT: pushq %r14 +; SCALAR-NEXT: pushq %r13 +; SCALAR-NEXT: pushq %r12 +; SCALAR-NEXT: pushq %rbx +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: subw (%rdi), %ax +; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; 
SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: subw 2(%rdi), %ax +; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: subw 4(%rdi), %ax +; SCALAR-NEXT: movl %eax, %r8d +; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: subw 6(%rdi), %ax +; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: subw 8(%rdi), %ax +; SCALAR-NEXT: movl %eax, %r9d +; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SCALAR-NEXT: xorl %r10d, %r10d +; SCALAR-NEXT: subw 10(%rdi), %r10w +; SCALAR-NEXT: xorl %r11d, %r11d +; SCALAR-NEXT: subw 12(%rdi), %r11w +; SCALAR-NEXT: xorl %ebx, %ebx +; SCALAR-NEXT: subw 14(%rdi), %bx +; SCALAR-NEXT: xorl %ebp, %ebp +; SCALAR-NEXT: subw 16(%rdi), %bp +; SCALAR-NEXT: xorl %r14d, %r14d +; SCALAR-NEXT: subw 18(%rdi), %r14w +; SCALAR-NEXT: xorl %r15d, %r15d +; SCALAR-NEXT: subw 20(%rdi), %r15w +; SCALAR-NEXT: xorl %r12d, %r12d +; SCALAR-NEXT: subw 22(%rdi), %r12w +; SCALAR-NEXT: xorl %r13d, %r13d +; SCALAR-NEXT: subw 24(%rdi), %r13w +; SCALAR-NEXT: xorl %edx, %edx +; SCALAR-NEXT: subw 26(%rdi), %dx +; SCALAR-NEXT: xorl %ecx, %ecx +; SCALAR-NEXT: subw 28(%rdi), %cx +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: subw 30(%rdi), %ax +; SCALAR-NEXT: movw %ax, 30(%rsi) +; SCALAR-NEXT: movw %cx, 28(%rsi) +; SCALAR-NEXT: movw %dx, 26(%rsi) +; SCALAR-NEXT: movw %r13w, 24(%rsi) +; SCALAR-NEXT: movw %r12w, 22(%rsi) +; SCALAR-NEXT: movw %r15w, 20(%rsi) +; SCALAR-NEXT: movw %r14w, 18(%rsi) +; SCALAR-NEXT: movw %bp, 16(%rsi) +; SCALAR-NEXT: movw %bx, 14(%rsi) +; SCALAR-NEXT: movw %r11w, 12(%rsi) +; SCALAR-NEXT: movw %r10w, 10(%rsi) +; SCALAR-NEXT: movw %r9w, 8(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Reload +; SCALAR-NEXT: movw %r9w, 6(%rsi) +; SCALAR-NEXT: movw %r8w, 4(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Reload +; 
SCALAR-NEXT: movw %r8w, 2(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; SCALAR-NEXT: movw %di, (%rsi) +; SCALAR-NEXT: movw %ax, 62(%rsi) +; SCALAR-NEXT: movw %cx, 60(%rsi) +; SCALAR-NEXT: movw %dx, 58(%rsi) +; SCALAR-NEXT: movw %r13w, 56(%rsi) +; SCALAR-NEXT: movw %r12w, 54(%rsi) +; SCALAR-NEXT: movw %r15w, 52(%rsi) +; SCALAR-NEXT: movw %r14w, 50(%rsi) +; SCALAR-NEXT: movw %bp, 48(%rsi) +; SCALAR-NEXT: movw %bx, 46(%rsi) +; SCALAR-NEXT: movw %r11w, 44(%rsi) +; SCALAR-NEXT: movw %r10w, 42(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; SCALAR-NEXT: movw %ax, 40(%rsi) +; SCALAR-NEXT: movw %r9w, 38(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; SCALAR-NEXT: movw %ax, 36(%rsi) +; SCALAR-NEXT: movw %r8w, 34(%rsi) +; SCALAR-NEXT: movw %di, 32(%rsi) +; SCALAR-NEXT: popq %rbx +; SCALAR-NEXT: popq %r12 +; SCALAR-NEXT: popq %r13 +; SCALAR-NEXT: popq %r14 +; SCALAR-NEXT: popq %r15 +; SCALAR-NEXT: popq %rbp +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec512_v16i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psubw 16(%rdi), %xmm1 +; SSE2-NEXT: psubw (%rdi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, (%rsi) +; SSE2-NEXT: movdqa %xmm1, 16(%rsi) +; SSE2-NEXT: movdqa %xmm1, 48(%rsi) +; SSE2-NEXT: movdqa %xmm0, 32(%rsi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: vec512_v16i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpsubw 16(%rdi), %xmm0, %xmm1 +; AVX1-NEXT: vpsubw (%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vmovaps %ymm0, (%rsi) +; AVX1-NEXT: vmovaps %ymm0, 32(%rsi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec512_v16i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpsubw (%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-NEXT: vmovdqa %ymm0, 32(%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq + %in.subvec.neg = load <16 x 
i16>, ptr %in.subvec.ptr, align 64 + %in.subvec = sub <16 x i16> zeroinitializer, %in.subvec.neg + %out.subvec0.ptr = getelementptr <16 x i16>, ptr %out.vec.ptr, i64 0 + store <16 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <16 x i16>, ptr %out.vec.ptr, i64 1 + store <16 x i16> %in.subvec, ptr %out.subvec1.ptr, align 32 + ret void +} + +define void @vec512_v32i8(ptr %in.subvec.ptr, ptr %out.vec.ptr) nounwind { +; SCALAR-LABEL: vec512_v32i8: +; SCALAR: # %bb.0: +; SCALAR-NEXT: pushq %rbp +; SCALAR-NEXT: pushq %r15 +; SCALAR-NEXT: pushq %r14 +; SCALAR-NEXT: pushq %r13 +; SCALAR-NEXT: pushq %r12 +; SCALAR-NEXT: pushq %rbx +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: subb (%rdi), %al +; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: subb 1(%rdi), %al +; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: subb 2(%rdi), %al +; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: subb 3(%rdi), %al +; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: subb 4(%rdi), %al +; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: subb 5(%rdi), %al +; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: subb 6(%rdi), %al +; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: subb 7(%rdi), %al +; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: subb 8(%rdi), %al +; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: subb 9(%rdi), %al +; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; 
SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: subb 10(%rdi), %al +; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: subb 11(%rdi), %al +; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: subb 12(%rdi), %al +; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: subb 13(%rdi), %al +; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: subb 14(%rdi), %al +; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: subb 15(%rdi), %al +; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: subb 16(%rdi), %al +; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: subb 17(%rdi), %al +; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: subb 18(%rdi), %al +; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SCALAR-NEXT: xorl %r13d, %r13d +; SCALAR-NEXT: subb 19(%rdi), %r13b +; SCALAR-NEXT: xorl %r12d, %r12d +; SCALAR-NEXT: subb 20(%rdi), %r12b +; SCALAR-NEXT: xorl %r15d, %r15d +; SCALAR-NEXT: subb 21(%rdi), %r15b +; SCALAR-NEXT: xorl %r14d, %r14d +; SCALAR-NEXT: subb 22(%rdi), %r14b +; SCALAR-NEXT: xorl %ebp, %ebp +; SCALAR-NEXT: subb 23(%rdi), %bpl +; SCALAR-NEXT: xorl %ebx, %ebx +; SCALAR-NEXT: subb 24(%rdi), %bl +; SCALAR-NEXT: xorl %r11d, %r11d +; SCALAR-NEXT: subb 25(%rdi), %r11b +; SCALAR-NEXT: xorl %r10d, %r10d +; SCALAR-NEXT: subb 26(%rdi), %r10b +; SCALAR-NEXT: xorl %r9d, %r9d +; SCALAR-NEXT: subb 27(%rdi), %r9b +; SCALAR-NEXT: xorl %r8d, %r8d +; SCALAR-NEXT: subb 28(%rdi), %r8b +; SCALAR-NEXT: xorl %edx, %edx +; SCALAR-NEXT: subb 29(%rdi), %dl +; SCALAR-NEXT: xorl %ecx, %ecx 
+; SCALAR-NEXT: subb 30(%rdi), %cl +; SCALAR-NEXT: xorl %eax, %eax +; SCALAR-NEXT: subb 31(%rdi), %al +; SCALAR-NEXT: movb %al, 31(%rsi) +; SCALAR-NEXT: movb %cl, 30(%rsi) +; SCALAR-NEXT: movb %dl, 29(%rsi) +; SCALAR-NEXT: movb %r8b, 28(%rsi) +; SCALAR-NEXT: movb %r9b, 27(%rsi) +; SCALAR-NEXT: movb %r10b, 26(%rsi) +; SCALAR-NEXT: movb %r11b, 25(%rsi) +; SCALAR-NEXT: movb %bl, 24(%rsi) +; SCALAR-NEXT: movb %bpl, 23(%rsi) +; SCALAR-NEXT: movb %r14b, 22(%rsi) +; SCALAR-NEXT: movb %r15b, 21(%rsi) +; SCALAR-NEXT: movb %r12b, 20(%rsi) +; SCALAR-NEXT: movb %r13b, 19(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; SCALAR-NEXT: movb %dil, 18(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; SCALAR-NEXT: movb %dil, 17(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; SCALAR-NEXT: movb %dil, 16(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; SCALAR-NEXT: movb %dil, 15(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; SCALAR-NEXT: movb %dil, 14(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; SCALAR-NEXT: movb %dil, 13(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; SCALAR-NEXT: movb %dil, 12(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; SCALAR-NEXT: movb %dil, 11(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; SCALAR-NEXT: movb %dil, 10(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; SCALAR-NEXT: movb %dil, 9(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; SCALAR-NEXT: movb %dil, 8(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; SCALAR-NEXT: movb %dil, 7(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; SCALAR-NEXT: movb %dil, 6(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; 
SCALAR-NEXT: movb %dil, 5(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; SCALAR-NEXT: movb %dil, 4(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; SCALAR-NEXT: movb %dil, 3(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; SCALAR-NEXT: movb %dil, 2(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; SCALAR-NEXT: movb %dil, 1(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; SCALAR-NEXT: movb %dil, (%rsi) +; SCALAR-NEXT: movb %al, 63(%rsi) +; SCALAR-NEXT: movb %cl, 62(%rsi) +; SCALAR-NEXT: movb %dl, 61(%rsi) +; SCALAR-NEXT: movb %r8b, 60(%rsi) +; SCALAR-NEXT: movb %r9b, 59(%rsi) +; SCALAR-NEXT: movb %r10b, 58(%rsi) +; SCALAR-NEXT: movb %r11b, 57(%rsi) +; SCALAR-NEXT: movb %bl, 56(%rsi) +; SCALAR-NEXT: movb %bpl, 55(%rsi) +; SCALAR-NEXT: movb %r14b, 54(%rsi) +; SCALAR-NEXT: movb %r15b, 53(%rsi) +; SCALAR-NEXT: movb %r12b, 52(%rsi) +; SCALAR-NEXT: movb %r13b, 51(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; SCALAR-NEXT: movb %al, 50(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; SCALAR-NEXT: movb %al, 49(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; SCALAR-NEXT: movb %al, 48(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; SCALAR-NEXT: movb %al, 47(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; SCALAR-NEXT: movb %al, 46(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; SCALAR-NEXT: movb %al, 45(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; SCALAR-NEXT: movb %al, 44(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; SCALAR-NEXT: movb %al, 43(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; SCALAR-NEXT: movb %al, 42(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 
4-byte Reload +; SCALAR-NEXT: movb %al, 41(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; SCALAR-NEXT: movb %al, 40(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; SCALAR-NEXT: movb %al, 39(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; SCALAR-NEXT: movb %al, 38(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; SCALAR-NEXT: movb %al, 37(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; SCALAR-NEXT: movb %al, 36(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; SCALAR-NEXT: movb %al, 35(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; SCALAR-NEXT: movb %al, 34(%rsi) +; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; SCALAR-NEXT: movb %al, 33(%rsi) +; SCALAR-NEXT: movb %dil, 32(%rsi) +; SCALAR-NEXT: popq %rbx +; SCALAR-NEXT: popq %r12 +; SCALAR-NEXT: popq %r13 +; SCALAR-NEXT: popq %r14 +; SCALAR-NEXT: popq %r15 +; SCALAR-NEXT: popq %rbp +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec512_v32i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psubb 16(%rdi), %xmm1 +; SSE2-NEXT: psubb (%rdi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, (%rsi) +; SSE2-NEXT: movdqa %xmm1, 16(%rsi) +; SSE2-NEXT: movdqa %xmm1, 48(%rsi) +; SSE2-NEXT: movdqa %xmm0, 32(%rsi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: vec512_v32i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpsubb 16(%rdi), %xmm0, %xmm1 +; AVX1-NEXT: vpsubb (%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vmovaps %ymm0, (%rsi) +; AVX1-NEXT: vmovaps %ymm0, 32(%rsi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec512_v32i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpsubb (%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-NEXT: vmovdqa %ymm0, 32(%rsi) +; AVX2-NEXT: 
vzeroupper +; AVX2-NEXT: retq + %in.subvec.neg = load <32 x i8>, ptr %in.subvec.ptr, align 64 + %in.subvec = sub <32 x i8> zeroinitializer, %in.subvec.neg + %out.subvec0.ptr = getelementptr <32 x i8>, ptr %out.vec.ptr, i64 0 + store <32 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64 + %out.subvec1.ptr = getelementptr <32 x i8>, ptr %out.vec.ptr, i64 1 + store <32 x i8> %in.subvec, ptr %out.subvec1.ptr, align 32 + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; SSE: {{.*}} +; SSSE3: {{.*}} -- 2.7.4