From 471facf5468a33d1ddc995b3a0cbe077aa6152dc Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sat, 14 Jan 2023 20:09:28 +0000 Subject: [PATCH] [X86] Add absolute-difference vector tests X86 doesn't have absdiff instructions, but we should be able to canonicalize to avoid unnecessary scalarization/extensions/selects --- llvm/test/CodeGen/X86/abds-vector-128.ll | 1893 ++++++++++++++++++++++++++++++ llvm/test/CodeGen/X86/abds-vector-256.ll | 1757 +++++++++++++++++++++++++++ llvm/test/CodeGen/X86/abds-vector-512.ll | 768 ++++++++++++ llvm/test/CodeGen/X86/abdu-vector-128.ll | 1668 ++++++++++++++++++++++++++ llvm/test/CodeGen/X86/abdu-vector-256.ll | 1614 +++++++++++++++++++++++++ llvm/test/CodeGen/X86/abdu-vector-512.ll | 630 ++++++++++ 6 files changed, 8330 insertions(+) create mode 100644 llvm/test/CodeGen/X86/abds-vector-128.ll create mode 100644 llvm/test/CodeGen/X86/abds-vector-256.ll create mode 100644 llvm/test/CodeGen/X86/abds-vector-512.ll create mode 100644 llvm/test/CodeGen/X86/abdu-vector-128.ll create mode 100644 llvm/test/CodeGen/X86/abdu-vector-256.ll create mode 100644 llvm/test/CodeGen/X86/abdu-vector-512.ll diff --git a/llvm/test/CodeGen/X86/abds-vector-128.ll b/llvm/test/CodeGen/X86/abds-vector-128.ll new file mode 100644 index 0000000..41384b1 --- /dev/null +++ b/llvm/test/CodeGen/X86/abds-vector-128.ll @@ -0,0 +1,1893 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s --check-prefixes=SSE,SSE2 +; RUN: llc < %s -mtriple=x86_64-linux -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42 +; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 +; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512 + +; +; trunc(abs(sub(sext(a),sext(b)))) -> abds(a,b) +; + +define <16 x i8> @abd_ext_v16i8(<16 x 
i8> %a, <16 x i8> %b) nounwind { +; SSE2-LABEL: abd_ext_v16i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: psrad $24, %xmm3 +; SSE2-NEXT: pxor %xmm9, %xmm9 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; SSE2-NEXT: psrad $24, %xmm5 +; SSE2-NEXT: pxor %xmm6, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm2[8],xmm7[9],xmm2[9],xmm7[10],xmm2[10],xmm7[11],xmm2[11],xmm7[12],xmm2[12],xmm7[13],xmm2[13],xmm7[14],xmm2[14],xmm7[15],xmm2[15] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; SSE2-NEXT: psrad $24, %xmm6 +; SSE2-NEXT: pxor %xmm8, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm8 +; SSE2-NEXT: movdqa %xmm6, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm8[2],xmm6[3],xmm8[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; SSE2-NEXT: psrad $24, %xmm8 +; SSE2-NEXT: pxor %xmm10, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm10 +; SSE2-NEXT: movdqa %xmm8, %xmm7 +; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm8 = 
xmm8[2],xmm10[2],xmm8[3],xmm10[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3],xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; SSE2-NEXT: psrad $24, %xmm11 +; SSE2-NEXT: pxor %xmm12, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm11, %xmm12 +; SSE2-NEXT: movdqa %xmm11, %xmm13 +; SSE2-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] +; SSE2-NEXT: psubq %xmm13, %xmm0 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; SSE2-NEXT: psubq %xmm11, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad $24, %xmm10 +; SSE2-NEXT: pxor %xmm11, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm11 +; SSE2-NEXT: movdqa %xmm10, %xmm12 +; SSE2-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] +; SSE2-NEXT: psubq %xmm12, %xmm4 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; SSE2-NEXT: psubq %xmm10, %xmm5 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; SSE2-NEXT: psrad $24, %xmm10 +; SSE2-NEXT: pxor %xmm11, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm11 +; SSE2-NEXT: movdqa %xmm10, %xmm12 +; SSE2-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] +; SSE2-NEXT: psubq %xmm12, %xmm2 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; SSE2-NEXT: psubq %xmm10, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad $24, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm9 +; SSE2-NEXT: movdqa %xmm1, %xmm10 +; SSE2-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; SSE2-NEXT: psubq %xmm10, %xmm7 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = 
xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; SSE2-NEXT: psubq %xmm1, %xmm8 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: psubq %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm3 +; SSE2-NEXT: psubq %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: psubq %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm5, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm5 +; SSE2-NEXT: psubq %xmm1, %xmm5 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: psubq %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm6, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm6 +; SSE2-NEXT: psubq %xmm1, %xmm6 +; SSE2-NEXT: movdqa %xmm7, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm7 +; SSE2-NEXT: psubq %xmm1, %xmm7 +; SSE2-NEXT: movdqa %xmm8, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm8 +; SSE2-NEXT: psubq %xmm1, %xmm8 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSE2-NEXT: pand %xmm1, %xmm8 +; SSE2-NEXT: pand %xmm1, %xmm7 +; SSE2-NEXT: packuswb %xmm8, %xmm7 +; SSE2-NEXT: pand %xmm1, %xmm6 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: packuswb %xmm6, %xmm2 +; SSE2-NEXT: packuswb %xmm7, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm5 +; SSE2-NEXT: pand %xmm1, %xmm4 +; SSE2-NEXT: packuswb %xmm5, %xmm4 +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm1, %xmm0 +; 
SSE2-NEXT: packuswb %xmm3, %xmm0 +; SSE2-NEXT: packuswb %xmm4, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE42-LABEL: abd_ext_v16i8: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa %xmm0, %xmm2 +; SSE42-NEXT: psrld $16, %xmm2 +; SSE42-NEXT: pmovsxbq %xmm2, %xmm2 +; SSE42-NEXT: pmovsxbq %xmm0, %xmm9 +; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] +; SSE42-NEXT: pmovsxbq %xmm3, %xmm3 +; SSE42-NEXT: movdqa %xmm0, %xmm4 +; SSE42-NEXT: psrlq $48, %xmm4 +; SSE42-NEXT: pmovsxbq %xmm4, %xmm5 +; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; SSE42-NEXT: pmovsxbq %xmm4, %xmm4 +; SSE42-NEXT: movdqa %xmm0, %xmm6 +; SSE42-NEXT: psrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: pmovsxbq %xmm6, %xmm6 +; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] +; SSE42-NEXT: pmovsxbq %xmm7, %xmm7 +; SSE42-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: pmovsxbq %xmm0, %xmm8 +; SSE42-NEXT: movdqa %xmm1, %xmm0 +; SSE42-NEXT: psrld $16, %xmm0 +; SSE42-NEXT: pmovsxbq %xmm0, %xmm0 +; SSE42-NEXT: psubq %xmm0, %xmm2 +; SSE42-NEXT: pmovsxbq %xmm1, %xmm0 +; SSE42-NEXT: psubq %xmm0, %xmm9 +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE42-NEXT: pmovsxbq %xmm0, %xmm0 +; SSE42-NEXT: psubq %xmm0, %xmm3 +; SSE42-NEXT: movdqa %xmm1, %xmm0 +; SSE42-NEXT: psrlq $48, %xmm0 +; SSE42-NEXT: pmovsxbq %xmm0, %xmm0 +; SSE42-NEXT: psubq %xmm0, %xmm5 +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE42-NEXT: pmovsxbq %xmm0, %xmm0 +; SSE42-NEXT: psubq %xmm0, %xmm4 +; SSE42-NEXT: movdqa %xmm1, %xmm0 +; SSE42-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: pmovsxbq %xmm0, %xmm0 +; SSE42-NEXT: psubq %xmm0, %xmm6 +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] +; SSE42-NEXT: pmovsxbq %xmm0, %xmm0 +; SSE42-NEXT: psubq %xmm0, %xmm7 +; SSE42-NEXT: psrldq {{.*#+}} xmm1 
= xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: pmovsxbq %xmm1, %xmm0 +; SSE42-NEXT: psubq %xmm0, %xmm8 +; SSE42-NEXT: pxor %xmm1, %xmm1 +; SSE42-NEXT: pxor %xmm10, %xmm10 +; SSE42-NEXT: psubq %xmm2, %xmm10 +; SSE42-NEXT: movdqa %xmm2, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm10, %xmm2 +; SSE42-NEXT: pxor %xmm10, %xmm10 +; SSE42-NEXT: psubq %xmm9, %xmm10 +; SSE42-NEXT: movdqa %xmm9, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm10, %xmm9 +; SSE42-NEXT: pxor %xmm10, %xmm10 +; SSE42-NEXT: psubq %xmm3, %xmm10 +; SSE42-NEXT: movdqa %xmm3, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm10, %xmm3 +; SSE42-NEXT: pxor %xmm10, %xmm10 +; SSE42-NEXT: psubq %xmm5, %xmm10 +; SSE42-NEXT: movdqa %xmm5, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm10, %xmm5 +; SSE42-NEXT: pxor %xmm10, %xmm10 +; SSE42-NEXT: psubq %xmm4, %xmm10 +; SSE42-NEXT: movdqa %xmm4, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm10, %xmm4 +; SSE42-NEXT: pxor %xmm10, %xmm10 +; SSE42-NEXT: psubq %xmm6, %xmm10 +; SSE42-NEXT: movdqa %xmm6, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm10, %xmm6 +; SSE42-NEXT: pxor %xmm10, %xmm10 +; SSE42-NEXT: psubq %xmm7, %xmm10 +; SSE42-NEXT: movdqa %xmm7, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm10, %xmm7 +; SSE42-NEXT: psubq %xmm8, %xmm1 +; SSE42-NEXT: movdqa %xmm8, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm8 +; SSE42-NEXT: movapd {{.*#+}} xmm0 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSE42-NEXT: andpd %xmm0, %xmm8 +; SSE42-NEXT: andpd %xmm0, %xmm7 +; SSE42-NEXT: packusdw %xmm8, %xmm7 +; SSE42-NEXT: andpd %xmm0, %xmm6 +; SSE42-NEXT: andpd %xmm0, %xmm4 +; SSE42-NEXT: packusdw %xmm6, %xmm4 +; SSE42-NEXT: packusdw %xmm7, %xmm4 +; SSE42-NEXT: andpd %xmm0, %xmm5 +; SSE42-NEXT: andpd %xmm0, %xmm3 +; SSE42-NEXT: packusdw %xmm5, %xmm3 +; SSE42-NEXT: andpd %xmm0, %xmm9 +; SSE42-NEXT: andpd %xmm0, %xmm2 +; SSE42-NEXT: packusdw %xmm2, %xmm9 +; SSE42-NEXT: packusdw %xmm3, %xmm9 +; SSE42-NEXT: packuswb %xmm4, %xmm9 +; SSE42-NEXT: movdqa %xmm9, %xmm0 +; 
SSE42-NEXT: retq +; +; AVX1-LABEL: abd_ext_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; AVX1-NEXT: vpmovsxbw %xmm2, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] +; AVX1-NEXT: vpmovsxwq %xmm4, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[3,3,3,3] +; AVX1-NEXT: vpmovsxwq %xmm5, %xmm5 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,1,1] +; AVX1-NEXT: vpmovsxwq %xmm3, %xmm3 +; AVX1-NEXT: vpmovsxbq %xmm2, %xmm2 +; AVX1-NEXT: vpmovsxbw %xmm0, %xmm6 +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,3,2,3] +; AVX1-NEXT: vpmovsxwq %xmm7, %xmm7 +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,3,3,3] +; AVX1-NEXT: vpmovsxwq %xmm6, %xmm6 +; AVX1-NEXT: vpmovsxbq %xmm0, %xmm8 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxbq %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,3,2,3] +; AVX1-NEXT: vpmovsxbw %xmm9, %xmm10 +; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[2,3,2,3] +; AVX1-NEXT: vpmovsxwq %xmm11, %xmm11 +; AVX1-NEXT: vpsubq %xmm11, %xmm4, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[3,3,3,3] +; AVX1-NEXT: vpmovsxwq %xmm11, %xmm11 +; AVX1-NEXT: vpsubq %xmm11, %xmm5, %xmm5 +; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,1,1,1] +; AVX1-NEXT: vpmovsxwq %xmm10, %xmm10 +; AVX1-NEXT: vpsubq %xmm10, %xmm3, %xmm3 +; AVX1-NEXT: vpmovsxbq %xmm9, %xmm9 +; AVX1-NEXT: vpsubq %xmm9, %xmm2, %xmm2 +; AVX1-NEXT: vpmovsxbw %xmm1, %xmm9 +; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[2,3,2,3] +; AVX1-NEXT: vpmovsxwq %xmm10, %xmm10 +; AVX1-NEXT: vpsubq %xmm10, %xmm7, %xmm7 +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[3,3,3,3] +; AVX1-NEXT: vpmovsxwq %xmm9, %xmm9 +; AVX1-NEXT: vpsubq %xmm9, %xmm6, %xmm6 +; AVX1-NEXT: vpmovsxbq %xmm1, %xmm9 +; AVX1-NEXT: vpsubq %xmm9, %xmm8, %xmm8 +; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX1-NEXT: vpmovsxbq %xmm1, %xmm1 +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm8, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm9 +; AVX1-NEXT: vinsertf128 $1, %xmm3, 
%ymm2, %ymm10 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm11 +; AVX1-NEXT: vpxor %xmm12, %xmm12, %xmm12 +; AVX1-NEXT: vpsubq %xmm5, %xmm12, %xmm5 +; AVX1-NEXT: vpsubq %xmm4, %xmm12, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-NEXT: vblendvpd %ymm11, %ymm4, %ymm11, %ymm4 +; AVX1-NEXT: vpsubq %xmm2, %xmm12, %xmm2 +; AVX1-NEXT: vpsubq %xmm3, %xmm12, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT: vblendvpd %ymm10, %ymm2, %ymm10, %ymm2 +; AVX1-NEXT: vpsubq %xmm6, %xmm12, %xmm3 +; AVX1-NEXT: vpsubq %xmm7, %xmm12, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-NEXT: vblendvpd %ymm9, %ymm3, %ymm9, %ymm3 +; AVX1-NEXT: vpsubq %xmm0, %xmm12, %xmm0 +; AVX1-NEXT: vpsubq %xmm8, %xmm12, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 +; AVX1-NEXT: vblendvpd %ymm1, %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [255,255,255,255] +; AVX1-NEXT: vandpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vandpd %ymm1, %ymm3, %ymm3 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vandpd %ymm1, %ymm2, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandpd %ymm1, %ymm4, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: abd_ext_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovsxbw %xmm0, %ymm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] +; AVX2-NEXT: vpmovsxwq %xmm3, %ymm3 +; AVX2-NEXT: vpmovsxbq %xmm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,2,3] +; AVX2-NEXT: vpmovsxwq %xmm4, %ymm4 +; AVX2-NEXT: vpmovsxwq %xmm2, %ymm2 +; AVX2-NEXT: 
vpmovsxbw %xmm1, %ymm5 +; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3] +; AVX2-NEXT: vpmovsxwq %xmm6, %ymm6 +; AVX2-NEXT: vpsubq %ymm6, %ymm3, %ymm3 +; AVX2-NEXT: vpmovsxbq %xmm1, %ymm1 +; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] +; AVX2-NEXT: vpmovsxwq %xmm5, %ymm5 +; AVX2-NEXT: vpsubq %ymm5, %ymm4, %ymm4 +; AVX2-NEXT: vpmovsxwq %xmm1, %ymm1 +; AVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpsubq %ymm3, %ymm2, %ymm5 +; AVX2-NEXT: vblendvpd %ymm3, %ymm5, %ymm3, %ymm3 +; AVX2-NEXT: vpsubq %ymm0, %ymm2, %ymm5 +; AVX2-NEXT: vblendvpd %ymm0, %ymm5, %ymm0, %ymm0 +; AVX2-NEXT: vpsubq %ymm4, %ymm2, %ymm5 +; AVX2-NEXT: vblendvpd %ymm4, %ymm5, %ymm4, %ymm4 +; AVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm2 +; AVX2-NEXT: vblendvpd %ymm1, %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [255,255,255,255] +; AVX2-NEXT: vandpd %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vandpd %ymm2, %ymm4, %ymm4 +; AVX2-NEXT: vpackusdw %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vandpd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vandpd %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: abd_ext_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512-NEXT: vpmovsxwq %xmm0, %zmm2 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vpmovsxwq %xmm0, %zmm0 +; AVX512-NEXT: vpmovsxbw %xmm1, %ymm1 +; AVX512-NEXT: vpmovsxwq %xmm1, %zmm3 +; AVX512-NEXT: vpsubq %zmm3, %zmm2, %zmm2 +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-NEXT: vpmovsxwq %xmm1, %zmm1 +; AVX512-NEXT: vpsubq %zmm1, %zmm0, 
%zmm0 +; AVX512-NEXT: vpabsq %zmm2, %zmm1 +; AVX512-NEXT: vpabsq %zmm0, %zmm0 +; AVX512-NEXT: vpmovqb %zmm0, %xmm0 +; AVX512-NEXT: vpmovqb %zmm1, %xmm1 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %aext = sext <16 x i8> %a to <16 x i64> + %bext = sext <16 x i8> %b to <16 x i64> + %sub = sub <16 x i64> %aext, %bext + %abs = call <16 x i64> @llvm.abs.v16i64(<16 x i64> %sub, i1 false) + %trunc = trunc <16 x i64> %abs to <16 x i8> + ret <16 x i8> %trunc +} + +define <16 x i8> @abd_ext_v16i8_undef(<16 x i8> %a, <16 x i8> %b) nounwind { +; SSE2-LABEL: abd_ext_v16i8_undef: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: psrad $24, %xmm3 +; SSE2-NEXT: pxor %xmm9, %xmm9 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; SSE2-NEXT: psrad $24, %xmm5 +; SSE2-NEXT: pxor %xmm6, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm2[8],xmm7[9],xmm2[9],xmm7[10],xmm2[10],xmm7[11],xmm2[11],xmm7[12],xmm2[12],xmm7[13],xmm2[13],xmm7[14],xmm2[14],xmm7[15],xmm2[15] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; SSE2-NEXT: psrad $24, %xmm6 +; SSE2-NEXT: pxor %xmm8, %xmm8 +; 
SSE2-NEXT: pcmpgtd %xmm6, %xmm8 +; SSE2-NEXT: movdqa %xmm6, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm8[2],xmm6[3],xmm8[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; SSE2-NEXT: psrad $24, %xmm8 +; SSE2-NEXT: pxor %xmm10, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm10 +; SSE2-NEXT: movdqa %xmm8, %xmm7 +; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm10[2],xmm8[3],xmm10[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3],xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; SSE2-NEXT: psrad $24, %xmm11 +; SSE2-NEXT: pxor %xmm12, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm11, %xmm12 +; SSE2-NEXT: movdqa %xmm11, %xmm13 +; SSE2-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] +; SSE2-NEXT: psubq %xmm13, %xmm0 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; SSE2-NEXT: psubq %xmm11, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad $24, %xmm10 +; SSE2-NEXT: pxor %xmm11, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm11 +; SSE2-NEXT: movdqa %xmm10, %xmm12 +; SSE2-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] +; SSE2-NEXT: psubq %xmm12, %xmm4 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; SSE2-NEXT: psubq %xmm10, %xmm5 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; SSE2-NEXT: psrad $24, %xmm10 +; SSE2-NEXT: pxor %xmm11, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm11 +; SSE2-NEXT: movdqa 
%xmm10, %xmm12 +; SSE2-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] +; SSE2-NEXT: psubq %xmm12, %xmm2 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; SSE2-NEXT: psubq %xmm10, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad $24, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm9 +; SSE2-NEXT: movdqa %xmm1, %xmm10 +; SSE2-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; SSE2-NEXT: psubq %xmm10, %xmm7 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; SSE2-NEXT: psubq %xmm1, %xmm8 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: psubq %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm3 +; SSE2-NEXT: psubq %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: psubq %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm5, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm5 +; SSE2-NEXT: psubq %xmm1, %xmm5 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: psubq %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm6, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm6 +; SSE2-NEXT: psubq %xmm1, %xmm6 +; SSE2-NEXT: movdqa %xmm7, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm7 +; SSE2-NEXT: psubq %xmm1, %xmm7 +; SSE2-NEXT: movdqa %xmm8, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; 
SSE2-NEXT: pxor %xmm1, %xmm8 +; SSE2-NEXT: psubq %xmm1, %xmm8 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSE2-NEXT: pand %xmm1, %xmm8 +; SSE2-NEXT: pand %xmm1, %xmm7 +; SSE2-NEXT: packuswb %xmm8, %xmm7 +; SSE2-NEXT: pand %xmm1, %xmm6 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: packuswb %xmm6, %xmm2 +; SSE2-NEXT: packuswb %xmm7, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm5 +; SSE2-NEXT: pand %xmm1, %xmm4 +; SSE2-NEXT: packuswb %xmm5, %xmm4 +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm3, %xmm0 +; SSE2-NEXT: packuswb %xmm4, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE42-LABEL: abd_ext_v16i8_undef: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa %xmm0, %xmm2 +; SSE42-NEXT: psrld $16, %xmm2 +; SSE42-NEXT: pmovsxbq %xmm2, %xmm2 +; SSE42-NEXT: pmovsxbq %xmm0, %xmm9 +; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] +; SSE42-NEXT: pmovsxbq %xmm3, %xmm3 +; SSE42-NEXT: movdqa %xmm0, %xmm4 +; SSE42-NEXT: psrlq $48, %xmm4 +; SSE42-NEXT: pmovsxbq %xmm4, %xmm5 +; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; SSE42-NEXT: pmovsxbq %xmm4, %xmm4 +; SSE42-NEXT: movdqa %xmm0, %xmm6 +; SSE42-NEXT: psrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: pmovsxbq %xmm6, %xmm6 +; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] +; SSE42-NEXT: pmovsxbq %xmm7, %xmm7 +; SSE42-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: pmovsxbq %xmm0, %xmm8 +; SSE42-NEXT: movdqa %xmm1, %xmm0 +; SSE42-NEXT: psrld $16, %xmm0 +; SSE42-NEXT: pmovsxbq %xmm0, %xmm0 +; SSE42-NEXT: psubq %xmm0, %xmm2 +; SSE42-NEXT: pmovsxbq %xmm1, %xmm0 +; SSE42-NEXT: psubq %xmm0, %xmm9 +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE42-NEXT: pmovsxbq %xmm0, %xmm0 +; SSE42-NEXT: psubq %xmm0, %xmm3 +; SSE42-NEXT: movdqa %xmm1, %xmm0 +; SSE42-NEXT: psrlq $48, %xmm0 +; SSE42-NEXT: pmovsxbq 
%xmm0, %xmm0 +; SSE42-NEXT: psubq %xmm0, %xmm5 +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE42-NEXT: pmovsxbq %xmm0, %xmm0 +; SSE42-NEXT: psubq %xmm0, %xmm4 +; SSE42-NEXT: movdqa %xmm1, %xmm0 +; SSE42-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: pmovsxbq %xmm0, %xmm0 +; SSE42-NEXT: psubq %xmm0, %xmm6 +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] +; SSE42-NEXT: pmovsxbq %xmm0, %xmm0 +; SSE42-NEXT: psubq %xmm0, %xmm7 +; SSE42-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: pmovsxbq %xmm1, %xmm0 +; SSE42-NEXT: psubq %xmm0, %xmm8 +; SSE42-NEXT: pxor %xmm1, %xmm1 +; SSE42-NEXT: pxor %xmm10, %xmm10 +; SSE42-NEXT: psubq %xmm2, %xmm10 +; SSE42-NEXT: movdqa %xmm2, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm10, %xmm2 +; SSE42-NEXT: pxor %xmm10, %xmm10 +; SSE42-NEXT: psubq %xmm9, %xmm10 +; SSE42-NEXT: movdqa %xmm9, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm10, %xmm9 +; SSE42-NEXT: pxor %xmm10, %xmm10 +; SSE42-NEXT: psubq %xmm3, %xmm10 +; SSE42-NEXT: movdqa %xmm3, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm10, %xmm3 +; SSE42-NEXT: pxor %xmm10, %xmm10 +; SSE42-NEXT: psubq %xmm5, %xmm10 +; SSE42-NEXT: movdqa %xmm5, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm10, %xmm5 +; SSE42-NEXT: pxor %xmm10, %xmm10 +; SSE42-NEXT: psubq %xmm4, %xmm10 +; SSE42-NEXT: movdqa %xmm4, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm10, %xmm4 +; SSE42-NEXT: pxor %xmm10, %xmm10 +; SSE42-NEXT: psubq %xmm6, %xmm10 +; SSE42-NEXT: movdqa %xmm6, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm10, %xmm6 +; SSE42-NEXT: pxor %xmm10, %xmm10 +; SSE42-NEXT: psubq %xmm7, %xmm10 +; SSE42-NEXT: movdqa %xmm7, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm10, %xmm7 +; SSE42-NEXT: psubq %xmm8, %xmm1 +; SSE42-NEXT: movdqa %xmm8, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm8 +; SSE42-NEXT: movapd {{.*#+}} xmm0 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSE42-NEXT: andpd 
%xmm0, %xmm8 +; SSE42-NEXT: andpd %xmm0, %xmm7 +; SSE42-NEXT: packusdw %xmm8, %xmm7 +; SSE42-NEXT: andpd %xmm0, %xmm6 +; SSE42-NEXT: andpd %xmm0, %xmm4 +; SSE42-NEXT: packusdw %xmm6, %xmm4 +; SSE42-NEXT: packusdw %xmm7, %xmm4 +; SSE42-NEXT: andpd %xmm0, %xmm5 +; SSE42-NEXT: andpd %xmm0, %xmm3 +; SSE42-NEXT: packusdw %xmm5, %xmm3 +; SSE42-NEXT: andpd %xmm0, %xmm9 +; SSE42-NEXT: andpd %xmm0, %xmm2 +; SSE42-NEXT: packusdw %xmm2, %xmm9 +; SSE42-NEXT: packusdw %xmm3, %xmm9 +; SSE42-NEXT: packuswb %xmm4, %xmm9 +; SSE42-NEXT: movdqa %xmm9, %xmm0 +; SSE42-NEXT: retq +; +; AVX1-LABEL: abd_ext_v16i8_undef: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; AVX1-NEXT: vpmovsxbw %xmm2, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] +; AVX1-NEXT: vpmovsxwq %xmm4, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[3,3,3,3] +; AVX1-NEXT: vpmovsxwq %xmm5, %xmm5 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,1,1] +; AVX1-NEXT: vpmovsxwq %xmm3, %xmm3 +; AVX1-NEXT: vpmovsxbq %xmm2, %xmm2 +; AVX1-NEXT: vpmovsxbw %xmm0, %xmm6 +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,3,2,3] +; AVX1-NEXT: vpmovsxwq %xmm7, %xmm7 +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,3,3,3] +; AVX1-NEXT: vpmovsxwq %xmm6, %xmm6 +; AVX1-NEXT: vpmovsxbq %xmm0, %xmm8 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxbq %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,3,2,3] +; AVX1-NEXT: vpmovsxbw %xmm9, %xmm10 +; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[2,3,2,3] +; AVX1-NEXT: vpmovsxwq %xmm11, %xmm11 +; AVX1-NEXT: vpsubq %xmm11, %xmm4, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[3,3,3,3] +; AVX1-NEXT: vpmovsxwq %xmm11, %xmm11 +; AVX1-NEXT: vpsubq %xmm11, %xmm5, %xmm5 +; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,1,1,1] +; AVX1-NEXT: vpmovsxwq %xmm10, %xmm10 +; AVX1-NEXT: vpsubq %xmm10, %xmm3, %xmm3 +; AVX1-NEXT: vpmovsxbq %xmm9, %xmm9 +; AVX1-NEXT: vpsubq %xmm9, %xmm2, %xmm2 +; AVX1-NEXT: vpmovsxbw %xmm1, %xmm9 +; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = 
xmm9[2,3,2,3] +; AVX1-NEXT: vpmovsxwq %xmm10, %xmm10 +; AVX1-NEXT: vpsubq %xmm10, %xmm7, %xmm7 +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[3,3,3,3] +; AVX1-NEXT: vpmovsxwq %xmm9, %xmm9 +; AVX1-NEXT: vpsubq %xmm9, %xmm6, %xmm6 +; AVX1-NEXT: vpmovsxbq %xmm1, %xmm9 +; AVX1-NEXT: vpsubq %xmm9, %xmm8, %xmm8 +; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX1-NEXT: vpmovsxbq %xmm1, %xmm1 +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm8, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm9 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm10 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm11 +; AVX1-NEXT: vpxor %xmm12, %xmm12, %xmm12 +; AVX1-NEXT: vpsubq %xmm5, %xmm12, %xmm5 +; AVX1-NEXT: vpsubq %xmm4, %xmm12, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-NEXT: vblendvpd %ymm11, %ymm4, %ymm11, %ymm4 +; AVX1-NEXT: vpsubq %xmm2, %xmm12, %xmm2 +; AVX1-NEXT: vpsubq %xmm3, %xmm12, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT: vblendvpd %ymm10, %ymm2, %ymm10, %ymm2 +; AVX1-NEXT: vpsubq %xmm6, %xmm12, %xmm3 +; AVX1-NEXT: vpsubq %xmm7, %xmm12, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-NEXT: vblendvpd %ymm9, %ymm3, %ymm9, %ymm3 +; AVX1-NEXT: vpsubq %xmm0, %xmm12, %xmm0 +; AVX1-NEXT: vpsubq %xmm8, %xmm12, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 +; AVX1-NEXT: vblendvpd %ymm1, %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [255,255,255,255] +; AVX1-NEXT: vandpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vandpd %ymm1, %ymm3, %ymm3 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vandpd %ymm1, %ymm2, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandpd %ymm1, %ymm4, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: 
vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: abd_ext_v16i8_undef: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovsxbw %xmm0, %ymm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] +; AVX2-NEXT: vpmovsxwq %xmm3, %ymm3 +; AVX2-NEXT: vpmovsxbq %xmm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,2,3] +; AVX2-NEXT: vpmovsxwq %xmm4, %ymm4 +; AVX2-NEXT: vpmovsxwq %xmm2, %ymm2 +; AVX2-NEXT: vpmovsxbw %xmm1, %ymm5 +; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3] +; AVX2-NEXT: vpmovsxwq %xmm6, %ymm6 +; AVX2-NEXT: vpsubq %ymm6, %ymm3, %ymm3 +; AVX2-NEXT: vpmovsxbq %xmm1, %ymm1 +; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] +; AVX2-NEXT: vpmovsxwq %xmm5, %ymm5 +; AVX2-NEXT: vpsubq %ymm5, %ymm4, %ymm4 +; AVX2-NEXT: vpmovsxwq %xmm1, %ymm1 +; AVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpsubq %ymm3, %ymm2, %ymm5 +; AVX2-NEXT: vblendvpd %ymm3, %ymm5, %ymm3, %ymm3 +; AVX2-NEXT: vpsubq %ymm0, %ymm2, %ymm5 +; AVX2-NEXT: vblendvpd %ymm0, %ymm5, %ymm0, %ymm0 +; AVX2-NEXT: vpsubq %ymm4, %ymm2, %ymm5 +; AVX2-NEXT: vblendvpd %ymm4, %ymm5, %ymm4, %ymm4 +; AVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm2 +; AVX2-NEXT: vblendvpd %ymm1, %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [255,255,255,255] +; AVX2-NEXT: vandpd %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vandpd %ymm2, %ymm4, %ymm4 +; AVX2-NEXT: vpackusdw %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vandpd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vandpd %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, 
%xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: abd_ext_v16i8_undef: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512-NEXT: vpmovsxwq %xmm0, %zmm2 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vpmovsxwq %xmm0, %zmm0 +; AVX512-NEXT: vpmovsxbw %xmm1, %ymm1 +; AVX512-NEXT: vpmovsxwq %xmm1, %zmm3 +; AVX512-NEXT: vpsubq %zmm3, %zmm2, %zmm2 +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-NEXT: vpmovsxwq %xmm1, %zmm1 +; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpabsq %zmm2, %zmm1 +; AVX512-NEXT: vpabsq %zmm0, %zmm0 +; AVX512-NEXT: vpmovqb %zmm0, %xmm0 +; AVX512-NEXT: vpmovqb %zmm1, %xmm1 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %aext = sext <16 x i8> %a to <16 x i64> + %bext = sext <16 x i8> %b to <16 x i64> + %sub = sub <16 x i64> %aext, %bext + %abs = call <16 x i64> @llvm.abs.v16i64(<16 x i64> %sub, i1 true) + %trunc = trunc <16 x i64> %abs to <16 x i8> + ret <16 x i8> %trunc +} + +define <8 x i16> @abd_ext_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { +; SSE2-LABEL: abd_ext_v8i16: +; SSE2: # %bb.0: +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: pxor %xmm6, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 
= xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] +; SSE2-NEXT: psrad $16, %xmm6 +; SSE2-NEXT: pxor %xmm7, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE2-NEXT: movdqa %xmm6, %xmm8 +; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; SSE2-NEXT: psubq %xmm8, %xmm3 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; SSE2-NEXT: psubq %xmm6, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm5 +; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; SSE2-NEXT: psubq %xmm6, %xmm4 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE2-NEXT: psubq %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm3 +; SSE2-NEXT: psubq %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: psubq %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: psubq %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: psubq %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,1,0,2,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = 
xmm0[0,1,0,2,4,5,6,7] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE2-NEXT: retq +; +; SSE42-LABEL: abd_ext_v8i16: +; SSE42: # %bb.0: +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; SSE42-NEXT: pmovsxwq %xmm2, %xmm2 +; SSE42-NEXT: pmovsxwq %xmm0, %xmm5 +; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; SSE42-NEXT: pmovsxwq %xmm3, %xmm3 +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE42-NEXT: pmovsxwq %xmm0, %xmm4 +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE42-NEXT: pmovsxwq %xmm0, %xmm0 +; SSE42-NEXT: psubq %xmm0, %xmm2 +; SSE42-NEXT: pmovsxwq %xmm1, %xmm0 +; SSE42-NEXT: psubq %xmm0, %xmm5 +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE42-NEXT: pmovsxwq %xmm0, %xmm0 +; SSE42-NEXT: psubq %xmm0, %xmm3 +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] +; SSE42-NEXT: pmovsxwq %xmm0, %xmm0 +; SSE42-NEXT: psubq %xmm0, %xmm4 +; SSE42-NEXT: pxor %xmm1, %xmm1 +; SSE42-NEXT: pxor %xmm6, %xmm6 +; SSE42-NEXT: psubq %xmm2, %xmm6 +; SSE42-NEXT: movdqa %xmm2, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm2 +; SSE42-NEXT: pxor %xmm6, %xmm6 +; SSE42-NEXT: psubq %xmm5, %xmm6 +; SSE42-NEXT: movdqa %xmm5, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm5 +; SSE42-NEXT: pxor %xmm6, %xmm6 +; SSE42-NEXT: psubq %xmm3, %xmm6 +; SSE42-NEXT: movdqa %xmm3, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm3 +; SSE42-NEXT: pxor %xmm6, %xmm6 +; SSE42-NEXT: psubq %xmm4, %xmm6 +; SSE42-NEXT: movdqa %xmm4, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm4 +; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0],xmm1[1,2,3],xmm4[4],xmm1[5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm1[1,2,3],xmm3[4],xmm1[5,6,7] +; SSE42-NEXT: packusdw %xmm4, %xmm3 +; SSE42-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0],xmm1[1,2,3],xmm5[4],xmm1[5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1,2,3],xmm2[4],xmm1[5,6,7] +; SSE42-NEXT: packusdw %xmm2, %xmm5 +; SSE42-NEXT: packusdw %xmm3, 
%xmm5 +; SSE42-NEXT: movdqa %xmm5, %xmm0 +; SSE42-NEXT: retq +; +; AVX1-LABEL: abd_ext_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; AVX1-NEXT: vpmovsxwq %xmm2, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX1-NEXT: vpmovsxwq %xmm3, %xmm3 +; AVX1-NEXT: vpmovsxwq %xmm0, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX1-NEXT: vpmovsxwq %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] +; AVX1-NEXT: vpmovsxwq %xmm5, %xmm5 +; AVX1-NEXT: vpsubq %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[3,3,3,3] +; AVX1-NEXT: vpmovsxwq %xmm5, %xmm5 +; AVX1-NEXT: vpsubq %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpmovsxwq %xmm1, %xmm5 +; AVX1-NEXT: vpsubq %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX1-NEXT: vpmovsxwq %xmm1, %xmm1 +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm5 +; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX1-NEXT: vpsubq %xmm3, %xmm6, %xmm3 +; AVX1-NEXT: vpsubq %xmm2, %xmm6, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT: vblendvpd %ymm5, %ymm2, %ymm5, %ymm2 +; AVX1-NEXT: vpsubq %xmm0, %xmm6, %xmm0 +; AVX1-NEXT: vpsubq %xmm4, %xmm6, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-NEXT: vblendvpd %ymm1, %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [65535,65535,65535,65535] +; AVX1-NEXT: vandpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vandpd %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: abd_ext_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmovsxwq %xmm2, %ymm2 +; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0 +; AVX2-NEXT: vpshufd 
{{.*#+}} xmm3 = xmm1[2,3,2,3] +; AVX2-NEXT: vpmovsxwq %xmm3, %ymm3 +; AVX2-NEXT: vpsubq %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpmovsxwq %xmm1, %ymm1 +; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsubq %ymm2, %ymm1, %ymm3 +; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm3 +; AVX2-NEXT: vblendvpd %ymm0, %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] +; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: abd_ext_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxwq %xmm0, %zmm0 +; AVX512-NEXT: vpmovsxwq %xmm1, %zmm1 +; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpabsq %zmm0, %zmm0 +; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %aext = sext <8 x i16> %a to <8 x i64> + %bext = sext <8 x i16> %b to <8 x i64> + %sub = sub <8 x i64> %aext, %bext + %abs = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %sub, i1 false) + %trunc = trunc <8 x i64> %abs to <8 x i16> + ret <8 x i16> %trunc +} + +define <8 x i16> @abd_ext_v8i16_undef(<8 x i16> %a, <8 x i16> %b) nounwind { +; SSE2-LABEL: abd_ext_v8i16_undef: +; SSE2: # %bb.0: +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-NEXT: 
punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: pxor %xmm6, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] +; SSE2-NEXT: psrad $16, %xmm6 +; SSE2-NEXT: pxor %xmm7, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE2-NEXT: movdqa %xmm6, %xmm8 +; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; SSE2-NEXT: psubq %xmm8, %xmm3 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; SSE2-NEXT: psubq %xmm6, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm5 +; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; SSE2-NEXT: psubq %xmm6, %xmm4 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE2-NEXT: psubq %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm3 +; SSE2-NEXT: psubq %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: psubq %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: psubq %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: psubq %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufd 
{{.*#+}} xmm1 = xmm4[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,1,0,2,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE2-NEXT: retq +; +; SSE42-LABEL: abd_ext_v8i16_undef: +; SSE42: # %bb.0: +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; SSE42-NEXT: pmovsxwq %xmm2, %xmm2 +; SSE42-NEXT: pmovsxwq %xmm0, %xmm5 +; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; SSE42-NEXT: pmovsxwq %xmm3, %xmm3 +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE42-NEXT: pmovsxwq %xmm0, %xmm4 +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE42-NEXT: pmovsxwq %xmm0, %xmm0 +; SSE42-NEXT: psubq %xmm0, %xmm2 +; SSE42-NEXT: pmovsxwq %xmm1, %xmm0 +; SSE42-NEXT: psubq %xmm0, %xmm5 +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE42-NEXT: pmovsxwq %xmm0, %xmm0 +; SSE42-NEXT: psubq %xmm0, %xmm3 +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] +; SSE42-NEXT: pmovsxwq %xmm0, %xmm0 +; SSE42-NEXT: psubq %xmm0, %xmm4 +; SSE42-NEXT: pxor %xmm1, %xmm1 +; SSE42-NEXT: pxor %xmm6, %xmm6 +; SSE42-NEXT: psubq %xmm2, %xmm6 +; SSE42-NEXT: movdqa %xmm2, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm2 +; SSE42-NEXT: pxor %xmm6, %xmm6 +; SSE42-NEXT: psubq %xmm5, %xmm6 +; SSE42-NEXT: movdqa %xmm5, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm5 +; SSE42-NEXT: pxor %xmm6, %xmm6 +; SSE42-NEXT: psubq %xmm3, %xmm6 +; SSE42-NEXT: movdqa %xmm3, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm3 +; SSE42-NEXT: pxor %xmm6, %xmm6 +; SSE42-NEXT: psubq %xmm4, %xmm6 +; SSE42-NEXT: movdqa %xmm4, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm4 +; SSE42-NEXT: pblendw {{.*#+}} xmm4 = 
xmm4[0],xmm1[1,2,3],xmm4[4],xmm1[5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm1[1,2,3],xmm3[4],xmm1[5,6,7] +; SSE42-NEXT: packusdw %xmm4, %xmm3 +; SSE42-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0],xmm1[1,2,3],xmm5[4],xmm1[5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1,2,3],xmm2[4],xmm1[5,6,7] +; SSE42-NEXT: packusdw %xmm2, %xmm5 +; SSE42-NEXT: packusdw %xmm3, %xmm5 +; SSE42-NEXT: movdqa %xmm5, %xmm0 +; SSE42-NEXT: retq +; +; AVX1-LABEL: abd_ext_v8i16_undef: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; AVX1-NEXT: vpmovsxwq %xmm2, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX1-NEXT: vpmovsxwq %xmm3, %xmm3 +; AVX1-NEXT: vpmovsxwq %xmm0, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX1-NEXT: vpmovsxwq %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] +; AVX1-NEXT: vpmovsxwq %xmm5, %xmm5 +; AVX1-NEXT: vpsubq %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[3,3,3,3] +; AVX1-NEXT: vpmovsxwq %xmm5, %xmm5 +; AVX1-NEXT: vpsubq %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpmovsxwq %xmm1, %xmm5 +; AVX1-NEXT: vpsubq %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX1-NEXT: vpmovsxwq %xmm1, %xmm1 +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm5 +; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX1-NEXT: vpsubq %xmm3, %xmm6, %xmm3 +; AVX1-NEXT: vpsubq %xmm2, %xmm6, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT: vblendvpd %ymm5, %ymm2, %ymm5, %ymm2 +; AVX1-NEXT: vpsubq %xmm0, %xmm6, %xmm0 +; AVX1-NEXT: vpsubq %xmm4, %xmm6, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-NEXT: vblendvpd %ymm1, %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [65535,65535,65535,65535] +; AVX1-NEXT: vandpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vandpd %ymm1, 
%ymm2, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: abd_ext_v8i16_undef: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmovsxwq %xmm2, %ymm2 +; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; AVX2-NEXT: vpmovsxwq %xmm3, %ymm3 +; AVX2-NEXT: vpsubq %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpmovsxwq %xmm1, %ymm1 +; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsubq %ymm2, %ymm1, %ymm3 +; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm3 +; AVX2-NEXT: vblendvpd %ymm0, %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] +; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: abd_ext_v8i16_undef: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxwq %xmm0, %zmm0 +; AVX512-NEXT: vpmovsxwq %xmm1, %zmm1 +; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpabsq %zmm0, %zmm0 +; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %aext = sext <8 x i16> %a to <8 x i64> + %bext = sext <8 x i16> %b to <8 x i64> + %sub = sub <8 x i64> %aext, %bext + %abs = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %sub, i1 true) + %trunc = trunc <8 x i64> %abs to <8 x i16> + ret <8 x i16> %trunc +} + +define <4 x i32> @abd_ext_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { +; SSE2-LABEL: abd_ext_v4i32: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm3, %xmm3 +; 
SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE2-NEXT: psubq %xmm1, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm3 +; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE2-NEXT: psubq %xmm5, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: psubq %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: psubq %xmm1, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; SSE2-NEXT: retq +; +; SSE42-LABEL: abd_ext_v4i32: +; SSE42: # %bb.0: +; SSE42-NEXT: pmovsxdq %xmm0, %xmm2 +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE42-NEXT: pmovsxdq %xmm0, %xmm3 +; SSE42-NEXT: pmovsxdq %xmm1, %xmm0 +; SSE42-NEXT: psubq %xmm0, %xmm2 +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE42-NEXT: pmovsxdq %xmm0, %xmm0 +; SSE42-NEXT: psubq %xmm0, %xmm3 +; SSE42-NEXT: pxor %xmm1, %xmm1 +; SSE42-NEXT: pxor %xmm4, %xmm4 +; SSE42-NEXT: psubq %xmm2, %xmm4 +; SSE42-NEXT: movdqa %xmm2, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2 +; SSE42-NEXT: psubq %xmm3, %xmm1 +; SSE42-NEXT: movdqa %xmm3, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3 +; SSE42-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] +; SSE42-NEXT: movaps %xmm2, %xmm0 +; SSE42-NEXT: retq +; +; AVX1-LABEL: abd_ext_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm2 +; 
AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxdq %xmm1, %xmm3 +; AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1 +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpsubq %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT: vblendvpd %ymm1, %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: abd_ext_v4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 +; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1 +; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm1 +; AVX2-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: abd_ext_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxdq %xmm0, %ymm0 +; AVX512-NEXT: vpmovsxdq %xmm1, %ymm1 +; AVX512-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpabsq %ymm0, %ymm0 +; AVX512-NEXT: vpmovqd %ymm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %aext = sext <4 x i32> %a to <4 x i64> + %bext = sext <4 x i32> %b to <4 x i64> + %sub = sub <4 x i64> %aext, %bext + %abs = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %sub, i1 false) + %trunc = trunc <4 x i64> %abs to <4 x i32> + ret <4 x i32> %trunc +} + +define <4 x i32> @abd_ext_v4i32_undef(<4 x i32> %a, <4 x i32> %b) nounwind { +; SSE2-LABEL: abd_ext_v4i32_undef: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; 
SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE2-NEXT: psubq %xmm1, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm3 +; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE2-NEXT: psubq %xmm5, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: psubq %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: psubq %xmm1, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; SSE2-NEXT: retq +; +; SSE42-LABEL: abd_ext_v4i32_undef: +; SSE42: # %bb.0: +; SSE42-NEXT: pmovsxdq %xmm0, %xmm2 +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE42-NEXT: pmovsxdq %xmm0, %xmm3 +; SSE42-NEXT: pmovsxdq %xmm1, %xmm0 +; SSE42-NEXT: psubq %xmm0, %xmm2 +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE42-NEXT: pmovsxdq %xmm0, %xmm0 +; SSE42-NEXT: psubq %xmm0, %xmm3 +; SSE42-NEXT: pxor %xmm1, %xmm1 +; SSE42-NEXT: pxor %xmm4, %xmm4 +; SSE42-NEXT: psubq %xmm2, %xmm4 +; SSE42-NEXT: movdqa %xmm2, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2 +; SSE42-NEXT: psubq %xmm3, %xmm1 +; SSE42-NEXT: movdqa %xmm3, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3 +; SSE42-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] +; SSE42-NEXT: movaps %xmm2, %xmm0 +; SSE42-NEXT: retq +; +; AVX1-LABEL: abd_ext_v4i32_undef: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxdq 
%xmm1, %xmm3 +; AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1 +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpsubq %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT: vblendvpd %ymm1, %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: abd_ext_v4i32_undef: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 +; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1 +; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm1 +; AVX2-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: abd_ext_v4i32_undef: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxdq %xmm0, %ymm0 +; AVX512-NEXT: vpmovsxdq %xmm1, %ymm1 +; AVX512-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpabsq %ymm0, %ymm0 +; AVX512-NEXT: vpmovqd %ymm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %aext = sext <4 x i32> %a to <4 x i64> + %bext = sext <4 x i32> %b to <4 x i64> + %sub = sub <4 x i64> %aext, %bext + %abs = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %sub, i1 true) + %trunc = trunc <4 x i64> %abs to <4 x i32> + ret <4 x i32> %trunc +} + +define <2 x i64> @abd_ext_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { +; SSE2-LABEL: abd_ext_v2i64: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: sarq $63, %rcx +; SSE2-NEXT: movq %xmm0, %rdx +; SSE2-NEXT: movq %rdx, %rsi +; SSE2-NEXT: sarq $63, %rsi +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = 
xmm1[2,3,2,3] +; SSE2-NEXT: movq %xmm0, %rdi +; SSE2-NEXT: movq %rdi, %r8 +; SSE2-NEXT: sarq $63, %r8 +; SSE2-NEXT: movq %xmm1, %r9 +; SSE2-NEXT: movq %r9, %r10 +; SSE2-NEXT: sarq $63, %r10 +; SSE2-NEXT: subq %r9, %rdx +; SSE2-NEXT: sbbq %r10, %rsi +; SSE2-NEXT: subq %rdi, %rax +; SSE2-NEXT: sbbq %r8, %rcx +; SSE2-NEXT: sarq $63, %rcx +; SSE2-NEXT: xorq %rcx, %rax +; SSE2-NEXT: subq %rcx, %rax +; SSE2-NEXT: sarq $63, %rsi +; SSE2-NEXT: xorq %rsi, %rdx +; SSE2-NEXT: subq %rsi, %rdx +; SSE2-NEXT: movq %rdx, %xmm0 +; SSE2-NEXT: movq %rax, %xmm1 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: retq +; +; SSE42-LABEL: abd_ext_v2i64: +; SSE42: # %bb.0: +; SSE42-NEXT: movq %xmm0, %rax +; SSE42-NEXT: movq %rax, %rcx +; SSE42-NEXT: sarq $63, %rcx +; SSE42-NEXT: pextrq $1, %xmm0, %rdx +; SSE42-NEXT: movq %rdx, %rsi +; SSE42-NEXT: sarq $63, %rsi +; SSE42-NEXT: movq %xmm1, %rdi +; SSE42-NEXT: movq %rdi, %r8 +; SSE42-NEXT: sarq $63, %r8 +; SSE42-NEXT: pextrq $1, %xmm1, %r9 +; SSE42-NEXT: movq %r9, %r10 +; SSE42-NEXT: sarq $63, %r10 +; SSE42-NEXT: subq %r9, %rdx +; SSE42-NEXT: sbbq %r10, %rsi +; SSE42-NEXT: subq %rdi, %rax +; SSE42-NEXT: sbbq %r8, %rcx +; SSE42-NEXT: sarq $63, %rcx +; SSE42-NEXT: xorq %rcx, %rax +; SSE42-NEXT: subq %rcx, %rax +; SSE42-NEXT: sarq $63, %rsi +; SSE42-NEXT: xorq %rsi, %rdx +; SSE42-NEXT: subq %rsi, %rdx +; SSE42-NEXT: movq %rdx, %xmm1 +; SSE42-NEXT: movq %rax, %xmm0 +; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE42-NEXT: retq +; +; AVX-LABEL: abd_ext_v2i64: +; AVX: # %bb.0: +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: movq %rax, %rcx +; AVX-NEXT: sarq $63, %rcx +; AVX-NEXT: vpextrq $1, %xmm0, %rdx +; AVX-NEXT: movq %rdx, %rsi +; AVX-NEXT: sarq $63, %rsi +; AVX-NEXT: vmovq %xmm1, %rdi +; AVX-NEXT: movq %rdi, %r8 +; AVX-NEXT: sarq $63, %r8 +; AVX-NEXT: vpextrq $1, %xmm1, %r9 +; AVX-NEXT: movq %r9, %r10 +; AVX-NEXT: sarq $63, %r10 +; AVX-NEXT: subq %r9, %rdx +; AVX-NEXT: sbbq %r10, %rsi +; AVX-NEXT: subq %rdi, %rax 
+; AVX-NEXT: sbbq %r8, %rcx +; AVX-NEXT: sarq $63, %rcx +; AVX-NEXT: xorq %rcx, %rax +; AVX-NEXT: subq %rcx, %rax +; AVX-NEXT: sarq $63, %rsi +; AVX-NEXT: xorq %rsi, %rdx +; AVX-NEXT: subq %rsi, %rdx +; AVX-NEXT: vmovq %rdx, %xmm0 +; AVX-NEXT: vmovq %rax, %xmm1 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX-NEXT: retq + %aext = sext <2 x i64> %a to <2 x i128> + %bext = sext <2 x i64> %b to <2 x i128> + %sub = sub <2 x i128> %aext, %bext + %abs = call <2 x i128> @llvm.abs.v2i128(<2 x i128> %sub, i1 false) + %trunc = trunc <2 x i128> %abs to <2 x i64> + ret <2 x i64> %trunc +} + +define <2 x i64> @abd_ext_v2i64_undef(<2 x i64> %a, <2 x i64> %b) nounwind { +; SSE2-LABEL: abd_ext_v2i64_undef: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: sarq $63, %rcx +; SSE2-NEXT: movq %xmm0, %rdx +; SSE2-NEXT: movq %rdx, %rsi +; SSE2-NEXT: sarq $63, %rsi +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE2-NEXT: movq %xmm0, %rdi +; SSE2-NEXT: movq %rdi, %r8 +; SSE2-NEXT: sarq $63, %r8 +; SSE2-NEXT: movq %xmm1, %r9 +; SSE2-NEXT: movq %r9, %r10 +; SSE2-NEXT: sarq $63, %r10 +; SSE2-NEXT: subq %r9, %rdx +; SSE2-NEXT: sbbq %r10, %rsi +; SSE2-NEXT: subq %rdi, %rax +; SSE2-NEXT: sbbq %r8, %rcx +; SSE2-NEXT: sarq $63, %rcx +; SSE2-NEXT: xorq %rcx, %rax +; SSE2-NEXT: subq %rcx, %rax +; SSE2-NEXT: sarq $63, %rsi +; SSE2-NEXT: xorq %rsi, %rdx +; SSE2-NEXT: subq %rsi, %rdx +; SSE2-NEXT: movq %rdx, %xmm0 +; SSE2-NEXT: movq %rax, %xmm1 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: retq +; +; SSE42-LABEL: abd_ext_v2i64_undef: +; SSE42: # %bb.0: +; SSE42-NEXT: movq %xmm0, %rax +; SSE42-NEXT: movq %rax, %rcx +; SSE42-NEXT: sarq $63, %rcx +; SSE42-NEXT: pextrq $1, %xmm0, %rdx +; SSE42-NEXT: movq %rdx, %rsi +; SSE42-NEXT: sarq $63, %rsi +; SSE42-NEXT: movq %xmm1, %rdi +; SSE42-NEXT: movq %rdi, %r8 +; SSE42-NEXT: sarq $63, %r8 +; SSE42-NEXT: pextrq $1, %xmm1, 
%r9 +; SSE42-NEXT: movq %r9, %r10 +; SSE42-NEXT: sarq $63, %r10 +; SSE42-NEXT: subq %r9, %rdx +; SSE42-NEXT: sbbq %r10, %rsi +; SSE42-NEXT: subq %rdi, %rax +; SSE42-NEXT: sbbq %r8, %rcx +; SSE42-NEXT: sarq $63, %rcx +; SSE42-NEXT: xorq %rcx, %rax +; SSE42-NEXT: subq %rcx, %rax +; SSE42-NEXT: sarq $63, %rsi +; SSE42-NEXT: xorq %rsi, %rdx +; SSE42-NEXT: subq %rsi, %rdx +; SSE42-NEXT: movq %rdx, %xmm1 +; SSE42-NEXT: movq %rax, %xmm0 +; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE42-NEXT: retq +; +; AVX-LABEL: abd_ext_v2i64_undef: +; AVX: # %bb.0: +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: movq %rax, %rcx +; AVX-NEXT: sarq $63, %rcx +; AVX-NEXT: vpextrq $1, %xmm0, %rdx +; AVX-NEXT: movq %rdx, %rsi +; AVX-NEXT: sarq $63, %rsi +; AVX-NEXT: vmovq %xmm1, %rdi +; AVX-NEXT: movq %rdi, %r8 +; AVX-NEXT: sarq $63, %r8 +; AVX-NEXT: vpextrq $1, %xmm1, %r9 +; AVX-NEXT: movq %r9, %r10 +; AVX-NEXT: sarq $63, %r10 +; AVX-NEXT: subq %r9, %rdx +; AVX-NEXT: sbbq %r10, %rsi +; AVX-NEXT: subq %rdi, %rax +; AVX-NEXT: sbbq %r8, %rcx +; AVX-NEXT: sarq $63, %rcx +; AVX-NEXT: xorq %rcx, %rax +; AVX-NEXT: subq %rcx, %rax +; AVX-NEXT: sarq $63, %rsi +; AVX-NEXT: xorq %rsi, %rdx +; AVX-NEXT: subq %rsi, %rdx +; AVX-NEXT: vmovq %rdx, %xmm0 +; AVX-NEXT: vmovq %rax, %xmm1 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX-NEXT: retq + %aext = sext <2 x i64> %a to <2 x i128> + %bext = sext <2 x i64> %b to <2 x i128> + %sub = sub <2 x i128> %aext, %bext + %abs = call <2 x i128> @llvm.abs.v2i128(<2 x i128> %sub, i1 true) + %trunc = trunc <2 x i128> %abs to <2 x i64> + ret <2 x i64> %trunc +} + +; +; sub(smax(a,b),smin(a,b)) -> abds(a,b) +; + +define <16 x i8> @abd_minmax_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { +; SSE2-LABEL: abd_minmax_v16i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; 
SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm3 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: psubb %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE42-LABEL: abd_minmax_v16i8: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa %xmm0, %xmm2 +; SSE42-NEXT: pminsb %xmm1, %xmm2 +; SSE42-NEXT: pmaxsb %xmm1, %xmm0 +; SSE42-NEXT: psubb %xmm2, %xmm0 +; SSE42-NEXT: retq +; +; AVX-LABEL: abd_minmax_v16i8: +; AVX: # %bb.0: +; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsubb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: retq + %min = call <16 x i8> @llvm.smin.v16i8(<16 x i8> %a, <16 x i8> %b) + %max = call <16 x i8> @llvm.smax.v16i8(<16 x i8> %a, <16 x i8> %b) + %sub = sub <16 x i8> %max, %min + ret <16 x i8> %sub +} + +define <8 x i16> @abd_minmax_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { +; SSE-LABEL: abd_minmax_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pminsw %xmm1, %xmm2 +; SSE-NEXT: pmaxsw %xmm1, %xmm0 +; SSE-NEXT: psubw %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: abd_minmax_v8i16: +; AVX: # %bb.0: +; AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsubw %xmm2, %xmm0, %xmm0 +; AVX-NEXT: retq + %min = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %a, <8 x i16> %b) + %max = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %a, <8 x i16> %b) + %sub = sub <8 x i16> %max, %min + ret <8 x i16> %sub +} + +define <4 x i32> @abd_minmax_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { +; SSE2-LABEL: abd_minmax_v4i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm3 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: psubd %xmm2, 
%xmm0 +; SSE2-NEXT: retq +; +; SSE42-LABEL: abd_minmax_v4i32: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa %xmm0, %xmm2 +; SSE42-NEXT: pminsd %xmm1, %xmm2 +; SSE42-NEXT: pmaxsd %xmm1, %xmm0 +; SSE42-NEXT: psubd %xmm2, %xmm0 +; SSE42-NEXT: retq +; +; AVX-LABEL: abd_minmax_v4i32: +; AVX: # %bb.0: +; AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsubd %xmm2, %xmm0, %xmm0 +; AVX-NEXT: retq + %min = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %a, <4 x i32> %b) + %max = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %a, <4 x i32> %b) + %sub = sub <4 x i32> %max, %min + ret <4 x i32> %sub +} + +define <2 x i64> @abd_minmax_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { +; SSE2-LABEL: abd_minmax_v2i64: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: movdqa %xmm3, %xmm6 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm4 +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: pandn %xmm1, %xmm4 +; SSE2-NEXT: por %xmm5, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; SSE2-NEXT: pand %xmm6, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm3 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: psubq %xmm4, %xmm0 +; SSE2-NEXT: retq +; +; SSE42-LABEL: abd_minmax_v2i64: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa %xmm0, %xmm2 +; SSE42-NEXT: pcmpgtq %xmm1, %xmm0 +; SSE42-NEXT: movdqa %xmm2, %xmm3 +; SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3 +; SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; 
SSE42-NEXT: psubq %xmm3, %xmm1 +; SSE42-NEXT: movdqa %xmm1, %xmm0 +; SSE42-NEXT: retq +; +; AVX1-LABEL: abd_minmax_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm3 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpsubq %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: abd_minmax_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm3 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpsubq %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: abd_minmax_v2i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpminsq %xmm1, %xmm0, %xmm2 +; AVX512-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsubq %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: retq + %min = call <2 x i64> @llvm.smin.v2i64(<2 x i64> %a, <2 x i64> %b) + %max = call <2 x i64> @llvm.smax.v2i64(<2 x i64> %a, <2 x i64> %b) + %sub = sub <2 x i64> %max, %min + ret <2 x i64> %sub +} + +; +; abs(sub_nsw(a, b)) -> abds(a,b) +; + +define <16 x i8> @abd_subnsw_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { +; SSE2-LABEL: abd_subnsw_v16i8: +; SSE2: # %bb.0: +; SSE2-NEXT: psubb %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psubb %xmm0, %xmm1 +; SSE2-NEXT: pminub %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE42-LABEL: abd_subnsw_v16i8: +; SSE42: # %bb.0: +; SSE42-NEXT: psubb %xmm1, %xmm0 +; SSE42-NEXT: pabsb %xmm0, %xmm0 +; SSE42-NEXT: retq +; +; AVX-LABEL: abd_subnsw_v16i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpabsb %xmm0, %xmm0 +; AVX-NEXT: retq + %sub = sub nsw <16 x i8> %a, %b + %abs = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %sub, i1 false) + ret <16 x i8> %abs +} + +define <8 x i16> @abd_subnsw_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { +; SSE2-LABEL: abd_subnsw_v8i16: +; SSE2: # %bb.0: +; SSE2-NEXT: psubw %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psubw %xmm0, %xmm1 +; SSE2-NEXT: pmaxsw %xmm1, 
%xmm0 +; SSE2-NEXT: retq +; +; SSE42-LABEL: abd_subnsw_v8i16: +; SSE42: # %bb.0: +; SSE42-NEXT: psubw %xmm1, %xmm0 +; SSE42-NEXT: pabsw %xmm0, %xmm0 +; SSE42-NEXT: retq +; +; AVX-LABEL: abd_subnsw_v8i16: +; AVX: # %bb.0: +; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpabsw %xmm0, %xmm0 +; AVX-NEXT: retq + %sub = sub nsw <8 x i16> %a, %b + %abs = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %sub, i1 false) + ret <8 x i16> %abs +} + +define <4 x i32> @abd_subnsw_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { +; SSE2-LABEL: abd_subnsw_v4i32: +; SSE2: # %bb.0: +; SSE2-NEXT: psubd %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: psubd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE42-LABEL: abd_subnsw_v4i32: +; SSE42: # %bb.0: +; SSE42-NEXT: psubd %xmm1, %xmm0 +; SSE42-NEXT: pabsd %xmm0, %xmm0 +; SSE42-NEXT: retq +; +; AVX-LABEL: abd_subnsw_v4i32: +; AVX: # %bb.0: +; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpabsd %xmm0, %xmm0 +; AVX-NEXT: retq + %sub = sub nsw <4 x i32> %a, %b + %abs = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %sub, i1 false) + ret <4 x i32> %abs +} + +define <2 x i64> @abd_subnsw_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { +; SSE2-LABEL: abd_subnsw_v2i64: +; SSE2: # %bb.0: +; SSE2-NEXT: psubq %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: psubq %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE42-LABEL: abd_subnsw_v2i64: +; SSE42: # %bb.0: +; SSE42-NEXT: psubq %xmm1, %xmm0 +; SSE42-NEXT: pxor %xmm1, %xmm1 +; SSE42-NEXT: psubq %xmm0, %xmm1 +; SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm0 +; SSE42-NEXT: retq +; +; AVX1-LABEL: abd_subnsw_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm1 +; AVX1-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; 
AVX2-LABEL: abd_subnsw_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm1 +; AVX2-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: abd_subnsw_v2i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpabsq %xmm0, %xmm0 +; AVX512-NEXT: retq + %sub = sub nsw <2 x i64> %a, %b + %abs = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %sub, i1 false) + ret <2 x i64> %abs +} + +declare <16 x i8> @llvm.abs.v16i8(<16 x i8>, i1) +declare <8 x i16> @llvm.abs.v8i16(<8 x i16>, i1) +declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1) +declare <2 x i64> @llvm.abs.v2i64(<2 x i64>, i1) +declare <4 x i64> @llvm.abs.v4i64(<4 x i64>, i1) +declare <8 x i64> @llvm.abs.v8i64(<8 x i64>, i1) +declare <16 x i64> @llvm.abs.v16i64(<16 x i64>, i1) +declare <2 x i128> @llvm.abs.v2i128(<2 x i128>, i1) + +declare <16 x i8> @llvm.smax.v16i8(<16 x i8>, <16 x i8>) +declare <8 x i16> @llvm.smax.v8i16(<8 x i16>, <8 x i16>) +declare <4 x i32> @llvm.smax.v4i32(<4 x i32>, <4 x i32>) +declare <2 x i64> @llvm.smax.v2i64(<2 x i64>, <2 x i64>) + +declare <16 x i8> @llvm.smin.v16i8(<16 x i8>, <16 x i8>) +declare <8 x i16> @llvm.smin.v8i16(<8 x i16>, <8 x i16>) +declare <4 x i32> @llvm.smin.v4i32(<4 x i32>, <4 x i32>) +declare <2 x i64> @llvm.smin.v2i64(<2 x i64>, <2 x i64>) diff --git a/llvm/test/CodeGen/X86/abds-vector-256.ll b/llvm/test/CodeGen/X86/abds-vector-256.ll new file mode 100644 index 0000000..b24903f --- /dev/null +++ b/llvm/test/CodeGen/X86/abds-vector-256.ll @@ -0,0 +1,1757 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx | FileCheck %s --check-prefixes=AVX1 +; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2 +; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512 + +; +; 
trunc(abs(sub(sext(a),sext(b)))) -> abds(a,b) +; + +define <32 x i8> @abd_ext_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { +; AVX1-LABEL: abd_ext_v32i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; AVX1-NEXT: vpmovsxbw %xmm4, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] +; AVX1-NEXT: vpmovsxwq %xmm3, %xmm6 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[3,3,3,3] +; AVX1-NEXT: vpmovsxwq %xmm5, %xmm7 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; AVX1-NEXT: vpmovsxwq %xmm2, %xmm2 +; AVX1-NEXT: vpmovsxbq %xmm4, %xmm10 +; AVX1-NEXT: vpmovsxbw %xmm0, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] +; AVX1-NEXT: vpmovsxwq %xmm5, %xmm5 +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,3,3,3] +; AVX1-NEXT: vpmovsxwq %xmm4, %xmm11 +; AVX1-NEXT: vpmovsxbq %xmm0, %xmm4 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm8 +; AVX1-NEXT: vpmovsxbq %xmm8, %xmm3 +; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[2,3,2,3] +; AVX1-NEXT: vpmovsxbw %xmm8, %xmm12 +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm12[2,3,2,3] +; AVX1-NEXT: vpmovsxwq %xmm9, %xmm9 +; AVX1-NEXT: vpshufd {{.*#+}} xmm13 = xmm1[2,3,2,3] +; AVX1-NEXT: vpmovsxbw %xmm13, %xmm14 +; AVX1-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[2,3,2,3] +; AVX1-NEXT: vpmovsxwq %xmm15, %xmm15 +; AVX1-NEXT: vpsubq %xmm15, %xmm6, %xmm0 +; AVX1-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[3,3,3,3] +; AVX1-NEXT: vpmovsxwq %xmm15, %xmm15 +; AVX1-NEXT: vpsubq %xmm15, %xmm7, %xmm7 +; AVX1-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[1,1,1,1] +; AVX1-NEXT: vpmovsxwq %xmm14, %xmm14 +; AVX1-NEXT: vpsubq %xmm14, %xmm2, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpmovsxbq %xmm13, %xmm2 +; AVX1-NEXT: vpsubq %xmm2, %xmm10, %xmm6 +; AVX1-NEXT: vpmovsxbw %xmm1, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm13 = xmm2[2,3,2,3] +; 
AVX1-NEXT: vpmovsxwq %xmm13, %xmm13 +; AVX1-NEXT: vpsubq %xmm13, %xmm5, %xmm5 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX1-NEXT: vpmovsxwq %xmm2, %xmm2 +; AVX1-NEXT: vpsubq %xmm2, %xmm11, %xmm11 +; AVX1-NEXT: vpmovsxbq %xmm1, %xmm2 +; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm0 +; AVX1-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vpsrld $16, %xmm1, %xmm2 +; AVX1-NEXT: vpmovsxbq %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm10 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; AVX1-NEXT: vpmovsxbw %xmm1, %xmm13 +; AVX1-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[2,3,2,3] +; AVX1-NEXT: vpmovsxwq %xmm14, %xmm14 +; AVX1-NEXT: vpsubq %xmm14, %xmm9, %xmm9 +; AVX1-NEXT: vpshufd {{.*#+}} xmm14 = xmm12[3,3,3,3] +; AVX1-NEXT: vpmovsxwq %xmm14, %xmm14 +; AVX1-NEXT: vpshufd {{.*#+}} xmm15 = xmm13[3,3,3,3] +; AVX1-NEXT: vpmovsxwq %xmm15, %xmm15 +; AVX1-NEXT: vpsubq %xmm15, %xmm14, %xmm15 +; AVX1-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,1,1] +; AVX1-NEXT: vpmovsxwq %xmm12, %xmm12 +; AVX1-NEXT: vpmovsxbq %xmm8, %xmm14 +; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm13[1,1,1,1] +; AVX1-NEXT: vpmovsxwq %xmm8, %xmm8 +; AVX1-NEXT: vpsubq %xmm8, %xmm12, %xmm8 +; AVX1-NEXT: vpmovsxbw %xmm3, %xmm12 +; AVX1-NEXT: vpmovsxbq %xmm1, %xmm1 +; AVX1-NEXT: vpsubq %xmm1, %xmm14, %xmm14 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[2,3,2,3] +; AVX1-NEXT: vpmovsxwq %xmm1, %xmm1 +; AVX1-NEXT: vpmovsxbw %xmm2, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,3,2,3] +; AVX1-NEXT: vpmovsxwq %xmm13, %xmm13 +; AVX1-NEXT: vpsubq %xmm13, %xmm1, %xmm13 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[3,3,3,3] +; AVX1-NEXT: vpmovsxwq %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-NEXT: vpmovsxwq %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm1 +; AVX1-NEXT: vpmovsxbq %xmm3, %xmm0 +; AVX1-NEXT: vpmovsxbq %xmm2, %xmm12 +; AVX1-NEXT: 
vpsubq %xmm12, %xmm0, %xmm12 +; AVX1-NEXT: vpsrld $16, %xmm3, %xmm0 +; AVX1-NEXT: vpmovsxbq %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX1-NEXT: vpmovsxbq %xmm2, %xmm2 +; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm3 +; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm7, %xmm0, %xmm7 +; AVX1-NEXT: vpsubq %xmm4, %xmm0, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4 +; AVX1-NEXT: vblendvpd %ymm3, %ymm4, %ymm3, %ymm3 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm4 +; AVX1-NEXT: vpsubq %xmm6, %xmm0, %xmm6 +; AVX1-NEXT: vpsubq %xmm7, %xmm0, %xmm7 +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 +; AVX1-NEXT: vblendvpd %ymm4, %ymm6, %ymm4, %ymm6 +; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm5, %ymm4 +; AVX1-NEXT: vpsubq %xmm11, %xmm0, %xmm7 +; AVX1-NEXT: vpsubq %xmm5, %xmm0, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 +; AVX1-NEXT: vblendvpd %ymm4, %ymm5, %ymm4, %ymm5 +; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm4 +; AVX1-NEXT: vpsubq %xmm10, %xmm0, %xmm7 +; AVX1-NEXT: vpsubq %xmm11, %xmm0, %xmm10 +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm10, %ymm7 +; AVX1-NEXT: vblendvpd %ymm4, %ymm7, %ymm4, %ymm4 +; AVX1-NEXT: vinsertf128 $1, %xmm15, %ymm9, %ymm7 +; AVX1-NEXT: vpsubq %xmm15, %xmm0, %xmm10 +; AVX1-NEXT: vpsubq %xmm9, %xmm0, %xmm9 +; AVX1-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 +; AVX1-NEXT: vblendvpd %ymm7, %ymm9, %ymm7, %ymm7 +; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm14, %ymm9 +; AVX1-NEXT: vpsubq %xmm14, %xmm0, %xmm10 +; AVX1-NEXT: vpsubq %xmm8, %xmm0, %xmm8 +; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm10, %ymm8 +; AVX1-NEXT: vblendvpd %ymm9, %ymm8, %ymm9, %ymm8 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm13, %ymm9 +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: 
vpsubq %xmm13, %xmm0, %xmm10 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm10, %ymm1 +; AVX1-NEXT: vblendvpd %ymm9, %ymm1, %ymm9, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm12, %ymm9 +; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpsubq %xmm12, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vblendvpd %ymm9, %ymm0, %ymm9, %ymm2 +; AVX1-NEXT: vmovapd {{.*#+}} ymm0 = [255,255,255,255] +; AVX1-NEXT: vandpd %ymm0, %ymm2, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm9 +; AVX1-NEXT: vpackusdw %xmm9, %xmm2, %xmm2 +; AVX1-NEXT: vandpd %ymm0, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm9 +; AVX1-NEXT: vpackusdw %xmm9, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vandpd %ymm0, %ymm8, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm8 +; AVX1-NEXT: vpackusdw %xmm8, %xmm2, %xmm2 +; AVX1-NEXT: vandpd %ymm0, %ymm7, %ymm7 +; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm8 +; AVX1-NEXT: vpackusdw %xmm8, %xmm7, %xmm7 +; AVX1-NEXT: vpackusdw %xmm7, %xmm2, %xmm2 +; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vandpd %ymm0, %ymm4, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vandpd %ymm0, %ymm5, %ymm4 +; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vandpd %ymm0, %ymm6, %ymm4 +; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vandpd %ymm0, %ymm3, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: abd_ext_v32i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovsxbw %xmm0, %ymm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] +; AVX2-NEXT: vpmovsxwq %xmm3, %ymm3 +; AVX2-NEXT: vpmovsxbq %xmm0, 
%ymm4 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] +; AVX2-NEXT: vpmovsxwq %xmm5, %ymm5 +; AVX2-NEXT: vpmovsxwq %xmm2, %ymm2 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpmovsxbw %xmm0, %ymm6 +; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,3,2,3] +; AVX2-NEXT: vpmovsxwq %xmm7, %ymm7 +; AVX2-NEXT: vpmovsxbq %xmm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[2,3,2,3] +; AVX2-NEXT: vpmovsxwq %xmm8, %ymm8 +; AVX2-NEXT: vpmovsxwq %xmm6, %ymm6 +; AVX2-NEXT: vpmovsxbw %xmm1, %ymm9 +; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[2,3,2,3] +; AVX2-NEXT: vpmovsxwq %xmm10, %ymm10 +; AVX2-NEXT: vpsubq %ymm10, %ymm3, %ymm3 +; AVX2-NEXT: vpmovsxbq %xmm1, %ymm10 +; AVX2-NEXT: vpsubq %ymm10, %ymm4, %ymm4 +; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[2,3,2,3] +; AVX2-NEXT: vpmovsxwq %xmm10, %ymm10 +; AVX2-NEXT: vpsubq %ymm10, %ymm5, %ymm5 +; AVX2-NEXT: vpmovsxwq %xmm9, %ymm9 +; AVX2-NEXT: vpsubq %ymm9, %ymm2, %ymm2 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-NEXT: vpmovsxbw %xmm1, %ymm9 +; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[2,3,2,3] +; AVX2-NEXT: vpmovsxwq %xmm10, %ymm10 +; AVX2-NEXT: vpsubq %ymm10, %ymm7, %ymm7 +; AVX2-NEXT: vpmovsxbq %xmm1, %ymm1 +; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,3,2,3] +; AVX2-NEXT: vpmovsxwq %xmm9, %ymm9 +; AVX2-NEXT: vpsubq %ymm9, %ymm8, %ymm8 +; AVX2-NEXT: vpmovsxwq %xmm1, %ymm1 +; AVX2-NEXT: vpsubq %ymm1, %ymm6, %ymm1 +; AVX2-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX2-NEXT: vpsubq %ymm3, %ymm6, %ymm9 +; AVX2-NEXT: vblendvpd %ymm3, %ymm9, %ymm3, %ymm3 +; AVX2-NEXT: vpsubq %ymm4, %ymm6, %ymm9 +; AVX2-NEXT: vblendvpd %ymm4, %ymm9, %ymm4, %ymm4 +; AVX2-NEXT: vpsubq %ymm5, %ymm6, %ymm9 +; AVX2-NEXT: vblendvpd %ymm5, %ymm9, %ymm5, %ymm5 +; AVX2-NEXT: vpsubq %ymm2, %ymm6, %ymm9 +; AVX2-NEXT: vblendvpd %ymm2, %ymm9, 
%ymm2, %ymm2 +; AVX2-NEXT: vpsubq %ymm7, %ymm6, %ymm9 +; AVX2-NEXT: vblendvpd %ymm7, %ymm9, %ymm7, %ymm7 +; AVX2-NEXT: vpsubq %ymm0, %ymm6, %ymm9 +; AVX2-NEXT: vblendvpd %ymm0, %ymm9, %ymm0, %ymm0 +; AVX2-NEXT: vpsubq %ymm8, %ymm6, %ymm9 +; AVX2-NEXT: vblendvpd %ymm8, %ymm9, %ymm8, %ymm8 +; AVX2-NEXT: vpsubq %ymm1, %ymm6, %ymm6 +; AVX2-NEXT: vblendvpd %ymm1, %ymm6, %ymm1, %ymm1 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [255,255,255,255] +; AVX2-NEXT: vandpd %ymm6, %ymm1, %ymm1 +; AVX2-NEXT: vandpd %ymm6, %ymm8, %ymm8 +; AVX2-NEXT: vpackusdw %ymm8, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vandpd %ymm6, %ymm0, %ymm0 +; AVX2-NEXT: vandpd %ymm6, %ymm7, %ymm7 +; AVX2-NEXT: vpackusdw %ymm7, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vandpd %ymm6, %ymm2, %ymm1 +; AVX2-NEXT: vandpd %ymm6, %ymm5, %ymm2 +; AVX2-NEXT: vpackusdw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vandpd %ymm6, %ymm4, %ymm2 +; AVX2-NEXT: vandpd %ymm6, %ymm3, %ymm3 +; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] +; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vpackuswb %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: retq +; +; AVX512-LABEL: abd_ext_v32i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbw %xmm0, %ymm2 +; AVX512-NEXT: vpmovsxwq %xmm2, %zmm3 +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512-NEXT: vpmovsxwq %xmm2, %zmm2 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512-NEXT: vpmovsxwq %xmm0, %zmm4 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vpmovsxwq %xmm0, %zmm0 +; AVX512-NEXT: vpmovsxbw %xmm1, %ymm5 +; AVX512-NEXT: vpmovsxwq %xmm5, %zmm6 +; AVX512-NEXT: vpsubq %zmm6, %zmm3, 
%zmm3 +; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512-NEXT: vpmovsxwq %xmm5, %zmm5 +; AVX512-NEXT: vpsubq %zmm5, %zmm2, %zmm2 +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-NEXT: vpmovsxbw %xmm1, %ymm1 +; AVX512-NEXT: vpmovsxwq %xmm1, %zmm5 +; AVX512-NEXT: vpsubq %zmm5, %zmm4, %zmm4 +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-NEXT: vpmovsxwq %xmm1, %zmm1 +; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpabsq %zmm3, %zmm1 +; AVX512-NEXT: vpabsq %zmm2, %zmm2 +; AVX512-NEXT: vpabsq %zmm4, %zmm3 +; AVX512-NEXT: vpabsq %zmm0, %zmm0 +; AVX512-NEXT: vpmovqb %zmm0, %xmm0 +; AVX512-NEXT: vpmovqb %zmm3, %xmm3 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] +; AVX512-NEXT: vpmovqb %zmm2, %xmm2 +; AVX512-NEXT: vpmovqb %zmm1, %xmm1 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: retq + %aext = sext <32 x i8> %a to <32 x i64> + %bext = sext <32 x i8> %b to <32 x i64> + %sub = sub <32 x i64> %aext, %bext + %abs = call <32 x i64> @llvm.abs.v32i64(<32 x i64> %sub, i1 false) + %trunc = trunc <32 x i64> %abs to <32 x i8> + ret <32 x i8> %trunc +} + +define <32 x i8> @abd_ext_v32i8_undef(<32 x i8> %a, <32 x i8> %b) nounwind { +; AVX1-LABEL: abd_ext_v32i8_undef: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; AVX1-NEXT: vpmovsxbw %xmm4, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] +; AVX1-NEXT: vpmovsxwq %xmm3, %xmm6 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[3,3,3,3] +; AVX1-NEXT: vpmovsxwq %xmm5, %xmm7 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; AVX1-NEXT: vpmovsxwq %xmm2, %xmm2 +; AVX1-NEXT: vpmovsxbq %xmm4, %xmm10 +; AVX1-NEXT: vpmovsxbw %xmm0, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] +; AVX1-NEXT: vpmovsxwq %xmm5, %xmm5 +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,3,3,3] +; AVX1-NEXT: vpmovsxwq %xmm4, %xmm11 +; AVX1-NEXT: vpmovsxbq %xmm0, %xmm4 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm8 +; 
AVX1-NEXT: vpmovsxbq %xmm8, %xmm3 +; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[2,3,2,3] +; AVX1-NEXT: vpmovsxbw %xmm8, %xmm12 +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm12[2,3,2,3] +; AVX1-NEXT: vpmovsxwq %xmm9, %xmm9 +; AVX1-NEXT: vpshufd {{.*#+}} xmm13 = xmm1[2,3,2,3] +; AVX1-NEXT: vpmovsxbw %xmm13, %xmm14 +; AVX1-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[2,3,2,3] +; AVX1-NEXT: vpmovsxwq %xmm15, %xmm15 +; AVX1-NEXT: vpsubq %xmm15, %xmm6, %xmm0 +; AVX1-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[3,3,3,3] +; AVX1-NEXT: vpmovsxwq %xmm15, %xmm15 +; AVX1-NEXT: vpsubq %xmm15, %xmm7, %xmm7 +; AVX1-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[1,1,1,1] +; AVX1-NEXT: vpmovsxwq %xmm14, %xmm14 +; AVX1-NEXT: vpsubq %xmm14, %xmm2, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpmovsxbq %xmm13, %xmm2 +; AVX1-NEXT: vpsubq %xmm2, %xmm10, %xmm6 +; AVX1-NEXT: vpmovsxbw %xmm1, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm13 = xmm2[2,3,2,3] +; AVX1-NEXT: vpmovsxwq %xmm13, %xmm13 +; AVX1-NEXT: vpsubq %xmm13, %xmm5, %xmm5 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX1-NEXT: vpmovsxwq %xmm2, %xmm2 +; AVX1-NEXT: vpsubq %xmm2, %xmm11, %xmm11 +; AVX1-NEXT: vpmovsxbq %xmm1, %xmm2 +; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm0 +; AVX1-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vpsrld $16, %xmm1, %xmm2 +; AVX1-NEXT: vpmovsxbq %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm10 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; AVX1-NEXT: vpmovsxbw %xmm1, %xmm13 +; AVX1-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[2,3,2,3] +; AVX1-NEXT: vpmovsxwq %xmm14, %xmm14 +; AVX1-NEXT: vpsubq %xmm14, %xmm9, %xmm9 +; AVX1-NEXT: vpshufd {{.*#+}} xmm14 = 
xmm12[3,3,3,3] +; AVX1-NEXT: vpmovsxwq %xmm14, %xmm14 +; AVX1-NEXT: vpshufd {{.*#+}} xmm15 = xmm13[3,3,3,3] +; AVX1-NEXT: vpmovsxwq %xmm15, %xmm15 +; AVX1-NEXT: vpsubq %xmm15, %xmm14, %xmm15 +; AVX1-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,1,1] +; AVX1-NEXT: vpmovsxwq %xmm12, %xmm12 +; AVX1-NEXT: vpmovsxbq %xmm8, %xmm14 +; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm13[1,1,1,1] +; AVX1-NEXT: vpmovsxwq %xmm8, %xmm8 +; AVX1-NEXT: vpsubq %xmm8, %xmm12, %xmm8 +; AVX1-NEXT: vpmovsxbw %xmm3, %xmm12 +; AVX1-NEXT: vpmovsxbq %xmm1, %xmm1 +; AVX1-NEXT: vpsubq %xmm1, %xmm14, %xmm14 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[2,3,2,3] +; AVX1-NEXT: vpmovsxwq %xmm1, %xmm1 +; AVX1-NEXT: vpmovsxbw %xmm2, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,3,2,3] +; AVX1-NEXT: vpmovsxwq %xmm13, %xmm13 +; AVX1-NEXT: vpsubq %xmm13, %xmm1, %xmm13 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[3,3,3,3] +; AVX1-NEXT: vpmovsxwq %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-NEXT: vpmovsxwq %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm1 +; AVX1-NEXT: vpmovsxbq %xmm3, %xmm0 +; AVX1-NEXT: vpmovsxbq %xmm2, %xmm12 +; AVX1-NEXT: vpsubq %xmm12, %xmm0, %xmm12 +; AVX1-NEXT: vpsrld $16, %xmm3, %xmm0 +; AVX1-NEXT: vpmovsxbq %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX1-NEXT: vpmovsxbq %xmm2, %xmm2 +; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm3 +; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm7, %xmm0, %xmm7 +; AVX1-NEXT: vpsubq %xmm4, %xmm0, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4 +; AVX1-NEXT: vblendvpd %ymm3, %ymm4, %ymm3, %ymm3 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm4 +; AVX1-NEXT: vpsubq %xmm6, %xmm0, %xmm6 +; AVX1-NEXT: vpsubq %xmm7, %xmm0, %xmm7 +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 +; AVX1-NEXT: vblendvpd %ymm4, %ymm6, 
%ymm4, %ymm6 +; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm5, %ymm4 +; AVX1-NEXT: vpsubq %xmm11, %xmm0, %xmm7 +; AVX1-NEXT: vpsubq %xmm5, %xmm0, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 +; AVX1-NEXT: vblendvpd %ymm4, %ymm5, %ymm4, %ymm5 +; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm4 +; AVX1-NEXT: vpsubq %xmm10, %xmm0, %xmm7 +; AVX1-NEXT: vpsubq %xmm11, %xmm0, %xmm10 +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm10, %ymm7 +; AVX1-NEXT: vblendvpd %ymm4, %ymm7, %ymm4, %ymm4 +; AVX1-NEXT: vinsertf128 $1, %xmm15, %ymm9, %ymm7 +; AVX1-NEXT: vpsubq %xmm15, %xmm0, %xmm10 +; AVX1-NEXT: vpsubq %xmm9, %xmm0, %xmm9 +; AVX1-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 +; AVX1-NEXT: vblendvpd %ymm7, %ymm9, %ymm7, %ymm7 +; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm14, %ymm9 +; AVX1-NEXT: vpsubq %xmm14, %xmm0, %xmm10 +; AVX1-NEXT: vpsubq %xmm8, %xmm0, %xmm8 +; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm10, %ymm8 +; AVX1-NEXT: vblendvpd %ymm9, %ymm8, %ymm9, %ymm8 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm13, %ymm9 +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpsubq %xmm13, %xmm0, %xmm10 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm10, %ymm1 +; AVX1-NEXT: vblendvpd %ymm9, %ymm1, %ymm9, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm12, %ymm9 +; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpsubq %xmm12, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vblendvpd %ymm9, %ymm0, %ymm9, %ymm2 +; AVX1-NEXT: vmovapd {{.*#+}} ymm0 = [255,255,255,255] +; AVX1-NEXT: vandpd %ymm0, %ymm2, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm9 +; AVX1-NEXT: vpackusdw %xmm9, %xmm2, %xmm2 +; AVX1-NEXT: vandpd %ymm0, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm9 +; AVX1-NEXT: vpackusdw %xmm9, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vandpd %ymm0, %ymm8, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm8 +; AVX1-NEXT: vpackusdw %xmm8, %xmm2, %xmm2 +; 
AVX1-NEXT: vandpd %ymm0, %ymm7, %ymm7 +; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm8 +; AVX1-NEXT: vpackusdw %xmm8, %xmm7, %xmm7 +; AVX1-NEXT: vpackusdw %xmm7, %xmm2, %xmm2 +; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vandpd %ymm0, %ymm4, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vandpd %ymm0, %ymm5, %ymm4 +; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vandpd %ymm0, %ymm6, %ymm4 +; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vandpd %ymm0, %ymm3, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: abd_ext_v32i8_undef: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovsxbw %xmm0, %ymm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] +; AVX2-NEXT: vpmovsxwq %xmm3, %ymm3 +; AVX2-NEXT: vpmovsxbq %xmm0, %ymm4 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] +; AVX2-NEXT: vpmovsxwq %xmm5, %ymm5 +; AVX2-NEXT: vpmovsxwq %xmm2, %ymm2 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpmovsxbw %xmm0, %ymm6 +; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,3,2,3] +; AVX2-NEXT: vpmovsxwq %xmm7, %ymm7 +; AVX2-NEXT: vpmovsxbq %xmm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[2,3,2,3] +; AVX2-NEXT: vpmovsxwq %xmm8, %ymm8 +; AVX2-NEXT: vpmovsxwq %xmm6, %ymm6 +; AVX2-NEXT: vpmovsxbw %xmm1, %ymm9 +; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[2,3,2,3] +; AVX2-NEXT: vpmovsxwq %xmm10, %ymm10 +; AVX2-NEXT: vpsubq %ymm10, %ymm3, %ymm3 +; AVX2-NEXT: vpmovsxbq %xmm1, %ymm10 +; AVX2-NEXT: vpsubq %ymm10, %ymm4, %ymm4 +; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm9 +; 
AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[2,3,2,3] +; AVX2-NEXT: vpmovsxwq %xmm10, %ymm10 +; AVX2-NEXT: vpsubq %ymm10, %ymm5, %ymm5 +; AVX2-NEXT: vpmovsxwq %xmm9, %ymm9 +; AVX2-NEXT: vpsubq %ymm9, %ymm2, %ymm2 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-NEXT: vpmovsxbw %xmm1, %ymm9 +; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[2,3,2,3] +; AVX2-NEXT: vpmovsxwq %xmm10, %ymm10 +; AVX2-NEXT: vpsubq %ymm10, %ymm7, %ymm7 +; AVX2-NEXT: vpmovsxbq %xmm1, %ymm1 +; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,3,2,3] +; AVX2-NEXT: vpmovsxwq %xmm9, %ymm9 +; AVX2-NEXT: vpsubq %ymm9, %ymm8, %ymm8 +; AVX2-NEXT: vpmovsxwq %xmm1, %ymm1 +; AVX2-NEXT: vpsubq %ymm1, %ymm6, %ymm1 +; AVX2-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX2-NEXT: vpsubq %ymm3, %ymm6, %ymm9 +; AVX2-NEXT: vblendvpd %ymm3, %ymm9, %ymm3, %ymm3 +; AVX2-NEXT: vpsubq %ymm4, %ymm6, %ymm9 +; AVX2-NEXT: vblendvpd %ymm4, %ymm9, %ymm4, %ymm4 +; AVX2-NEXT: vpsubq %ymm5, %ymm6, %ymm9 +; AVX2-NEXT: vblendvpd %ymm5, %ymm9, %ymm5, %ymm5 +; AVX2-NEXT: vpsubq %ymm2, %ymm6, %ymm9 +; AVX2-NEXT: vblendvpd %ymm2, %ymm9, %ymm2, %ymm2 +; AVX2-NEXT: vpsubq %ymm7, %ymm6, %ymm9 +; AVX2-NEXT: vblendvpd %ymm7, %ymm9, %ymm7, %ymm7 +; AVX2-NEXT: vpsubq %ymm0, %ymm6, %ymm9 +; AVX2-NEXT: vblendvpd %ymm0, %ymm9, %ymm0, %ymm0 +; AVX2-NEXT: vpsubq %ymm8, %ymm6, %ymm9 +; AVX2-NEXT: vblendvpd %ymm8, %ymm9, %ymm8, %ymm8 +; AVX2-NEXT: vpsubq %ymm1, %ymm6, %ymm6 +; AVX2-NEXT: vblendvpd %ymm1, %ymm6, %ymm1, %ymm1 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [255,255,255,255] +; AVX2-NEXT: vandpd %ymm6, %ymm1, %ymm1 +; AVX2-NEXT: vandpd %ymm6, %ymm8, %ymm8 +; AVX2-NEXT: vpackusdw %ymm8, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vandpd %ymm6, %ymm0, %ymm0 +; AVX2-NEXT: vandpd %ymm6, %ymm7, %ymm7 +; AVX2-NEXT: vpackusdw %ymm7, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; 
AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vandpd %ymm6, %ymm2, %ymm1 +; AVX2-NEXT: vandpd %ymm6, %ymm5, %ymm2 +; AVX2-NEXT: vpackusdw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vandpd %ymm6, %ymm4, %ymm2 +; AVX2-NEXT: vandpd %ymm6, %ymm3, %ymm3 +; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] +; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vpackuswb %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: retq +; +; AVX512-LABEL: abd_ext_v32i8_undef: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbw %xmm0, %ymm2 +; AVX512-NEXT: vpmovsxwq %xmm2, %zmm3 +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512-NEXT: vpmovsxwq %xmm2, %zmm2 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512-NEXT: vpmovsxwq %xmm0, %zmm4 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vpmovsxwq %xmm0, %zmm0 +; AVX512-NEXT: vpmovsxbw %xmm1, %ymm5 +; AVX512-NEXT: vpmovsxwq %xmm5, %zmm6 +; AVX512-NEXT: vpsubq %zmm6, %zmm3, %zmm3 +; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512-NEXT: vpmovsxwq %xmm5, %zmm5 +; AVX512-NEXT: vpsubq %zmm5, %zmm2, %zmm2 +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-NEXT: vpmovsxbw %xmm1, %ymm1 +; AVX512-NEXT: vpmovsxwq %xmm1, %zmm5 +; AVX512-NEXT: vpsubq %zmm5, %zmm4, %zmm4 +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-NEXT: vpmovsxwq %xmm1, %zmm1 +; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpabsq %zmm3, %zmm1 +; AVX512-NEXT: vpabsq %zmm2, %zmm2 +; AVX512-NEXT: vpabsq %zmm4, %zmm3 +; AVX512-NEXT: vpabsq %zmm0, %zmm0 +; AVX512-NEXT: vpmovqb %zmm0, %xmm0 +; AVX512-NEXT: vpmovqb %zmm3, %xmm3 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] +; AVX512-NEXT: vpmovqb %zmm2, %xmm2 +; AVX512-NEXT: vpmovqb %zmm1, %xmm1 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] 
+; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: retq + %aext = sext <32 x i8> %a to <32 x i64> + %bext = sext <32 x i8> %b to <32 x i64> + %sub = sub <32 x i64> %aext, %bext + %abs = call <32 x i64> @llvm.abs.v32i64(<32 x i64> %sub, i1 true) + %trunc = trunc <32 x i64> %abs to <32 x i8> + ret <32 x i8> %trunc +} + +define <16 x i16> @abd_ext_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { +; AVX1-LABEL: abd_ext_v16i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; AVX1-NEXT: vpmovsxwq %xmm2, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX1-NEXT: vpmovsxwq %xmm3, %xmm3 +; AVX1-NEXT: vpmovsxwq %xmm0, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,1,1] +; AVX1-NEXT: vpmovsxwq %xmm5, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,3,2,3] +; AVX1-NEXT: vpmovsxwq %xmm6, %xmm6 +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] +; AVX1-NEXT: vpmovsxwq %xmm7, %xmm7 +; AVX1-NEXT: vpmovsxwq %xmm0, %xmm8 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX1-NEXT: vpmovsxwq %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,3,2,3] +; AVX1-NEXT: vpmovsxwq %xmm9, %xmm9 +; AVX1-NEXT: vpsubq %xmm9, %xmm2, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[3,3,3,3] +; AVX1-NEXT: vpmovsxwq %xmm9, %xmm9 +; AVX1-NEXT: vpsubq %xmm9, %xmm3, %xmm3 +; AVX1-NEXT: vpmovsxwq %xmm1, %xmm9 +; AVX1-NEXT: vpsubq %xmm9, %xmm4, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[1,1,1,1] +; AVX1-NEXT: vpmovsxwq %xmm9, %xmm9 +; AVX1-NEXT: vpsubq %xmm9, %xmm5, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,3,2,3] +; AVX1-NEXT: vpmovsxwq %xmm9, %xmm9 +; AVX1-NEXT: vpsubq %xmm9, %xmm6, %xmm6 +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[3,3,3,3] +; AVX1-NEXT: vpmovsxwq %xmm9, %xmm9 +; AVX1-NEXT: vpsubq %xmm9, %xmm7, %xmm7 +; AVX1-NEXT: vpmovsxwq %xmm1, %xmm9 +; AVX1-NEXT: vpsubq %xmm9, %xmm8, %xmm8 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 
= xmm1[1,1,1,1] +; AVX1-NEXT: vpmovsxwq %xmm1, %xmm1 +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm8, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm9 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm10 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm11 +; AVX1-NEXT: vpxor %xmm12, %xmm12, %xmm12 +; AVX1-NEXT: vpsubq %xmm3, %xmm12, %xmm3 +; AVX1-NEXT: vpsubq %xmm2, %xmm12, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT: vblendvpd %ymm11, %ymm2, %ymm11, %ymm2 +; AVX1-NEXT: vpsubq %xmm5, %xmm12, %xmm3 +; AVX1-NEXT: vpsubq %xmm4, %xmm12, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-NEXT: vblendvpd %ymm10, %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vpsubq %xmm7, %xmm12, %xmm4 +; AVX1-NEXT: vpsubq %xmm6, %xmm12, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 +; AVX1-NEXT: vblendvpd %ymm9, %ymm4, %ymm9, %ymm4 +; AVX1-NEXT: vpsubq %xmm0, %xmm12, %xmm0 +; AVX1-NEXT: vpsubq %xmm8, %xmm12, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 +; AVX1-NEXT: vblendvpd %ymm1, %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [65535,65535,65535,65535] +; AVX1-NEXT: vandpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vandpd %ymm1, %ymm4, %ymm4 +; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpackusdw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vandpd %ymm1, %ymm3, %ymm3 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vandpd %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: abd_ext_v16i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmovsxwq %xmm2, %ymm2 +; AVX2-NEXT: vpmovsxwq %xmm0, %ymm3 
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmovsxwq %xmm4, %ymm4 +; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] +; AVX2-NEXT: vpmovsxwq %xmm5, %ymm5 +; AVX2-NEXT: vpsubq %ymm5, %ymm2, %ymm2 +; AVX2-NEXT: vpmovsxwq %xmm1, %ymm5 +; AVX2-NEXT: vpsubq %ymm5, %ymm3, %ymm3 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] +; AVX2-NEXT: vpmovsxwq %xmm5, %ymm5 +; AVX2-NEXT: vpsubq %ymm5, %ymm4, %ymm4 +; AVX2-NEXT: vpmovsxwq %xmm1, %ymm1 +; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsubq %ymm2, %ymm1, %ymm5 +; AVX2-NEXT: vblendvpd %ymm2, %ymm5, %ymm2, %ymm2 +; AVX2-NEXT: vpsubq %ymm3, %ymm1, %ymm5 +; AVX2-NEXT: vblendvpd %ymm3, %ymm5, %ymm3, %ymm3 +; AVX2-NEXT: vpsubq %ymm4, %ymm1, %ymm5 +; AVX2-NEXT: vblendvpd %ymm4, %ymm5, %ymm4, %ymm4 +; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm5 +; AVX2-NEXT: vblendvpd %ymm0, %ymm5, %ymm0, %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] +; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm1[1,2,3],ymm4[4],ymm1[5,6,7],ymm4[8],ymm1[9,10,11],ymm4[12],ymm1[13,14,15] +; AVX2-NEXT: vpackusdw %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm1[1,2,3],ymm3[4],ymm1[5,6,7],ymm3[8],ymm1[9,10,11],ymm3[12],ymm1[13,14,15] +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] +; AVX2-NEXT: vpackusdw %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: retq +; +; AVX512-LABEL: abd_ext_v16i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512-NEXT: vpmovsxwq %xmm2, %zmm2 +; AVX512-NEXT: 
vpmovsxwq %xmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512-NEXT: vpmovsxwq %xmm3, %zmm3 +; AVX512-NEXT: vpsubq %zmm3, %zmm2, %zmm2 +; AVX512-NEXT: vpmovsxwq %xmm1, %zmm1 +; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpabsq %zmm2, %zmm1 +; AVX512-NEXT: vpabsq %zmm0, %zmm0 +; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vpmovqw %zmm1, %xmm1 +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: retq + %aext = sext <16 x i16> %a to <16 x i64> + %bext = sext <16 x i16> %b to <16 x i64> + %sub = sub <16 x i64> %aext, %bext + %abs = call <16 x i64> @llvm.abs.v16i64(<16 x i64> %sub, i1 false) + %trunc = trunc <16 x i64> %abs to <16 x i16> + ret <16 x i16> %trunc +} + +define <16 x i16> @abd_ext_v16i16_undef(<16 x i16> %a, <16 x i16> %b) nounwind { +; AVX1-LABEL: abd_ext_v16i16_undef: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; AVX1-NEXT: vpmovsxwq %xmm2, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX1-NEXT: vpmovsxwq %xmm3, %xmm3 +; AVX1-NEXT: vpmovsxwq %xmm0, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,1,1] +; AVX1-NEXT: vpmovsxwq %xmm5, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,3,2,3] +; AVX1-NEXT: vpmovsxwq %xmm6, %xmm6 +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] +; AVX1-NEXT: vpmovsxwq %xmm7, %xmm7 +; AVX1-NEXT: vpmovsxwq %xmm0, %xmm8 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX1-NEXT: vpmovsxwq %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,3,2,3] +; AVX1-NEXT: vpmovsxwq %xmm9, %xmm9 +; AVX1-NEXT: vpsubq %xmm9, %xmm2, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[3,3,3,3] +; AVX1-NEXT: vpmovsxwq %xmm9, %xmm9 +; AVX1-NEXT: vpsubq %xmm9, %xmm3, %xmm3 +; AVX1-NEXT: vpmovsxwq %xmm1, %xmm9 +; AVX1-NEXT: vpsubq %xmm9, %xmm4, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[1,1,1,1] +; AVX1-NEXT: vpmovsxwq %xmm9, %xmm9 +; AVX1-NEXT: vpsubq %xmm9, %xmm5, %xmm5 +; AVX1-NEXT: 
vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,3,2,3] +; AVX1-NEXT: vpmovsxwq %xmm9, %xmm9 +; AVX1-NEXT: vpsubq %xmm9, %xmm6, %xmm6 +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[3,3,3,3] +; AVX1-NEXT: vpmovsxwq %xmm9, %xmm9 +; AVX1-NEXT: vpsubq %xmm9, %xmm7, %xmm7 +; AVX1-NEXT: vpmovsxwq %xmm1, %xmm9 +; AVX1-NEXT: vpsubq %xmm9, %xmm8, %xmm8 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX1-NEXT: vpmovsxwq %xmm1, %xmm1 +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm8, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm9 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm10 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm11 +; AVX1-NEXT: vpxor %xmm12, %xmm12, %xmm12 +; AVX1-NEXT: vpsubq %xmm3, %xmm12, %xmm3 +; AVX1-NEXT: vpsubq %xmm2, %xmm12, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT: vblendvpd %ymm11, %ymm2, %ymm11, %ymm2 +; AVX1-NEXT: vpsubq %xmm5, %xmm12, %xmm3 +; AVX1-NEXT: vpsubq %xmm4, %xmm12, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-NEXT: vblendvpd %ymm10, %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vpsubq %xmm7, %xmm12, %xmm4 +; AVX1-NEXT: vpsubq %xmm6, %xmm12, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 +; AVX1-NEXT: vblendvpd %ymm9, %ymm4, %ymm9, %ymm4 +; AVX1-NEXT: vpsubq %xmm0, %xmm12, %xmm0 +; AVX1-NEXT: vpsubq %xmm8, %xmm12, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 +; AVX1-NEXT: vblendvpd %ymm1, %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [65535,65535,65535,65535] +; AVX1-NEXT: vandpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vandpd %ymm1, %ymm4, %ymm4 +; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpackusdw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vandpd %ymm1, %ymm3, %ymm3 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: 
vandpd %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: abd_ext_v16i16_undef: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmovsxwq %xmm2, %ymm2 +; AVX2-NEXT: vpmovsxwq %xmm0, %ymm3 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmovsxwq %xmm4, %ymm4 +; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] +; AVX2-NEXT: vpmovsxwq %xmm5, %ymm5 +; AVX2-NEXT: vpsubq %ymm5, %ymm2, %ymm2 +; AVX2-NEXT: vpmovsxwq %xmm1, %ymm5 +; AVX2-NEXT: vpsubq %ymm5, %ymm3, %ymm3 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] +; AVX2-NEXT: vpmovsxwq %xmm5, %ymm5 +; AVX2-NEXT: vpsubq %ymm5, %ymm4, %ymm4 +; AVX2-NEXT: vpmovsxwq %xmm1, %ymm1 +; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsubq %ymm2, %ymm1, %ymm5 +; AVX2-NEXT: vblendvpd %ymm2, %ymm5, %ymm2, %ymm2 +; AVX2-NEXT: vpsubq %ymm3, %ymm1, %ymm5 +; AVX2-NEXT: vblendvpd %ymm3, %ymm5, %ymm3, %ymm3 +; AVX2-NEXT: vpsubq %ymm4, %ymm1, %ymm5 +; AVX2-NEXT: vblendvpd %ymm4, %ymm5, %ymm4, %ymm4 +; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm5 +; AVX2-NEXT: vblendvpd %ymm0, %ymm5, %ymm0, %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] +; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm1[1,2,3],ymm4[4],ymm1[5,6,7],ymm4[8],ymm1[9,10,11],ymm4[12],ymm1[13,14,15] +; AVX2-NEXT: vpackusdw %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm1[1,2,3],ymm3[4],ymm1[5,6,7],ymm3[8],ymm1[9,10,11],ymm3[12],ymm1[13,14,15] +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] +; AVX2-NEXT: vpackusdw %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: retq +; +; AVX512-LABEL: abd_ext_v16i16_undef: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512-NEXT: vpmovsxwq %xmm2, %zmm2 +; AVX512-NEXT: vpmovsxwq %xmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512-NEXT: vpmovsxwq %xmm3, %zmm3 +; AVX512-NEXT: vpsubq %zmm3, %zmm2, %zmm2 +; AVX512-NEXT: vpmovsxwq %xmm1, %zmm1 +; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpabsq %zmm2, %zmm1 +; AVX512-NEXT: vpabsq %zmm0, %zmm0 +; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vpmovqw %zmm1, %xmm1 +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: retq + %aext = sext <16 x i16> %a to <16 x i64> + %bext = sext <16 x i16> %b to <16 x i64> + %sub = sub <16 x i64> %aext, %bext + %abs = call <16 x i64> @llvm.abs.v16i64(<16 x i64> %sub, i1 true) + %trunc = trunc <16 x i64> %abs to <16 x i16> + ret <16 x i16> %trunc +} + +define <8 x i32> @abd_ext_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { +; AVX1-LABEL: abd_ext_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpmovsxdq %xmm2, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX1-NEXT: vpmovsxdq %xmm2, %xmm2 +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vpmovsxdq %xmm5, %xmm6 +; AVX1-NEXT: vpsubq %xmm6, %xmm3, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] +; AVX1-NEXT: vpmovsxdq %xmm5, %xmm5 +; AVX1-NEXT: vpsubq %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpmovsxdq %xmm1, %xmm5 +; AVX1-NEXT: vpsubq %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-NEXT: vpmovsxdq 
%xmm1, %xmm1 +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm5 +; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX1-NEXT: vpsubq %xmm2, %xmm6, %xmm2 +; AVX1-NEXT: vpsubq %xmm3, %xmm6, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-NEXT: vblendvpd %ymm5, %ymm2, %ymm5, %ymm2 +; AVX1-NEXT: vpsubq %xmm0, %xmm6, %xmm0 +; AVX1-NEXT: vpsubq %xmm4, %xmm6, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-NEXT: vblendvpd %ymm1, %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm2[2,3] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] +; AVX1-NEXT: retq +; +; AVX2-LABEL: abd_ext_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpmovsxdq %xmm2, %ymm2 +; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-NEXT: vpmovsxdq %xmm3, %ymm3 +; AVX2-NEXT: vpsubq %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1 +; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsubq %ymm2, %ymm1, %ymm3 +; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm1 +; AVX2-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm2[2,3] +; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] +; AVX2-NEXT: retq +; +; AVX512-LABEL: abd_ext_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxdq %ymm0, %zmm0 +; AVX512-NEXT: vpmovsxdq %ymm1, %zmm1 +; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpabsq %zmm0, %zmm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: retq + %aext = sext <8 x i32> %a to <8 x i64> + %bext = sext <8 x i32> %b to <8 x i64> + %sub = sub <8 x i64> %aext, %bext + %abs = call <8 x i64> @llvm.abs.v8i64(<8 
x i64> %sub, i1 false) + %trunc = trunc <8 x i64> %abs to <8 x i32> + ret <8 x i32> %trunc +} + +define <8 x i32> @abd_ext_v8i32_undef(<8 x i32> %a, <8 x i32> %b) nounwind { +; AVX1-LABEL: abd_ext_v8i32_undef: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpmovsxdq %xmm2, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX1-NEXT: vpmovsxdq %xmm2, %xmm2 +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vpmovsxdq %xmm5, %xmm6 +; AVX1-NEXT: vpsubq %xmm6, %xmm3, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] +; AVX1-NEXT: vpmovsxdq %xmm5, %xmm5 +; AVX1-NEXT: vpsubq %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpmovsxdq %xmm1, %xmm5 +; AVX1-NEXT: vpsubq %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1 +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm5 +; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX1-NEXT: vpsubq %xmm2, %xmm6, %xmm2 +; AVX1-NEXT: vpsubq %xmm3, %xmm6, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-NEXT: vblendvpd %ymm5, %ymm2, %ymm5, %ymm2 +; AVX1-NEXT: vpsubq %xmm0, %xmm6, %xmm0 +; AVX1-NEXT: vpsubq %xmm4, %xmm6, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-NEXT: vblendvpd %ymm1, %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm2[2,3] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] +; AVX1-NEXT: retq +; +; AVX2-LABEL: abd_ext_v8i32_undef: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpmovsxdq %xmm2, %ymm2 +; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-NEXT: vpmovsxdq %xmm3, %ymm3 +; AVX2-NEXT: vpsubq %ymm3, %ymm2, %ymm2 +; 
AVX2-NEXT: vpmovsxdq %xmm1, %ymm1 +; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsubq %ymm2, %ymm1, %ymm3 +; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm1 +; AVX2-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm2[2,3] +; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] +; AVX2-NEXT: retq +; +; AVX512-LABEL: abd_ext_v8i32_undef: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxdq %ymm0, %zmm0 +; AVX512-NEXT: vpmovsxdq %ymm1, %zmm1 +; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpabsq %zmm0, %zmm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: retq + %aext = sext <8 x i32> %a to <8 x i64> + %bext = sext <8 x i32> %b to <8 x i64> + %sub = sub <8 x i64> %aext, %bext + %abs = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %sub, i1 true) + %trunc = trunc <8 x i64> %abs to <8 x i32> + ret <8 x i32> %trunc +} + +define <4 x i64> @abd_ext_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { +; AVX1-LABEL: abd_ext_v4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: pushq %r15 +; AVX1-NEXT: pushq %r14 +; AVX1-NEXT: pushq %r13 +; AVX1-NEXT: pushq %r12 +; AVX1-NEXT: pushq %rbx +; AVX1-NEXT: vmovq %xmm0, %r11 +; AVX1-NEXT: movq %r11, %r10 +; AVX1-NEXT: sarq $63, %r10 +; AVX1-NEXT: vpextrq $1, %xmm0, %rcx +; AVX1-NEXT: movq %rcx, %r9 +; AVX1-NEXT: sarq $63, %r9 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rdx +; AVX1-NEXT: movq %rdx, %r8 +; AVX1-NEXT: sarq $63, %r8 +; AVX1-NEXT: vpextrq $1, %xmm0, %rsi +; AVX1-NEXT: movq %rsi, %rdi +; AVX1-NEXT: sarq $63, %rdi +; AVX1-NEXT: vmovq %xmm1, %rbx +; AVX1-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: sarq $63, %rbx +; AVX1-NEXT: vpextrq $1, %xmm1, %r14 +; AVX1-NEXT: movq %r14, %r15 +; AVX1-NEXT: sarq $63, %r15 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; 
AVX1-NEXT: vmovq %xmm0, %r12 +; AVX1-NEXT: movq %r12, %r13 +; AVX1-NEXT: sarq $63, %r13 +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: movq %rax, %rbp +; AVX1-NEXT: sarq $63, %rbp +; AVX1-NEXT: subq %rax, %rsi +; AVX1-NEXT: sbbq %rbp, %rdi +; AVX1-NEXT: subq %r12, %rdx +; AVX1-NEXT: sbbq %r13, %r8 +; AVX1-NEXT: subq %r14, %rcx +; AVX1-NEXT: sbbq %r15, %r9 +; AVX1-NEXT: subq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload +; AVX1-NEXT: sbbq %rbx, %r10 +; AVX1-NEXT: sarq $63, %r10 +; AVX1-NEXT: xorq %r10, %r11 +; AVX1-NEXT: subq %r10, %r11 +; AVX1-NEXT: sarq $63, %r9 +; AVX1-NEXT: xorq %r9, %rcx +; AVX1-NEXT: subq %r9, %rcx +; AVX1-NEXT: sarq $63, %r8 +; AVX1-NEXT: xorq %r8, %rdx +; AVX1-NEXT: subq %r8, %rdx +; AVX1-NEXT: sarq $63, %rdi +; AVX1-NEXT: xorq %rdi, %rsi +; AVX1-NEXT: subq %rdi, %rsi +; AVX1-NEXT: vmovq %rsi, %xmm0 +; AVX1-NEXT: vmovq %rdx, %xmm1 +; AVX1-NEXT: vmovq %rcx, %xmm2 +; AVX1-NEXT: vmovq %r11, %xmm3 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: popq %rbx +; AVX1-NEXT: popq %r12 +; AVX1-NEXT: popq %r13 +; AVX1-NEXT: popq %r14 +; AVX1-NEXT: popq %r15 +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX2-LABEL: abd_ext_v4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %r13 +; AVX2-NEXT: pushq %r12 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: vmovq %xmm0, %r11 +; AVX2-NEXT: movq %r11, %r10 +; AVX2-NEXT: sarq $63, %r10 +; AVX2-NEXT: vpextrq $1, %xmm0, %rcx +; AVX2-NEXT: movq %rcx, %r9 +; AVX2-NEXT: sarq $63, %r9 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rdx +; AVX2-NEXT: movq %rdx, %r8 +; AVX2-NEXT: sarq $63, %r8 +; AVX2-NEXT: vpextrq $1, %xmm0, %rsi +; AVX2-NEXT: movq %rsi, %rdi +; AVX2-NEXT: sarq $63, %rdi +; AVX2-NEXT: vmovq %xmm1, %rbx +; AVX2-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; 
AVX2-NEXT: sarq $63, %rbx +; AVX2-NEXT: vpextrq $1, %xmm1, %r14 +; AVX2-NEXT: movq %r14, %r15 +; AVX2-NEXT: sarq $63, %r15 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %r12 +; AVX2-NEXT: movq %r12, %r13 +; AVX2-NEXT: sarq $63, %r13 +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: movq %rax, %rbp +; AVX2-NEXT: sarq $63, %rbp +; AVX2-NEXT: subq %rax, %rsi +; AVX2-NEXT: sbbq %rbp, %rdi +; AVX2-NEXT: subq %r12, %rdx +; AVX2-NEXT: sbbq %r13, %r8 +; AVX2-NEXT: subq %r14, %rcx +; AVX2-NEXT: sbbq %r15, %r9 +; AVX2-NEXT: subq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload +; AVX2-NEXT: sbbq %rbx, %r10 +; AVX2-NEXT: sarq $63, %r10 +; AVX2-NEXT: xorq %r10, %r11 +; AVX2-NEXT: subq %r10, %r11 +; AVX2-NEXT: sarq $63, %r9 +; AVX2-NEXT: xorq %r9, %rcx +; AVX2-NEXT: subq %r9, %rcx +; AVX2-NEXT: sarq $63, %r8 +; AVX2-NEXT: xorq %r8, %rdx +; AVX2-NEXT: subq %r8, %rdx +; AVX2-NEXT: sarq $63, %rdi +; AVX2-NEXT: xorq %rdi, %rsi +; AVX2-NEXT: subq %rdi, %rsi +; AVX2-NEXT: vmovq %rsi, %xmm0 +; AVX2-NEXT: vmovq %rdx, %xmm1 +; AVX2-NEXT: vmovq %rcx, %xmm2 +; AVX2-NEXT: vmovq %r11, %xmm3 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r12 +; AVX2-NEXT: popq %r13 +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512-LABEL: abd_ext_v4i64: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: pushq %r15 +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %r13 +; AVX512-NEXT: pushq %r12 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: vmovq %xmm0, %r11 +; AVX512-NEXT: movq %r11, %r10 +; AVX512-NEXT: sarq $63, %r10 +; AVX512-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512-NEXT: movq %rcx, %r9 +; AVX512-NEXT: sarq $63, %r9 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rdx +; AVX512-NEXT: movq %rdx, %r8 +; AVX512-NEXT: sarq $63, %r8 
+; AVX512-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512-NEXT: movq %rsi, %rdi +; AVX512-NEXT: sarq $63, %rdi +; AVX512-NEXT: vmovq %xmm1, %rbx +; AVX512-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: sarq $63, %rbx +; AVX512-NEXT: vpextrq $1, %xmm1, %r14 +; AVX512-NEXT: movq %r14, %r15 +; AVX512-NEXT: sarq $63, %r15 +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %r12 +; AVX512-NEXT: movq %r12, %r13 +; AVX512-NEXT: sarq $63, %r13 +; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: movq %rax, %rbp +; AVX512-NEXT: sarq $63, %rbp +; AVX512-NEXT: subq %rax, %rsi +; AVX512-NEXT: sbbq %rbp, %rdi +; AVX512-NEXT: subq %r12, %rdx +; AVX512-NEXT: sbbq %r13, %r8 +; AVX512-NEXT: subq %r14, %rcx +; AVX512-NEXT: sbbq %r15, %r9 +; AVX512-NEXT: subq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload +; AVX512-NEXT: sbbq %rbx, %r10 +; AVX512-NEXT: sarq $63, %r10 +; AVX512-NEXT: xorq %r10, %r11 +; AVX512-NEXT: subq %r10, %r11 +; AVX512-NEXT: sarq $63, %r9 +; AVX512-NEXT: xorq %r9, %rcx +; AVX512-NEXT: subq %r9, %rcx +; AVX512-NEXT: sarq $63, %r8 +; AVX512-NEXT: xorq %r8, %rdx +; AVX512-NEXT: subq %r8, %rdx +; AVX512-NEXT: sarq $63, %rdi +; AVX512-NEXT: xorq %rdi, %rsi +; AVX512-NEXT: subq %rdi, %rsi +; AVX512-NEXT: vmovq %rsi, %xmm0 +; AVX512-NEXT: vmovq %rdx, %xmm1 +; AVX512-NEXT: vmovq %rcx, %xmm2 +; AVX512-NEXT: vmovq %r11, %xmm3 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0] +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r12 +; AVX512-NEXT: popq %r13 +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %r15 +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq + %aext = sext <4 x i64> %a to <4 x i128> + %bext = sext <4 x i64> %b to <4 x i128> + %sub = sub <4 x i128> %aext, %bext + %abs = call <4 x i128> @llvm.abs.v4i128(<4 x i128> %sub, i1 false) + %trunc = trunc <4 x i128> %abs to <4 x i64> + ret <4 x i64> %trunc 
+} + +define <4 x i64> @abd_ext_v4i64_undef(<4 x i64> %a, <4 x i64> %b) nounwind { +; AVX1-LABEL: abd_ext_v4i64_undef: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: pushq %r15 +; AVX1-NEXT: pushq %r14 +; AVX1-NEXT: pushq %r13 +; AVX1-NEXT: pushq %r12 +; AVX1-NEXT: pushq %rbx +; AVX1-NEXT: vmovq %xmm0, %r11 +; AVX1-NEXT: movq %r11, %r10 +; AVX1-NEXT: sarq $63, %r10 +; AVX1-NEXT: vpextrq $1, %xmm0, %rcx +; AVX1-NEXT: movq %rcx, %r9 +; AVX1-NEXT: sarq $63, %r9 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rdx +; AVX1-NEXT: movq %rdx, %r8 +; AVX1-NEXT: sarq $63, %r8 +; AVX1-NEXT: vpextrq $1, %xmm0, %rsi +; AVX1-NEXT: movq %rsi, %rdi +; AVX1-NEXT: sarq $63, %rdi +; AVX1-NEXT: vmovq %xmm1, %rbx +; AVX1-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: sarq $63, %rbx +; AVX1-NEXT: vpextrq $1, %xmm1, %r14 +; AVX1-NEXT: movq %r14, %r15 +; AVX1-NEXT: sarq $63, %r15 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %r12 +; AVX1-NEXT: movq %r12, %r13 +; AVX1-NEXT: sarq $63, %r13 +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: movq %rax, %rbp +; AVX1-NEXT: sarq $63, %rbp +; AVX1-NEXT: subq %rax, %rsi +; AVX1-NEXT: sbbq %rbp, %rdi +; AVX1-NEXT: subq %r12, %rdx +; AVX1-NEXT: sbbq %r13, %r8 +; AVX1-NEXT: subq %r14, %rcx +; AVX1-NEXT: sbbq %r15, %r9 +; AVX1-NEXT: subq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload +; AVX1-NEXT: sbbq %rbx, %r10 +; AVX1-NEXT: sarq $63, %r10 +; AVX1-NEXT: xorq %r10, %r11 +; AVX1-NEXT: subq %r10, %r11 +; AVX1-NEXT: sarq $63, %r9 +; AVX1-NEXT: xorq %r9, %rcx +; AVX1-NEXT: subq %r9, %rcx +; AVX1-NEXT: sarq $63, %r8 +; AVX1-NEXT: xorq %r8, %rdx +; AVX1-NEXT: subq %r8, %rdx +; AVX1-NEXT: sarq $63, %rdi +; AVX1-NEXT: xorq %rdi, %rsi +; AVX1-NEXT: subq %rdi, %rsi +; AVX1-NEXT: vmovq %rsi, %xmm0 +; AVX1-NEXT: vmovq %rdx, %xmm1 +; AVX1-NEXT: vmovq %rcx, %xmm2 +; AVX1-NEXT: vmovq %r11, %xmm3 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: 
vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: popq %rbx +; AVX1-NEXT: popq %r12 +; AVX1-NEXT: popq %r13 +; AVX1-NEXT: popq %r14 +; AVX1-NEXT: popq %r15 +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX2-LABEL: abd_ext_v4i64_undef: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %r13 +; AVX2-NEXT: pushq %r12 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: vmovq %xmm0, %r11 +; AVX2-NEXT: movq %r11, %r10 +; AVX2-NEXT: sarq $63, %r10 +; AVX2-NEXT: vpextrq $1, %xmm0, %rcx +; AVX2-NEXT: movq %rcx, %r9 +; AVX2-NEXT: sarq $63, %r9 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rdx +; AVX2-NEXT: movq %rdx, %r8 +; AVX2-NEXT: sarq $63, %r8 +; AVX2-NEXT: vpextrq $1, %xmm0, %rsi +; AVX2-NEXT: movq %rsi, %rdi +; AVX2-NEXT: sarq $63, %rdi +; AVX2-NEXT: vmovq %xmm1, %rbx +; AVX2-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: sarq $63, %rbx +; AVX2-NEXT: vpextrq $1, %xmm1, %r14 +; AVX2-NEXT: movq %r14, %r15 +; AVX2-NEXT: sarq $63, %r15 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %r12 +; AVX2-NEXT: movq %r12, %r13 +; AVX2-NEXT: sarq $63, %r13 +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: movq %rax, %rbp +; AVX2-NEXT: sarq $63, %rbp +; AVX2-NEXT: subq %rax, %rsi +; AVX2-NEXT: sbbq %rbp, %rdi +; AVX2-NEXT: subq %r12, %rdx +; AVX2-NEXT: sbbq %r13, %r8 +; AVX2-NEXT: subq %r14, %rcx +; AVX2-NEXT: sbbq %r15, %r9 +; AVX2-NEXT: subq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload +; AVX2-NEXT: sbbq %rbx, %r10 +; AVX2-NEXT: sarq $63, %r10 +; AVX2-NEXT: xorq %r10, %r11 +; AVX2-NEXT: subq %r10, %r11 +; AVX2-NEXT: sarq $63, %r9 +; AVX2-NEXT: xorq %r9, %rcx +; AVX2-NEXT: subq %r9, %rcx +; AVX2-NEXT: sarq $63, %r8 +; AVX2-NEXT: xorq %r8, %rdx +; AVX2-NEXT: subq %r8, %rdx +; AVX2-NEXT: sarq $63, %rdi +; AVX2-NEXT: xorq %rdi, %rsi +; AVX2-NEXT: subq %rdi, %rsi +; AVX2-NEXT: vmovq %rsi, %xmm0 
+; AVX2-NEXT: vmovq %rdx, %xmm1 +; AVX2-NEXT: vmovq %rcx, %xmm2 +; AVX2-NEXT: vmovq %r11, %xmm3 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r12 +; AVX2-NEXT: popq %r13 +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512-LABEL: abd_ext_v4i64_undef: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: pushq %r15 +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %r13 +; AVX512-NEXT: pushq %r12 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: vmovq %xmm0, %r11 +; AVX512-NEXT: movq %r11, %r10 +; AVX512-NEXT: sarq $63, %r10 +; AVX512-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512-NEXT: movq %rcx, %r9 +; AVX512-NEXT: sarq $63, %r9 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rdx +; AVX512-NEXT: movq %rdx, %r8 +; AVX512-NEXT: sarq $63, %r8 +; AVX512-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512-NEXT: movq %rsi, %rdi +; AVX512-NEXT: sarq $63, %rdi +; AVX512-NEXT: vmovq %xmm1, %rbx +; AVX512-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: sarq $63, %rbx +; AVX512-NEXT: vpextrq $1, %xmm1, %r14 +; AVX512-NEXT: movq %r14, %r15 +; AVX512-NEXT: sarq $63, %r15 +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %r12 +; AVX512-NEXT: movq %r12, %r13 +; AVX512-NEXT: sarq $63, %r13 +; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: movq %rax, %rbp +; AVX512-NEXT: sarq $63, %rbp +; AVX512-NEXT: subq %rax, %rsi +; AVX512-NEXT: sbbq %rbp, %rdi +; AVX512-NEXT: subq %r12, %rdx +; AVX512-NEXT: sbbq %r13, %r8 +; AVX512-NEXT: subq %r14, %rcx +; AVX512-NEXT: sbbq %r15, %r9 +; AVX512-NEXT: subq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload +; AVX512-NEXT: sbbq %rbx, %r10 +; AVX512-NEXT: sarq $63, %r10 +; AVX512-NEXT: xorq %r10, %r11 +; AVX512-NEXT: subq %r10, %r11 +; AVX512-NEXT: sarq $63, %r9 +; 
AVX512-NEXT: xorq %r9, %rcx +; AVX512-NEXT: subq %r9, %rcx +; AVX512-NEXT: sarq $63, %r8 +; AVX512-NEXT: xorq %r8, %rdx +; AVX512-NEXT: subq %r8, %rdx +; AVX512-NEXT: sarq $63, %rdi +; AVX512-NEXT: xorq %rdi, %rsi +; AVX512-NEXT: subq %rdi, %rsi +; AVX512-NEXT: vmovq %rsi, %xmm0 +; AVX512-NEXT: vmovq %rdx, %xmm1 +; AVX512-NEXT: vmovq %rcx, %xmm2 +; AVX512-NEXT: vmovq %r11, %xmm3 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0] +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r12 +; AVX512-NEXT: popq %r13 +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %r15 +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq + %aext = sext <4 x i64> %a to <4 x i128> + %bext = sext <4 x i64> %b to <4 x i128> + %sub = sub <4 x i128> %aext, %bext + %abs = call <4 x i128> @llvm.abs.v4i128(<4 x i128> %sub, i1 true) + %trunc = trunc <4 x i128> %abs to <4 x i64> + ret <4 x i64> %trunc +} + +; +; sub(smax(a,b),smin(a,b)) -> abds(a,b) +; + +define <32 x i8> @abd_minmax_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { +; AVX1-LABEL: abd_minmax_v32i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpminsb %xmm3, %xmm4, %xmm5 +; AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsubb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmaxsb %xmm3, %xmm4, %xmm1 +; AVX1-NEXT: vpsubb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: abd_minmax_v32i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsubb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: abd_minmax_v32i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm2 +; AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpsubb %ymm2, %ymm0, %ymm0 +; AVX512-NEXT: retq + %min = call 
<32 x i8> @llvm.smin.v32i8(<32 x i8> %a, <32 x i8> %b) + %max = call <32 x i8> @llvm.smax.v32i8(<32 x i8> %a, <32 x i8> %b) + %sub = sub <32 x i8> %max, %min + ret <32 x i8> %sub +} + +define <16 x i16> @abd_minmax_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { +; AVX1-LABEL: abd_minmax_v16i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpminsw %xmm3, %xmm4, %xmm5 +; AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsubw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmaxsw %xmm3, %xmm4, %xmm1 +; AVX1-NEXT: vpsubw %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: abd_minmax_v16i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsubw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: abd_minmax_v16i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm2 +; AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpsubw %ymm2, %ymm0, %ymm0 +; AVX512-NEXT: retq + %min = call <16 x i16> @llvm.smin.v16i16(<16 x i16> %a, <16 x i16> %b) + %max = call <16 x i16> @llvm.smax.v16i16(<16 x i16> %a, <16 x i16> %b) + %sub = sub <16 x i16> %max, %min + ret <16 x i16> %sub +} + +define <8 x i32> @abd_minmax_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { +; AVX1-LABEL: abd_minmax_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpminsd %xmm3, %xmm4, %xmm5 +; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmaxsd %xmm3, %xmm4, %xmm1 +; AVX1-NEXT: vpsubd %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: abd_minmax_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpmaxsd %ymm1, 
%ymm0, %ymm0 +; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: abd_minmax_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpminsd %ymm1, %ymm0, %ymm2 +; AVX512-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpsubd %ymm2, %ymm0, %ymm0 +; AVX512-NEXT: retq + %min = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %a, <8 x i32> %b) + %max = call <8 x i32> @llvm.smax.v8i32(<8 x i32> %a, <8 x i32> %b) + %sub = sub <8 x i32> %max, %min + ret <8 x i32> %sub +} + +define <4 x i64> @abd_minmax_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { +; AVX1-LABEL: abd_minmax_v4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm6 +; AVX1-NEXT: vblendvpd %xmm6, %xmm4, %xmm5, %xmm7 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpsubq %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vblendvpd %xmm6, %xmm5, %xmm4, %xmm1 +; AVX1-NEXT: vpsubq %xmm7, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: abd_minmax_v4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm3 +; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpsubq %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: abd_minmax_v4i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpminsq %ymm1, %ymm0, %ymm2 +; AVX512-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpsubq %ymm2, %ymm0, %ymm0 +; AVX512-NEXT: retq + %min = call <4 x i64> @llvm.smin.v4i64(<4 x i64> %a, <4 x i64> %b) + %max = call <4 x i64> @llvm.smax.v4i64(<4 x i64> %a, <4 x i64> %b) + %sub = sub <4 x i64> %max, %min + ret <4 x i64> %sub +} + +; +; abs(sub_nsw(a,b)) -> abds(a,b) +; + +define <32 x i8> @abd_subnsw_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { +; AVX1-LABEL: abd_subnsw_v32i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128
$1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpsubb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpabsb %xmm0, %xmm0 +; AVX1-NEXT: vpabsb %xmm2, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: abd_subnsw_v32i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpabsb %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: abd_subnsw_v32i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsubb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpabsb %ymm0, %ymm0 +; AVX512-NEXT: retq + %sub = sub nsw <32 x i8> %a, %b + %abs = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %sub, i1 false) + ret <32 x i8> %abs +} + +define <16 x i16> @abd_subnsw_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { +; AVX1-LABEL: abd_subnsw_v16i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpsubw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpabsw %xmm0, %xmm0 +; AVX1-NEXT: vpabsw %xmm2, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: abd_subnsw_v16i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpabsw %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: abd_subnsw_v16i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsubw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpabsw %ymm0, %ymm0 +; AVX512-NEXT: retq + %sub = sub nsw <16 x i16> %a, %b + %abs = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %sub, i1 false) + ret <16 x i16> %abs +} + +define <8 x i32> @abd_subnsw_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { +; AVX1-LABEL: abd_subnsw_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpabsd %xmm0, %xmm0 +; AVX1-NEXT: vpabsd %xmm2, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, 
%ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: abd_subnsw_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpabsd %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: abd_subnsw_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpabsd %ymm0, %ymm0 +; AVX512-NEXT: retq + %sub = sub nsw <8 x i32> %a, %b + %abs = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %sub, i1 false) + ret <8 x i32> %abs +} + +define <4 x i64> @abd_subnsw_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { +; AVX1-LABEL: abd_subnsw_v4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsubq %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vblendvpd %ymm1, %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: abd_subnsw_v4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm1 +; AVX2-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: abd_subnsw_v4i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpabsq %ymm0, %ymm0 +; AVX512-NEXT: retq + %sub = sub nsw <4 x i64> %a, %b + %abs = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %sub, i1 false) + ret <4 x i64> %abs +} + +declare <32 x i8> @llvm.abs.v32i8(<32 x i8>, i1) +declare <16 x i16> @llvm.abs.v16i16(<16 x i16>, i1) +declare <8 x i32> @llvm.abs.v8i32(<8 x i32>, i1) +declare <4 x i64> @llvm.abs.v4i64(<4 x i64>, i1) +declare <8 x i64> @llvm.abs.v8i64(<8 x i64>, i1) +declare <16 x i64> @llvm.abs.v16i64(<16 x i64>, i1) +declare <32 x i64> @llvm.abs.v32i64(<32 x i64>, i1) +declare <4 x i128> @llvm.abs.v4i128(<4 x i128>, i1) + +declare 
<32 x i8> @llvm.smax.v32i8(<32 x i8>, <32 x i8>) +declare <16 x i16> @llvm.smax.v16i16(<16 x i16>, <16 x i16>) +declare <8 x i32> @llvm.smax.v8i32(<8 x i32>, <8 x i32>) +declare <4 x i64> @llvm.smax.v4i64(<4 x i64>, <4 x i64>) + +declare <32 x i8> @llvm.smin.v32i8(<32 x i8>, <32 x i8>) +declare <16 x i16> @llvm.smin.v16i16(<16 x i16>, <16 x i16>) +declare <8 x i32> @llvm.smin.v8i32(<8 x i32>, <8 x i32>) +declare <4 x i64> @llvm.smin.v4i64(<4 x i64>, <4 x i64>) diff --git a/llvm/test/CodeGen/X86/abds-vector-512.ll b/llvm/test/CodeGen/X86/abds-vector-512.ll new file mode 100644 index 0000000..0f6d37b --- /dev/null +++ b/llvm/test/CodeGen/X86/abds-vector-512.ll @@ -0,0 +1,768 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ + +; +; trunc(abs(sub(sext(a),sext(b)))) -> abds(a,b) +; + +define <64 x i8> @abd_ext_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { +; AVX512-LABEL: abd_ext_v64i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbw %xmm0, %ymm2 +; AVX512-NEXT: vpmovsxwq %xmm2, %zmm3 +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512-NEXT: vpmovsxwq %xmm2, %zmm2 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512-NEXT: vpmovsxbw %xmm4, %ymm4 +; AVX512-NEXT: vpmovsxwq %xmm4, %zmm5 +; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512-NEXT: vpmovsxwq %xmm4, %zmm4 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512-NEXT: vpmovsxbw %xmm0, %ymm6 +; AVX512-NEXT: vpmovsxwq %xmm6, %zmm7 +; AVX512-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX512-NEXT: vpmovsxwq %xmm6, %zmm6 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512-NEXT: vpmovsxwq %xmm0, %zmm8 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vpmovsxwq %xmm0, %zmm0 +; AVX512-NEXT: vpmovsxbw %xmm1, %ymm9 +; 
AVX512-NEXT: vpmovsxwq %xmm9, %zmm10 +; AVX512-NEXT: vpsubq %zmm10, %zmm3, %zmm3 +; AVX512-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX512-NEXT: vpmovsxwq %xmm9, %zmm9 +; AVX512-NEXT: vpsubq %zmm9, %zmm2, %zmm2 +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm9 +; AVX512-NEXT: vpmovsxbw %xmm9, %ymm9 +; AVX512-NEXT: vpmovsxwq %xmm9, %zmm10 +; AVX512-NEXT: vpsubq %zmm10, %zmm5, %zmm5 +; AVX512-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX512-NEXT: vpmovsxwq %xmm9, %zmm9 +; AVX512-NEXT: vpsubq %zmm9, %zmm4, %zmm4 +; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512-NEXT: vpmovsxbw %xmm1, %ymm9 +; AVX512-NEXT: vpmovsxwq %xmm9, %zmm10 +; AVX512-NEXT: vpsubq %zmm10, %zmm7, %zmm7 +; AVX512-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX512-NEXT: vpmovsxwq %xmm9, %zmm9 +; AVX512-NEXT: vpsubq %zmm9, %zmm6, %zmm6 +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-NEXT: vpmovsxbw %xmm1, %ymm1 +; AVX512-NEXT: vpmovsxwq %xmm1, %zmm9 +; AVX512-NEXT: vpsubq %zmm9, %zmm8, %zmm8 +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-NEXT: vpmovsxwq %xmm1, %zmm1 +; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpabsq %zmm3, %zmm1 +; AVX512-NEXT: vpabsq %zmm2, %zmm2 +; AVX512-NEXT: vpabsq %zmm5, %zmm3 +; AVX512-NEXT: vpabsq %zmm4, %zmm4 +; AVX512-NEXT: vpabsq %zmm7, %zmm5 +; AVX512-NEXT: vpabsq %zmm6, %zmm6 +; AVX512-NEXT: vpabsq %zmm8, %zmm7 +; AVX512-NEXT: vpabsq %zmm0, %zmm0 +; AVX512-NEXT: vpmovqb %zmm0, %xmm0 +; AVX512-NEXT: vpmovqb %zmm7, %xmm7 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm7[0],xmm0[0] +; AVX512-NEXT: vpmovqb %zmm6, %xmm6 +; AVX512-NEXT: vpmovqb %zmm5, %xmm5 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0] +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm5, %ymm0 +; AVX512-NEXT: vpmovqb %zmm4, %xmm4 +; AVX512-NEXT: vpmovqb %zmm3, %xmm3 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; AVX512-NEXT: vpmovqb %zmm2, %xmm2 +; AVX512-NEXT: vpmovqb %zmm1, %xmm1 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512-NEXT: 
vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512-NEXT: retq + %aext = sext <64 x i8> %a to <64 x i64> + %bext = sext <64 x i8> %b to <64 x i64> + %sub = sub <64 x i64> %aext, %bext + %abs = call <64 x i64> @llvm.abs.v64i64(<64 x i64> %sub, i1 false) + %trunc = trunc <64 x i64> %abs to <64 x i8> + ret <64 x i8> %trunc +} + +define <64 x i8> @abd_ext_v64i8_undef(<64 x i8> %a, <64 x i8> %b) nounwind { +; AVX512-LABEL: abd_ext_v64i8_undef: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbw %xmm0, %ymm2 +; AVX512-NEXT: vpmovsxwq %xmm2, %zmm3 +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512-NEXT: vpmovsxwq %xmm2, %zmm2 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512-NEXT: vpmovsxbw %xmm4, %ymm4 +; AVX512-NEXT: vpmovsxwq %xmm4, %zmm5 +; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512-NEXT: vpmovsxwq %xmm4, %zmm4 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512-NEXT: vpmovsxbw %xmm0, %ymm6 +; AVX512-NEXT: vpmovsxwq %xmm6, %zmm7 +; AVX512-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX512-NEXT: vpmovsxwq %xmm6, %zmm6 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512-NEXT: vpmovsxwq %xmm0, %zmm8 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vpmovsxwq %xmm0, %zmm0 +; AVX512-NEXT: vpmovsxbw %xmm1, %ymm9 +; AVX512-NEXT: vpmovsxwq %xmm9, %zmm10 +; AVX512-NEXT: vpsubq %zmm10, %zmm3, %zmm3 +; AVX512-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX512-NEXT: vpmovsxwq %xmm9, %zmm9 +; AVX512-NEXT: vpsubq %zmm9, %zmm2, %zmm2 +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm9 +; AVX512-NEXT: vpmovsxbw %xmm9, %ymm9 +; AVX512-NEXT: vpmovsxwq %xmm9, %zmm10 +; AVX512-NEXT: vpsubq %zmm10, %zmm5, %zmm5 +; AVX512-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX512-NEXT: vpmovsxwq %xmm9, %zmm9 +; AVX512-NEXT: vpsubq %zmm9, %zmm4, %zmm4 +; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512-NEXT: vpmovsxbw %xmm1, %ymm9 +; AVX512-NEXT: vpmovsxwq %xmm9, %zmm10 +; AVX512-NEXT: vpsubq 
%zmm10, %zmm7, %zmm7 +; AVX512-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX512-NEXT: vpmovsxwq %xmm9, %zmm9 +; AVX512-NEXT: vpsubq %zmm9, %zmm6, %zmm6 +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-NEXT: vpmovsxbw %xmm1, %ymm1 +; AVX512-NEXT: vpmovsxwq %xmm1, %zmm9 +; AVX512-NEXT: vpsubq %zmm9, %zmm8, %zmm8 +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-NEXT: vpmovsxwq %xmm1, %zmm1 +; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpabsq %zmm3, %zmm1 +; AVX512-NEXT: vpabsq %zmm2, %zmm2 +; AVX512-NEXT: vpabsq %zmm5, %zmm3 +; AVX512-NEXT: vpabsq %zmm4, %zmm4 +; AVX512-NEXT: vpabsq %zmm7, %zmm5 +; AVX512-NEXT: vpabsq %zmm6, %zmm6 +; AVX512-NEXT: vpabsq %zmm8, %zmm7 +; AVX512-NEXT: vpabsq %zmm0, %zmm0 +; AVX512-NEXT: vpmovqb %zmm0, %xmm0 +; AVX512-NEXT: vpmovqb %zmm7, %xmm7 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm7[0],xmm0[0] +; AVX512-NEXT: vpmovqb %zmm6, %xmm6 +; AVX512-NEXT: vpmovqb %zmm5, %xmm5 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0] +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm5, %ymm0 +; AVX512-NEXT: vpmovqb %zmm4, %xmm4 +; AVX512-NEXT: vpmovqb %zmm3, %xmm3 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; AVX512-NEXT: vpmovqb %zmm2, %xmm2 +; AVX512-NEXT: vpmovqb %zmm1, %xmm1 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512-NEXT: retq + %aext = sext <64 x i8> %a to <64 x i64> + %bext = sext <64 x i8> %b to <64 x i64> + %sub = sub <64 x i64> %aext, %bext + %abs = call <64 x i64> @llvm.abs.v64i64(<64 x i64> %sub, i1 true) + %trunc = trunc <64 x i64> %abs to <64 x i8> + ret <64 x i8> %trunc +} + +define <32 x i16> @abd_ext_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { +; AVX512-LABEL: abd_ext_v32i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512-NEXT: vpmovsxwq %xmm2, %zmm2 +; AVX512-NEXT: vpmovsxwq %xmm0, %zmm3 +; AVX512-NEXT: vextracti64x4 $1, 
%zmm0, %ymm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512-NEXT: vpmovsxwq %xmm4, %zmm4 +; AVX512-NEXT: vpmovsxwq %xmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX512-NEXT: vpmovsxwq %xmm5, %zmm5 +; AVX512-NEXT: vpsubq %zmm5, %zmm2, %zmm2 +; AVX512-NEXT: vpmovsxwq %xmm1, %zmm5 +; AVX512-NEXT: vpsubq %zmm5, %zmm3, %zmm3 +; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX512-NEXT: vpmovsxwq %xmm5, %zmm5 +; AVX512-NEXT: vpsubq %zmm5, %zmm4, %zmm4 +; AVX512-NEXT: vpmovsxwq %xmm1, %zmm1 +; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpabsq %zmm2, %zmm1 +; AVX512-NEXT: vpabsq %zmm3, %zmm2 +; AVX512-NEXT: vpabsq %zmm4, %zmm3 +; AVX512-NEXT: vpabsq %zmm0, %zmm0 +; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vpmovqw %zmm3, %xmm3 +; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; AVX512-NEXT: vpmovqw %zmm2, %xmm2 +; AVX512-NEXT: vpmovqw %zmm1, %xmm1 +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512-NEXT: retq + %aext = sext <32 x i16> %a to <32 x i64> + %bext = sext <32 x i16> %b to <32 x i64> + %sub = sub <32 x i64> %aext, %bext + %abs = call <32 x i64> @llvm.abs.v32i64(<32 x i64> %sub, i1 false) + %trunc = trunc <32 x i64> %abs to <32 x i16> + ret <32 x i16> %trunc +} + +define <32 x i16> @abd_ext_v32i16_undef(<32 x i16> %a, <32 x i16> %b) nounwind { +; AVX512-LABEL: abd_ext_v32i16_undef: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512-NEXT: vpmovsxwq %xmm2, %zmm2 +; AVX512-NEXT: vpmovsxwq %xmm0, %zmm3 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512-NEXT: vpmovsxwq %xmm4, %zmm4 +; AVX512-NEXT: vpmovsxwq %xmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX512-NEXT: vpmovsxwq %xmm5, %zmm5 +; AVX512-NEXT: vpsubq %zmm5, %zmm2, %zmm2 +; AVX512-NEXT: vpmovsxwq %xmm1, %zmm5 +; AVX512-NEXT: vpsubq %zmm5, %zmm3, %zmm3 +; 
AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX512-NEXT: vpmovsxwq %xmm5, %zmm5 +; AVX512-NEXT: vpsubq %zmm5, %zmm4, %zmm4 +; AVX512-NEXT: vpmovsxwq %xmm1, %zmm1 +; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpabsq %zmm2, %zmm1 +; AVX512-NEXT: vpabsq %zmm3, %zmm2 +; AVX512-NEXT: vpabsq %zmm4, %zmm3 +; AVX512-NEXT: vpabsq %zmm0, %zmm0 +; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vpmovqw %zmm3, %xmm3 +; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; AVX512-NEXT: vpmovqw %zmm2, %xmm2 +; AVX512-NEXT: vpmovqw %zmm1, %xmm1 +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512-NEXT: retq + %aext = sext <32 x i16> %a to <32 x i64> + %bext = sext <32 x i16> %b to <32 x i64> + %sub = sub <32 x i64> %aext, %bext + %abs = call <32 x i64> @llvm.abs.v32i64(<32 x i64> %sub, i1 true) + %trunc = trunc <32 x i64> %abs to <32 x i16> + ret <32 x i16> %trunc +} + +define <16 x i32> @abd_ext_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind { +; AVX512-LABEL: abd_ext_v16i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512-NEXT: vpmovsxdq %ymm2, %zmm2 +; AVX512-NEXT: vpmovsxdq %ymm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512-NEXT: vpmovsxdq %ymm3, %zmm3 +; AVX512-NEXT: vpsubq %zmm3, %zmm2, %zmm2 +; AVX512-NEXT: vpmovsxdq %ymm1, %zmm1 +; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpabsq %zmm2, %zmm1 +; AVX512-NEXT: vpabsq %zmm0, %zmm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %aext = sext <16 x i32> %a to <16 x i64> + %bext = sext <16 x i32> %b to <16 x i64> + %sub = sub <16 x i64> %aext, %bext + %abs = call <16 x i64> @llvm.abs.v16i64(<16 x i64> %sub, i1 false) + %trunc = trunc <16 x i64> %abs to <16 x i32> + ret <16 x i32> %trunc +} + +define <16 x i32> @abd_ext_v16i32_undef(<16 x i32> %a, 
<16 x i32> %b) nounwind { +; AVX512-LABEL: abd_ext_v16i32_undef: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512-NEXT: vpmovsxdq %ymm2, %zmm2 +; AVX512-NEXT: vpmovsxdq %ymm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512-NEXT: vpmovsxdq %ymm3, %zmm3 +; AVX512-NEXT: vpsubq %zmm3, %zmm2, %zmm2 +; AVX512-NEXT: vpmovsxdq %ymm1, %zmm1 +; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpabsq %zmm2, %zmm1 +; AVX512-NEXT: vpabsq %zmm0, %zmm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %aext = sext <16 x i32> %a to <16 x i64> + %bext = sext <16 x i32> %b to <16 x i64> + %sub = sub <16 x i64> %aext, %bext + %abs = call <16 x i64> @llvm.abs.v16i64(<16 x i64> %sub, i1 true) + %trunc = trunc <16 x i64> %abs to <16 x i32> + ret <16 x i32> %trunc +} + +define <8 x i64> @abd_ext_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind { +; AVX512-LABEL: abd_ext_v8i64: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: pushq %r15 +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %r13 +; AVX512-NEXT: pushq %r12 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: pushq %rax +; AVX512-NEXT: vmovq %xmm0, %r9 +; AVX512-NEXT: movq %r9, (%rsp) # 8-byte Spill +; AVX512-NEXT: sarq $63, %r9 +; AVX512-NEXT: vpextrq $1, %xmm0, %rbp +; AVX512-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: sarq $63, %rbp +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512-NEXT: vmovq %xmm2, %r13 +; AVX512-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: sarq $63, %r13 +; AVX512-NEXT: vpextrq $1, %xmm2, %r12 +; AVX512-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: sarq $63, %r12 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512-NEXT: vmovq %xmm0, %r15 +; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: sarq $63, %r15 +; AVX512-NEXT: vpextrq $1, %xmm0, 
%r14 +; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: sarq $63, %r14 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rbx +; AVX512-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: sarq $63, %rbx +; AVX512-NEXT: vpextrq $1, %xmm0, %r11 +; AVX512-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: sarq $63, %r11 +; AVX512-NEXT: vmovq %xmm1, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: sarq $63, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: vpextrq $1, %xmm1, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: sarq $63, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: sarq $63, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: sarq $63, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm0 +; AVX512-NEXT: vmovq %xmm0, %rdi +; AVX512-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: sarq $63, %rdi +; AVX512-NEXT: vpextrq $1, %xmm0, %r10 +; AVX512-NEXT: movq %r10, %r8 +; AVX512-NEXT: sarq $63, %r8 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rsi +; AVX512-NEXT: movq %rsi, %rax +; AVX512-NEXT: sarq $63, %rax +; AVX512-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512-NEXT: movq %rdx, %rcx +; AVX512-NEXT: sarq $63, %rcx +; AVX512-NEXT: subq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: sbbq %rcx, %r11 +; AVX512-NEXT: subq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: sbbq 
%rax, %rbx +; AVX512-NEXT: subq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: sbbq %r8, %r14 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX512-NEXT: subq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload +; AVX512-NEXT: sbbq %rdi, %r15 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; AVX512-NEXT: subq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload +; AVX512-NEXT: sbbq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; AVX512-NEXT: subq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload +; AVX512-NEXT: sbbq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; AVX512-NEXT: subq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload +; AVX512-NEXT: sbbq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload +; AVX512-NEXT: movq (%rsp), %rax # 8-byte Reload +; AVX512-NEXT: subq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload +; AVX512-NEXT: sbbq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload +; AVX512-NEXT: sarq $63, %r9 +; AVX512-NEXT: xorq %r9, %rax +; AVX512-NEXT: subq %r9, %rax +; AVX512-NEXT: sarq $63, %rbp +; AVX512-NEXT: xorq %rbp, %rcx +; AVX512-NEXT: subq %rbp, %rcx +; AVX512-NEXT: sarq $63, %r13 +; AVX512-NEXT: xorq %r13, %rdx +; AVX512-NEXT: subq %r13, %rdx +; AVX512-NEXT: sarq $63, %r12 +; AVX512-NEXT: xorq %r12, %rsi +; AVX512-NEXT: subq %r12, %rsi +; AVX512-NEXT: sarq $63, %r15 +; AVX512-NEXT: movq %r8, %rdi +; AVX512-NEXT: xorq %r15, %rdi +; AVX512-NEXT: subq %r15, %rdi +; AVX512-NEXT: sarq $63, %r14 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX512-NEXT: xorq %r14, %r8 +; AVX512-NEXT: subq %r14, %r8 +; AVX512-NEXT: sarq $63, %rbx +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; AVX512-NEXT: xorq %rbx, %r9 +; AVX512-NEXT: subq %rbx, %r9 +; AVX512-NEXT: sarq $63, %r11 +; 
AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; AVX512-NEXT: xorq %r11, %r10 +; AVX512-NEXT: subq %r11, %r10 +; AVX512-NEXT: vmovq %r10, %xmm0 +; AVX512-NEXT: vmovq %r9, %xmm1 +; AVX512-NEXT: vmovq %r8, %xmm2 +; AVX512-NEXT: vmovq %rdi, %xmm3 +; AVX512-NEXT: vmovq %rsi, %xmm4 +; AVX512-NEXT: vmovq %rdx, %xmm5 +; AVX512-NEXT: vmovq %rcx, %xmm6 +; AVX512-NEXT: vmovq %rax, %xmm7 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0] +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm4[0] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm6[0] +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512-NEXT: addq $8, %rsp +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r12 +; AVX512-NEXT: popq %r13 +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %r15 +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq + %aext = sext <8 x i64> %a to <8 x i128> + %bext = sext <8 x i64> %b to <8 x i128> + %sub = sub <8 x i128> %aext, %bext + %abs = call <8 x i128> @llvm.abs.v8i128(<8 x i128> %sub, i1 false) + %trunc = trunc <8 x i128> %abs to <8 x i64> + ret <8 x i64> %trunc +} + +define <8 x i64> @abd_ext_v8i64_undef(<8 x i64> %a, <8 x i64> %b) nounwind { +; AVX512-LABEL: abd_ext_v8i64_undef: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: pushq %r15 +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %r13 +; AVX512-NEXT: pushq %r12 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: pushq %rax +; AVX512-NEXT: vmovq %xmm0, %r9 +; AVX512-NEXT: movq %r9, (%rsp) # 8-byte Spill +; AVX512-NEXT: sarq $63, %r9 +; AVX512-NEXT: vpextrq $1, %xmm0, %rbp +; AVX512-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: sarq $63, %rbp +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512-NEXT: vmovq %xmm2, %r13 +; AVX512-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; 
AVX512-NEXT: sarq $63, %r13 +; AVX512-NEXT: vpextrq $1, %xmm2, %r12 +; AVX512-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: sarq $63, %r12 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512-NEXT: vmovq %xmm0, %r15 +; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: sarq $63, %r15 +; AVX512-NEXT: vpextrq $1, %xmm0, %r14 +; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: sarq $63, %r14 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rbx +; AVX512-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: sarq $63, %rbx +; AVX512-NEXT: vpextrq $1, %xmm0, %r11 +; AVX512-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: sarq $63, %r11 +; AVX512-NEXT: vmovq %xmm1, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: sarq $63, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: vpextrq $1, %xmm1, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: sarq $63, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: sarq $63, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: sarq $63, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm0 +; AVX512-NEXT: vmovq %xmm0, %rdi +; AVX512-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: sarq $63, %rdi +; AVX512-NEXT: vpextrq $1, %xmm0, %r10 +; AVX512-NEXT: movq %r10, %r8 +; AVX512-NEXT: sarq $63, %r8 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: 
vmovq %xmm0, %rsi +; AVX512-NEXT: movq %rsi, %rax +; AVX512-NEXT: sarq $63, %rax +; AVX512-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512-NEXT: movq %rdx, %rcx +; AVX512-NEXT: sarq $63, %rcx +; AVX512-NEXT: subq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: sbbq %rcx, %r11 +; AVX512-NEXT: subq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: sbbq %rax, %rbx +; AVX512-NEXT: subq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: sbbq %r8, %r14 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX512-NEXT: subq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload +; AVX512-NEXT: sbbq %rdi, %r15 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; AVX512-NEXT: subq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload +; AVX512-NEXT: sbbq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; AVX512-NEXT: subq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload +; AVX512-NEXT: sbbq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; AVX512-NEXT: subq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload +; AVX512-NEXT: sbbq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload +; AVX512-NEXT: movq (%rsp), %rax # 8-byte Reload +; AVX512-NEXT: subq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload +; AVX512-NEXT: sbbq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload +; AVX512-NEXT: sarq $63, %r9 +; AVX512-NEXT: xorq %r9, %rax +; AVX512-NEXT: subq %r9, %rax +; AVX512-NEXT: sarq $63, %rbp +; AVX512-NEXT: xorq %rbp, %rcx +; AVX512-NEXT: subq %rbp, %rcx +; AVX512-NEXT: sarq $63, %r13 +; AVX512-NEXT: xorq %r13, %rdx +; AVX512-NEXT: subq %r13, %rdx +; AVX512-NEXT: sarq $63, %r12 +; AVX512-NEXT: xorq %r12, %rsi +; AVX512-NEXT: subq %r12, %rsi +; AVX512-NEXT: sarq $63, %r15 +; AVX512-NEXT: movq %r8, %rdi +; AVX512-NEXT: xorq %r15, %rdi 
+; AVX512-NEXT: subq %r15, %rdi +; AVX512-NEXT: sarq $63, %r14 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX512-NEXT: xorq %r14, %r8 +; AVX512-NEXT: subq %r14, %r8 +; AVX512-NEXT: sarq $63, %rbx +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; AVX512-NEXT: xorq %rbx, %r9 +; AVX512-NEXT: subq %rbx, %r9 +; AVX512-NEXT: sarq $63, %r11 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; AVX512-NEXT: xorq %r11, %r10 +; AVX512-NEXT: subq %r11, %r10 +; AVX512-NEXT: vmovq %r10, %xmm0 +; AVX512-NEXT: vmovq %r9, %xmm1 +; AVX512-NEXT: vmovq %r8, %xmm2 +; AVX512-NEXT: vmovq %rdi, %xmm3 +; AVX512-NEXT: vmovq %rsi, %xmm4 +; AVX512-NEXT: vmovq %rdx, %xmm5 +; AVX512-NEXT: vmovq %rcx, %xmm6 +; AVX512-NEXT: vmovq %rax, %xmm7 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0] +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm4[0] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm6[0] +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512-NEXT: addq $8, %rsp +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r12 +; AVX512-NEXT: popq %r13 +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %r15 +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq + %aext = sext <8 x i64> %a to <8 x i128> + %bext = sext <8 x i64> %b to <8 x i128> + %sub = sub <8 x i128> %aext, %bext + %abs = call <8 x i128> @llvm.abs.v8i128(<8 x i128> %sub, i1 true) + %trunc = trunc <8 x i128> %abs to <8 x i64> + ret <8 x i64> %trunc +} + +; +; sub(smax(a,b),smin(a,b)) -> abds(a,b) +; + +define <64 x i8> @abd_minmax_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { +; AVX512BW-LABEL: abd_minmax_v64i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpminsb %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsubb %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; 
+; AVX512DQ-LABEL: abd_minmax_v64i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpminsb %ymm1, %ymm0, %ymm2 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm4 +; AVX512DQ-NEXT: vpminsb %ymm3, %ymm4, %ymm5 +; AVX512DQ-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpsubb %ymm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmaxsb %ymm3, %ymm4, %ymm1 +; AVX512DQ-NEXT: vpsubb %ymm5, %ymm1, %ymm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: retq + %min = call <64 x i8> @llvm.smin.v64i8(<64 x i8> %a, <64 x i8> %b) + %max = call <64 x i8> @llvm.smax.v64i8(<64 x i8> %a, <64 x i8> %b) + %sub = sub <64 x i8> %max, %min + ret <64 x i8> %sub +} + +define <32 x i16> @abd_minmax_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { +; AVX512BW-LABEL: abd_minmax_v32i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpminsw %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsubw %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: abd_minmax_v32i16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpminsw %ymm1, %ymm0, %ymm2 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm4 +; AVX512DQ-NEXT: vpminsw %ymm3, %ymm4, %ymm5 +; AVX512DQ-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpsubw %ymm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmaxsw %ymm3, %ymm4, %ymm1 +; AVX512DQ-NEXT: vpsubw %ymm5, %ymm1, %ymm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: retq + %min = call <32 x i16> @llvm.smin.v32i16(<32 x i16> %a, <32 x i16> %b) + %max = call <32 x i16> @llvm.smax.v32i16(<32 x i16> %a, <32 x i16> %b) + %sub = sub <32 x i16> %max, %min + ret <32 x i16> %sub +} + +define <16 x i32> @abd_minmax_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind { +; AVX512-LABEL: abd_minmax_v16i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm2 +; AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsubd %zmm2, %zmm0, %zmm0 +; 
AVX512-NEXT: retq + %min = call <16 x i32> @llvm.smin.v16i32(<16 x i32> %a, <16 x i32> %b) + %max = call <16 x i32> @llvm.smax.v16i32(<16 x i32> %a, <16 x i32> %b) + %sub = sub <16 x i32> %max, %min + ret <16 x i32> %sub +} + +define <8 x i64> @abd_minmax_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind { +; AVX512-LABEL: abd_minmax_v8i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpminsq %zmm1, %zmm0, %zmm2 +; AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsubq %zmm2, %zmm0, %zmm0 +; AVX512-NEXT: retq + %min = call <8 x i64> @llvm.smin.v8i64(<8 x i64> %a, <8 x i64> %b) + %max = call <8 x i64> @llvm.smax.v8i64(<8 x i64> %a, <8 x i64> %b) + %sub = sub <8 x i64> %max, %min + ret <8 x i64> %sub +} + +; +; abs(sub_nsw(a,b)) -> abds(a,b) +; + +define <64 x i8> @abd_subnsw_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { +; AVX512BW-LABEL: abd_subnsw_v64i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpsubb %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpabsb %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: abd_subnsw_v64i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512DQ-NEXT: vpsubb %ymm2, %ymm3, %ymm2 +; AVX512DQ-NEXT: vpsubb %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpabsb %ymm0, %ymm0 +; AVX512DQ-NEXT: vpabsb %ymm2, %ymm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: retq + %sub = sub nsw <64 x i8> %a, %b + %abs = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %sub, i1 false) + ret <64 x i8> %abs +} + +define <32 x i16> @abd_subnsw_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { +; AVX512BW-LABEL: abd_subnsw_v32i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpsubw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpabsw %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: abd_subnsw_v32i16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512DQ-NEXT: vpsubw %ymm2, %ymm3, %ymm2 +; AVX512DQ-NEXT: vpsubw 
%ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpabsw %ymm0, %ymm0 +; AVX512DQ-NEXT: vpabsw %ymm2, %ymm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: retq + %sub = sub nsw <32 x i16> %a, %b + %abs = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %sub, i1 false) + ret <32 x i16> %abs +} + +define <16 x i32> @abd_subnsw_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind { +; AVX512-LABEL: abd_subnsw_v16i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpabsd %zmm0, %zmm0 +; AVX512-NEXT: retq + %sub = sub nsw <16 x i32> %a, %b + %abs = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %sub, i1 false) + ret <16 x i32> %abs +} + +define <8 x i64> @abd_subnsw_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind { +; AVX512-LABEL: abd_subnsw_v8i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpabsq %zmm0, %zmm0 +; AVX512-NEXT: retq + %sub = sub nsw <8 x i64> %a, %b + %abs = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %sub, i1 false) + ret <8 x i64> %abs +} + +declare <64 x i8> @llvm.abs.v64i8(<64 x i8>, i1) +declare <32 x i16> @llvm.abs.v32i16(<32 x i16>, i1) +declare <16 x i32> @llvm.abs.v16i32(<16 x i32>, i1) +declare <8 x i64> @llvm.abs.v8i64(<8 x i64>, i1) +declare <16 x i64> @llvm.abs.v16i64(<16 x i64>, i1) +declare <32 x i64> @llvm.abs.v32i64(<32 x i64>, i1) +declare <64 x i64> @llvm.abs.v64i64(<64 x i64>, i1) +declare <8 x i128> @llvm.abs.v8i128(<8 x i128>, i1) + +declare <64 x i8> @llvm.smax.v64i8(<64 x i8>, <64 x i8>) +declare <32 x i16> @llvm.smax.v32i16(<32 x i16>, <32 x i16>) +declare <16 x i32> @llvm.smax.v16i32(<16 x i32>, <16 x i32>) +declare <8 x i64> @llvm.smax.v8i64(<8 x i64>, <8 x i64>) + +declare <64 x i8> @llvm.smin.v64i8(<64 x i8>, <64 x i8>) +declare <32 x i16> @llvm.smin.v32i16(<32 x i16>, <32 x i16>) +declare <16 x i32> @llvm.smin.v16i32(<16 x i32>, <16 x i32>) +declare <8 x i64> @llvm.smin.v8i64(<8 x i64>, <8 x i64>) diff --git a/llvm/test/CodeGen/X86/abdu-vector-128.ll 
b/llvm/test/CodeGen/X86/abdu-vector-128.ll new file mode 100644 index 0000000..ff6d3ab --- /dev/null +++ b/llvm/test/CodeGen/X86/abdu-vector-128.ll @@ -0,0 +1,1668 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s --check-prefixes=SSE,SSE2 +; RUN: llc < %s -mtriple=x86_64-linux -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42 +; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 +; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512 + +; +; trunc(abs(sub(zext(a),zext(b)))) -> abdu(a,b) +; + +define <16 x i8> @abd_ext_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { +; SSE2-LABEL: abd_ext_v16i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pxor %xmm9, %xmm9 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] +; SSE2-NEXT: movdqa %xmm4, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm9[2],xmm3[3],xmm9[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm9[2],xmm4[3],xmm9[3] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15] +; SSE2-NEXT: movdqa %xmm2, %xmm7 +; SSE2-NEXT: punpcklwd {{.*#+}} 
xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; SSE2-NEXT: movdqa %xmm7, %xmm6 +; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] +; SSE2-NEXT: movdqa %xmm2, %xmm8 +; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3] +; SSE2-NEXT: movdqa %xmm1, %xmm10 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; SSE2-NEXT: movdqa %xmm11, %xmm12 +; SSE2-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] +; SSE2-NEXT: psubq %xmm12, %xmm0 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; SSE2-NEXT: psubq %xmm11, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] +; SSE2-NEXT: psubq %xmm11, %xmm5 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; SSE2-NEXT: psubq %xmm10, %xmm4 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] +; SSE2-NEXT: movdqa %xmm1, %xmm10 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] +; SSE2-NEXT: psubq %xmm11, %xmm6 +; SSE2-NEXT: punpckhdq 
{{.*#+}} xmm10 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; SSE2-NEXT: psubq %xmm10, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] +; SSE2-NEXT: movdqa %xmm1, %xmm10 +; SSE2-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; SSE2-NEXT: psubq %xmm10, %xmm8 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; SSE2-NEXT: psubq %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: psubq %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm3 +; SSE2-NEXT: psubq %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm5, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm5 +; SSE2-NEXT: psubq %xmm1, %xmm5 +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: psubq %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm6, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm6 +; SSE2-NEXT: psubq %xmm1, %xmm6 +; SSE2-NEXT: movdqa %xmm7, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm7 +; SSE2-NEXT: psubq %xmm1, %xmm7 +; SSE2-NEXT: movdqa %xmm8, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm8 +; SSE2-NEXT: psubq %xmm1, %xmm8 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: psubq %xmm1, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSE2-NEXT: pand %xmm1, %xmm2 +; 
SSE2-NEXT: pand %xmm1, %xmm8 +; SSE2-NEXT: packuswb %xmm2, %xmm8 +; SSE2-NEXT: pand %xmm1, %xmm7 +; SSE2-NEXT: pand %xmm1, %xmm6 +; SSE2-NEXT: packuswb %xmm7, %xmm6 +; SSE2-NEXT: packuswb %xmm8, %xmm6 +; SSE2-NEXT: pand %xmm1, %xmm4 +; SSE2-NEXT: pand %xmm1, %xmm5 +; SSE2-NEXT: packuswb %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm3, %xmm0 +; SSE2-NEXT: packuswb %xmm5, %xmm0 +; SSE2-NEXT: packuswb %xmm6, %xmm0 +; SSE2-NEXT: retq +; +; SSE42-LABEL: abd_ext_v16i8: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa %xmm0, %xmm2 +; SSE42-NEXT: psrld $16, %xmm2 +; SSE42-NEXT: pmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: pmovzxbq {{.*#+}} xmm9 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] +; SSE42-NEXT: pmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: movdqa %xmm0, %xmm4 +; SSE42-NEXT: psrlq $48, %xmm4 +; SSE42-NEXT: pmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; SSE42-NEXT: pmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: movdqa %xmm0, %xmm6 +; SSE42-NEXT: psrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: pmovzxbq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,zero,zero,zero,zero,xmm6[1],zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] +; SSE42-NEXT: pmovzxbq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,zero,zero,zero,zero,xmm7[1],zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: 
pmovzxbq {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: movdqa %xmm1, %xmm0 +; SSE42-NEXT: psrld $16, %xmm0 +; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: psubq %xmm0, %xmm2 +; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: psubq %xmm0, %xmm9 +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: psubq %xmm0, %xmm3 +; SSE42-NEXT: movdqa %xmm1, %xmm0 +; SSE42-NEXT: psrlq $48, %xmm0 +; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: psubq %xmm0, %xmm5 +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: psubq %xmm0, %xmm4 +; SSE42-NEXT: movdqa %xmm1, %xmm0 +; SSE42-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: psubq %xmm0, %xmm6 +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] +; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: psubq %xmm0, %xmm7 +; SSE42-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: psubq %xmm0, %xmm8 +; SSE42-NEXT: pxor %xmm1, %xmm1 +; SSE42-NEXT: 
pxor %xmm10, %xmm10 +; SSE42-NEXT: psubq %xmm2, %xmm10 +; SSE42-NEXT: movdqa %xmm2, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm10, %xmm2 +; SSE42-NEXT: pxor %xmm10, %xmm10 +; SSE42-NEXT: psubq %xmm9, %xmm10 +; SSE42-NEXT: movdqa %xmm9, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm10, %xmm9 +; SSE42-NEXT: pxor %xmm10, %xmm10 +; SSE42-NEXT: psubq %xmm3, %xmm10 +; SSE42-NEXT: movdqa %xmm3, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm10, %xmm3 +; SSE42-NEXT: pxor %xmm10, %xmm10 +; SSE42-NEXT: psubq %xmm5, %xmm10 +; SSE42-NEXT: movdqa %xmm5, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm10, %xmm5 +; SSE42-NEXT: pxor %xmm10, %xmm10 +; SSE42-NEXT: psubq %xmm4, %xmm10 +; SSE42-NEXT: movdqa %xmm4, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm10, %xmm4 +; SSE42-NEXT: pxor %xmm10, %xmm10 +; SSE42-NEXT: psubq %xmm6, %xmm10 +; SSE42-NEXT: movdqa %xmm6, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm10, %xmm6 +; SSE42-NEXT: pxor %xmm10, %xmm10 +; SSE42-NEXT: psubq %xmm7, %xmm10 +; SSE42-NEXT: movdqa %xmm7, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm10, %xmm7 +; SSE42-NEXT: psubq %xmm8, %xmm1 +; SSE42-NEXT: movdqa %xmm8, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm8 +; SSE42-NEXT: movapd {{.*#+}} xmm0 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSE42-NEXT: andpd %xmm0, %xmm8 +; SSE42-NEXT: andpd %xmm0, %xmm7 +; SSE42-NEXT: packusdw %xmm8, %xmm7 +; SSE42-NEXT: andpd %xmm0, %xmm6 +; SSE42-NEXT: andpd %xmm0, %xmm4 +; SSE42-NEXT: packusdw %xmm6, %xmm4 +; SSE42-NEXT: packusdw %xmm7, %xmm4 +; SSE42-NEXT: andpd %xmm0, %xmm5 +; SSE42-NEXT: andpd %xmm0, %xmm3 +; SSE42-NEXT: packusdw %xmm5, %xmm3 +; SSE42-NEXT: andpd %xmm0, %xmm9 +; SSE42-NEXT: andpd %xmm0, %xmm2 +; SSE42-NEXT: packusdw %xmm2, %xmm9 +; SSE42-NEXT: packusdw %xmm3, %xmm9 +; SSE42-NEXT: packuswb %xmm4, %xmm9 +; SSE42-NEXT: movdqa %xmm9, %xmm0 +; SSE42-NEXT: retq +; +; AVX1-LABEL: abd_ext_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = 
xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[3,3,3,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,1,1] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[2,3,2,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,3,3,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm9 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[2,3,2,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm11, %xmm4, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[3,3,3,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm11, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero +; 
AVX1-NEXT: vpsubq %xmm11, %xmm6, %xmm6 +; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,1,1,1] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm10, %xmm3, %xmm3 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm10 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[2,3,2,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm11, %xmm8, %xmm8 +; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,3,3,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm10, %xmm7, %xmm7 +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm10 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsubq %xmm10, %xmm9, %xmm9 +; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm10 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm9, %ymm11 +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm12 +; AVX1-NEXT: vpsubq %xmm5, %xmm2, %xmm5 +; AVX1-NEXT: vpsubq %xmm4, %xmm2, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-NEXT: vblendvpd %ymm10, %ymm4, %ymm10, %ymm4 +; AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm3 +; AVX1-NEXT: vpsubq %xmm6, %xmm2, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-NEXT: vblendvpd %ymm1, %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vpsubq %xmm7, %xmm2, %xmm3 +; AVX1-NEXT: vpsubq %xmm8, %xmm2, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-NEXT: vblendvpd %ymm12, %ymm3, %ymm12, %ymm3 +; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpsubq %xmm9, %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, 
%ymm0 +; AVX1-NEXT: vblendvpd %ymm11, %ymm0, %ymm11, %ymm0 +; AVX1-NEXT: vmovapd {{.*#+}} ymm2 = [255,255,255,255] +; AVX1-NEXT: vandpd %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vandpd %ymm2, %ymm4, %ymm4 +; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpackusdw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vandpd %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpackusdw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vandpd %ymm2, %ymm3, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: abd_ext_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm5 = 
xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero +; AVX2-NEXT: vpsubq %ymm6, %ymm3, %ymm3 +; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero,xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero +; AVX2-NEXT: vpsubq %ymm5, %ymm4, %ymm4 +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpsubq %ymm3, %ymm2, %ymm5 +; AVX2-NEXT: vblendvpd %ymm3, %ymm5, %ymm3, %ymm3 +; AVX2-NEXT: vpsubq %ymm0, %ymm2, %ymm5 +; AVX2-NEXT: vblendvpd %ymm0, %ymm5, %ymm0, %ymm0 +; AVX2-NEXT: vpsubq %ymm4, %ymm2, %ymm5 +; AVX2-NEXT: vblendvpd %ymm4, %ymm5, %ymm4, %ymm4 +; AVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm2 +; AVX2-NEXT: vblendvpd %ymm1, %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [255,255,255,255] +; AVX2-NEXT: vandpd %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vandpd %ymm2, %ymm4, %ymm4 +; AVX2-NEXT: vpackusdw %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vandpd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vandpd %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpackusdw %ymm1, %ymm0, 
%ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: abd_ext_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVX512-NEXT: vpsubq %zmm3, %zmm2, %zmm2 +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpabsq %zmm2, %zmm1 +; AVX512-NEXT: vpabsq %zmm0, %zmm0 +; AVX512-NEXT: vpmovqb %zmm0, %xmm0 +; AVX512-NEXT: vpmovqb %zmm1, %xmm1 +; 
AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %aext = zext <16 x i8> %a to <16 x i64> + %bext = zext <16 x i8> %b to <16 x i64> + %sub = sub <16 x i64> %aext, %bext + %abs = call <16 x i64> @llvm.abs.v16i64(<16 x i64> %sub, i1 false) + %trunc = trunc <16 x i64> %abs to <16 x i8> + ret <16 x i8> %trunc +} + +define <16 x i8> @abd_ext_v16i8_undef(<16 x i8> %a, <16 x i8> %b) nounwind { +; SSE2-LABEL: abd_ext_v16i8_undef: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pxor %xmm9, %xmm9 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] +; SSE2-NEXT: movdqa %xmm4, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm9[2],xmm3[3],xmm9[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm9[2],xmm4[3],xmm9[3] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15] +; SSE2-NEXT: movdqa %xmm2, %xmm7 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; SSE2-NEXT: movdqa %xmm7, %xmm6 +; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] +; SSE2-NEXT: movdqa %xmm2, 
%xmm8 +; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3] +; SSE2-NEXT: movdqa %xmm1, %xmm10 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; SSE2-NEXT: movdqa %xmm11, %xmm12 +; SSE2-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] +; SSE2-NEXT: psubq %xmm12, %xmm0 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; SSE2-NEXT: psubq %xmm11, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] +; SSE2-NEXT: psubq %xmm11, %xmm5 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; SSE2-NEXT: psubq %xmm10, %xmm4 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] +; SSE2-NEXT: movdqa %xmm1, %xmm10 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] +; SSE2-NEXT: psubq %xmm11, %xmm6 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; SSE2-NEXT: psubq %xmm10, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] +; SSE2-NEXT: movdqa %xmm1, %xmm10 +; SSE2-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; SSE2-NEXT: psubq %xmm10, %xmm8 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = 
xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; SSE2-NEXT: psubq %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: psubq %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm3 +; SSE2-NEXT: psubq %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm5, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm5 +; SSE2-NEXT: psubq %xmm1, %xmm5 +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: psubq %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm6, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm6 +; SSE2-NEXT: psubq %xmm1, %xmm6 +; SSE2-NEXT: movdqa %xmm7, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm7 +; SSE2-NEXT: psubq %xmm1, %xmm7 +; SSE2-NEXT: movdqa %xmm8, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm8 +; SSE2-NEXT: psubq %xmm1, %xmm8 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: psubq %xmm1, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm8 +; SSE2-NEXT: packuswb %xmm2, %xmm8 +; SSE2-NEXT: pand %xmm1, %xmm7 +; SSE2-NEXT: pand %xmm1, %xmm6 +; SSE2-NEXT: packuswb %xmm7, %xmm6 +; SSE2-NEXT: packuswb %xmm8, %xmm6 +; SSE2-NEXT: pand %xmm1, %xmm4 +; SSE2-NEXT: pand %xmm1, %xmm5 +; SSE2-NEXT: packuswb %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm1, %xmm0 +; 
SSE2-NEXT: packuswb %xmm3, %xmm0 +; SSE2-NEXT: packuswb %xmm5, %xmm0 +; SSE2-NEXT: packuswb %xmm6, %xmm0 +; SSE2-NEXT: retq +; +; SSE42-LABEL: abd_ext_v16i8_undef: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa %xmm0, %xmm2 +; SSE42-NEXT: psrld $16, %xmm2 +; SSE42-NEXT: pmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: pmovzxbq {{.*#+}} xmm9 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] +; SSE42-NEXT: pmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: movdqa %xmm0, %xmm4 +; SSE42-NEXT: psrlq $48, %xmm4 +; SSE42-NEXT: pmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; SSE42-NEXT: pmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: movdqa %xmm0, %xmm6 +; SSE42-NEXT: psrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: pmovzxbq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,zero,zero,zero,zero,xmm6[1],zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] +; SSE42-NEXT: pmovzxbq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,zero,zero,zero,zero,xmm7[1],zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: pmovzxbq {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: movdqa %xmm1, %xmm0 +; SSE42-NEXT: psrld $16, %xmm0 +; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: psubq %xmm0, %xmm2 +; SSE42-NEXT: pmovzxbq 
{{.*#+}} xmm0 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: psubq %xmm0, %xmm9 +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: psubq %xmm0, %xmm3 +; SSE42-NEXT: movdqa %xmm1, %xmm0 +; SSE42-NEXT: psrlq $48, %xmm0 +; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: psubq %xmm0, %xmm5 +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: psubq %xmm0, %xmm4 +; SSE42-NEXT: movdqa %xmm1, %xmm0 +; SSE42-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: psubq %xmm0, %xmm6 +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] +; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: psubq %xmm0, %xmm7 +; SSE42-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: psubq %xmm0, %xmm8 +; SSE42-NEXT: pxor %xmm1, %xmm1 +; SSE42-NEXT: pxor %xmm10, %xmm10 +; SSE42-NEXT: psubq %xmm2, %xmm10 +; SSE42-NEXT: movdqa %xmm2, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm10, %xmm2 +; SSE42-NEXT: pxor %xmm10, %xmm10 +; SSE42-NEXT: psubq %xmm9, %xmm10 +; SSE42-NEXT: movdqa %xmm9, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm10, %xmm9 +; SSE42-NEXT: pxor %xmm10, %xmm10 +; SSE42-NEXT: psubq %xmm3, %xmm10 +; 
SSE42-NEXT: movdqa %xmm3, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm10, %xmm3 +; SSE42-NEXT: pxor %xmm10, %xmm10 +; SSE42-NEXT: psubq %xmm5, %xmm10 +; SSE42-NEXT: movdqa %xmm5, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm10, %xmm5 +; SSE42-NEXT: pxor %xmm10, %xmm10 +; SSE42-NEXT: psubq %xmm4, %xmm10 +; SSE42-NEXT: movdqa %xmm4, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm10, %xmm4 +; SSE42-NEXT: pxor %xmm10, %xmm10 +; SSE42-NEXT: psubq %xmm6, %xmm10 +; SSE42-NEXT: movdqa %xmm6, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm10, %xmm6 +; SSE42-NEXT: pxor %xmm10, %xmm10 +; SSE42-NEXT: psubq %xmm7, %xmm10 +; SSE42-NEXT: movdqa %xmm7, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm10, %xmm7 +; SSE42-NEXT: psubq %xmm8, %xmm1 +; SSE42-NEXT: movdqa %xmm8, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm8 +; SSE42-NEXT: movapd {{.*#+}} xmm0 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSE42-NEXT: andpd %xmm0, %xmm8 +; SSE42-NEXT: andpd %xmm0, %xmm7 +; SSE42-NEXT: packusdw %xmm8, %xmm7 +; SSE42-NEXT: andpd %xmm0, %xmm6 +; SSE42-NEXT: andpd %xmm0, %xmm4 +; SSE42-NEXT: packusdw %xmm6, %xmm4 +; SSE42-NEXT: packusdw %xmm7, %xmm4 +; SSE42-NEXT: andpd %xmm0, %xmm5 +; SSE42-NEXT: andpd %xmm0, %xmm3 +; SSE42-NEXT: packusdw %xmm5, %xmm3 +; SSE42-NEXT: andpd %xmm0, %xmm9 +; SSE42-NEXT: andpd %xmm0, %xmm2 +; SSE42-NEXT: packusdw %xmm2, %xmm9 +; SSE42-NEXT: packusdw %xmm3, %xmm9 +; SSE42-NEXT: packuswb %xmm4, %xmm9 +; SSE42-NEXT: movdqa %xmm9, %xmm0 +; SSE42-NEXT: retq +; +; AVX1-LABEL: abd_ext_v16i8_undef: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[3,3,3,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm5 = 
xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,1,1] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[2,3,2,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,3,3,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm9 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[2,3,2,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm11, %xmm4, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[3,3,3,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm11, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm11, %xmm6, %xmm6 +; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,1,1,1] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm10, %xmm3, %xmm3 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm10 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; 
AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[2,3,2,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm11, %xmm8, %xmm8 +; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,3,3,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm10, %xmm7, %xmm7 +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm10 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsubq %xmm10, %xmm9, %xmm9 +; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm10 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm9, %ymm11 +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm12 +; AVX1-NEXT: vpsubq %xmm5, %xmm2, %xmm5 +; AVX1-NEXT: vpsubq %xmm4, %xmm2, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-NEXT: vblendvpd %ymm10, %ymm4, %ymm10, %ymm4 +; AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm3 +; AVX1-NEXT: vpsubq %xmm6, %xmm2, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-NEXT: vblendvpd %ymm1, %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vpsubq %xmm7, %xmm2, %xmm3 +; AVX1-NEXT: vpsubq %xmm8, %xmm2, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-NEXT: vblendvpd %ymm12, %ymm3, %ymm12, %ymm3 +; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpsubq %xmm9, %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT: vblendvpd %ymm11, %ymm0, %ymm11, %ymm0 +; AVX1-NEXT: vmovapd {{.*#+}} ymm2 = [255,255,255,255] +; AVX1-NEXT: vandpd %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vandpd %ymm2, %ymm4, %ymm4 +; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm4, %xmm4 +; 
AVX1-NEXT: vpackusdw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vandpd %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpackusdw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vandpd %ymm2, %ymm3, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: abd_ext_v16i8_undef: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero +; AVX2-NEXT: vpsubq %ymm6, %ymm3, %ymm3 +; AVX2-NEXT: 
vpmovzxbq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero,xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero +; AVX2-NEXT: vpsubq %ymm5, %ymm4, %ymm4 +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpsubq %ymm3, %ymm2, %ymm5 +; AVX2-NEXT: vblendvpd %ymm3, %ymm5, %ymm3, %ymm3 +; AVX2-NEXT: vpsubq %ymm0, %ymm2, %ymm5 +; AVX2-NEXT: vblendvpd %ymm0, %ymm5, %ymm0, %ymm0 +; AVX2-NEXT: vpsubq %ymm4, %ymm2, %ymm5 +; AVX2-NEXT: vblendvpd %ymm4, %ymm5, %ymm4, %ymm4 +; AVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm2 +; AVX2-NEXT: vblendvpd %ymm1, %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [255,255,255,255] +; AVX2-NEXT: vandpd %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vandpd %ymm2, %ymm4, %ymm4 +; AVX2-NEXT: vpackusdw %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vandpd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vandpd %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: abd_ext_v16i8_undef: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVX512-NEXT: vpsubq %zmm3, %zmm2, %zmm2 +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpabsq %zmm2, %zmm1 +; AVX512-NEXT: vpabsq %zmm0, %zmm0 +; AVX512-NEXT: vpmovqb %zmm0, %xmm0 +; AVX512-NEXT: vpmovqb %zmm1, %xmm1 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %aext = zext <16 x i8> %a to <16 x i64> + %bext = zext <16 x i8> %b to <16 x i64> + %sub = sub <16 x i64> %aext, %bext + %abs = call <16 x i64> @llvm.abs.v16i64(<16 x i64> %sub, i1 true) + 
%trunc = trunc <16 x i64> %abs to <16 x i8> + ret <16 x i8> %trunc +} + +define <8 x i16> @abd_ext_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { +; SSE2-LABEL: abd_ext_v8i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; SSE2-NEXT: movdqa %xmm6, %xmm7 +; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; SSE2-NEXT: psubq %xmm7, %xmm3 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; SSE2-NEXT: psubq %xmm6, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; SSE2-NEXT: psubq %xmm6, %xmm4 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE2-NEXT: psubq %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm3 +; SSE2-NEXT: psubq %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: psubq %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd 
{{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: psubq %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: psubq %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,1,0,2,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE2-NEXT: retq +; +; SSE42-LABEL: abd_ext_v8i16: +; SSE42: # %bb.0: +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; SSE42-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; SSE42-NEXT: pmovzxwq {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; SSE42-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE42-NEXT: pmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE42-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; SSE42-NEXT: psubq %xmm0, %xmm2 +; SSE42-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; SSE42-NEXT: psubq %xmm0, %xmm5 +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE42-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; SSE42-NEXT: psubq %xmm0, %xmm3 +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = 
xmm1[3,3,3,3] +; SSE42-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; SSE42-NEXT: psubq %xmm0, %xmm4 +; SSE42-NEXT: pxor %xmm1, %xmm1 +; SSE42-NEXT: pxor %xmm6, %xmm6 +; SSE42-NEXT: psubq %xmm2, %xmm6 +; SSE42-NEXT: movdqa %xmm2, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm2 +; SSE42-NEXT: pxor %xmm6, %xmm6 +; SSE42-NEXT: psubq %xmm5, %xmm6 +; SSE42-NEXT: movdqa %xmm5, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm5 +; SSE42-NEXT: pxor %xmm6, %xmm6 +; SSE42-NEXT: psubq %xmm3, %xmm6 +; SSE42-NEXT: movdqa %xmm3, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm3 +; SSE42-NEXT: pxor %xmm6, %xmm6 +; SSE42-NEXT: psubq %xmm4, %xmm6 +; SSE42-NEXT: movdqa %xmm4, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm4 +; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0],xmm1[1,2,3],xmm4[4],xmm1[5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm1[1,2,3],xmm3[4],xmm1[5,6,7] +; SSE42-NEXT: packusdw %xmm4, %xmm3 +; SSE42-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0],xmm1[1,2,3],xmm5[4],xmm1[5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1,2,3],xmm2[4],xmm1[5,6,7] +; SSE42-NEXT: packusdw %xmm2, %xmm5 +; SSE42-NEXT: packusdw %xmm3, %xmm5 +; SSE42-NEXT: movdqa %xmm5, %xmm0 +; SSE42-NEXT: retq +; +; AVX1-LABEL: abd_ext_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 
= xmm1[3,3,3,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm5 +; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX1-NEXT: vpsubq %xmm3, %xmm6, %xmm3 +; AVX1-NEXT: vpsubq %xmm2, %xmm6, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT: vblendvpd %ymm5, %ymm2, %ymm5, %ymm2 +; AVX1-NEXT: vpsubq %xmm0, %xmm6, %xmm0 +; AVX1-NEXT: vpsubq %xmm4, %xmm6, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-NEXT: vblendvpd %ymm1, %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [65535,65535,65535,65535] +; AVX1-NEXT: vandpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vandpd %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: abd_ext_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; AVX2-NEXT: vpsubq %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = 
xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsubq %ymm2, %ymm1, %ymm3 +; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm3 +; AVX2-NEXT: vblendvpd %ymm0, %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] +; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: abd_ext_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpabsq %zmm0, %zmm0 +; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %aext = zext <8 x i16> %a to <8 x i64> + %bext = zext <8 x i16> %b to <8 x i64> + %sub = sub <8 x i64> %aext, %bext + %abs = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %sub, i1 false) + %trunc = trunc <8 x i64> %abs to <8 x i16> + ret <8 x i16> %trunc +} + +define <8 x i16> @abd_ext_v8i16_undef(<8 x i16> %a, <8 x i16> %b) nounwind { +; SSE2-LABEL: abd_ext_v8i16_undef: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 
= xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; SSE2-NEXT: movdqa %xmm6, %xmm7 +; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; SSE2-NEXT: psubq %xmm7, %xmm3 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; SSE2-NEXT: psubq %xmm6, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; SSE2-NEXT: psubq %xmm6, %xmm4 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE2-NEXT: psubq %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm3 +; SSE2-NEXT: psubq %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: psubq %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: psubq %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: psubq %xmm1, %xmm0 +; SSE2-NEXT: 
pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,1,0,2,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE2-NEXT: retq +; +; SSE42-LABEL: abd_ext_v8i16_undef: +; SSE42: # %bb.0: +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; SSE42-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; SSE42-NEXT: pmovzxwq {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; SSE42-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE42-NEXT: pmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE42-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; SSE42-NEXT: psubq %xmm0, %xmm2 +; SSE42-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; SSE42-NEXT: psubq %xmm0, %xmm5 +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE42-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; SSE42-NEXT: psubq %xmm0, %xmm3 +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] +; SSE42-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; SSE42-NEXT: psubq %xmm0, %xmm4 +; SSE42-NEXT: pxor %xmm1, %xmm1 +; SSE42-NEXT: pxor %xmm6, %xmm6 +; SSE42-NEXT: psubq %xmm2, %xmm6 +; SSE42-NEXT: movdqa %xmm2, %xmm0 +; SSE42-NEXT: blendvpd 
%xmm0, %xmm6, %xmm2 +; SSE42-NEXT: pxor %xmm6, %xmm6 +; SSE42-NEXT: psubq %xmm5, %xmm6 +; SSE42-NEXT: movdqa %xmm5, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm5 +; SSE42-NEXT: pxor %xmm6, %xmm6 +; SSE42-NEXT: psubq %xmm3, %xmm6 +; SSE42-NEXT: movdqa %xmm3, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm3 +; SSE42-NEXT: pxor %xmm6, %xmm6 +; SSE42-NEXT: psubq %xmm4, %xmm6 +; SSE42-NEXT: movdqa %xmm4, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm4 +; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0],xmm1[1,2,3],xmm4[4],xmm1[5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm1[1,2,3],xmm3[4],xmm1[5,6,7] +; SSE42-NEXT: packusdw %xmm4, %xmm3 +; SSE42-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0],xmm1[1,2,3],xmm5[4],xmm1[5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1,2,3],xmm2[4],xmm1[5,6,7] +; SSE42-NEXT: packusdw %xmm2, %xmm5 +; SSE42-NEXT: packusdw %xmm3, %xmm5 +; SSE42-NEXT: movdqa %xmm5, %xmm0 +; SSE42-NEXT: retq +; +; AVX1-LABEL: abd_ext_v8i16_undef: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[3,3,3,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: 
vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm5 +; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX1-NEXT: vpsubq %xmm3, %xmm6, %xmm3 +; AVX1-NEXT: vpsubq %xmm2, %xmm6, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT: vblendvpd %ymm5, %ymm2, %ymm5, %ymm2 +; AVX1-NEXT: vpsubq %xmm0, %xmm6, %xmm0 +; AVX1-NEXT: vpsubq %xmm4, %xmm6, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-NEXT: vblendvpd %ymm1, %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [65535,65535,65535,65535] +; AVX1-NEXT: vandpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vandpd %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: abd_ext_v8i16_undef: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; AVX2-NEXT: vpsubq %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsubq %ymm2, %ymm1, %ymm3 +; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: 
vpsubq %ymm0, %ymm1, %ymm3 +; AVX2-NEXT: vblendvpd %ymm0, %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] +; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: abd_ext_v8i16_undef: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpabsq %zmm0, %zmm0 +; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %aext = zext <8 x i16> %a to <8 x i64> + %bext = zext <8 x i16> %b to <8 x i64> + %sub = sub <8 x i64> %aext, %bext + %abs = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %sub, i1 true) + %trunc = trunc <8 x i64> %abs to <8 x i16> + ret <8 x i16> %trunc +} + +define <4 x i32> @abd_ext_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { +; SSE2-LABEL: abd_ext_v4i32: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE2-NEXT: psubq %xmm4, %xmm3 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = 
xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: psubq %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm3 +; SSE2-NEXT: psubq %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: psubq %xmm1, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] +; SSE2-NEXT: retq +; +; SSE42-LABEL: abd_ext_v4i32: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa %xmm0, %xmm2 +; SSE42-NEXT: pxor %xmm4, %xmm4 +; SSE42-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero +; SSE42-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE42-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; SSE42-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE42-NEXT: psubq %xmm1, %xmm2 +; SSE42-NEXT: psubq %xmm0, %xmm3 +; SSE42-NEXT: pxor %xmm1, %xmm1 +; SSE42-NEXT: psubq %xmm2, %xmm1 +; SSE42-NEXT: movdqa %xmm2, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; SSE42-NEXT: psubq %xmm3, %xmm4 +; SSE42-NEXT: movdqa %xmm3, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm3 +; SSE42-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm2[0,2] +; SSE42-NEXT: movaps %xmm3, %xmm0 +; SSE42-NEXT: retq +; +; AVX1-LABEL: abd_ext_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX1-NEXT: vpsubq %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 +; AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm3 +; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-NEXT: vblendvpd 
%ymm1, %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: abd_ext_v4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm1 +; AVX2-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: abd_ext_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX512-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpabsq %ymm0, %ymm0 +; AVX512-NEXT: vpmovqd %ymm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %aext = zext <4 x i32> %a to <4 x i64> + %bext = zext <4 x i32> %b to <4 x i64> + %sub = sub <4 x i64> %aext, %bext + %abs = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %sub, i1 false) + %trunc = trunc <4 x i64> %abs to <4 x i32> + ret <4 x i32> %trunc +} + +define <4 x i32> @abd_ext_v4i32_undef(<4 x i32> %a, <4 x i32> %b) nounwind { +; SSE2-LABEL: abd_ext_v4i32_undef: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE2-NEXT: psubq %xmm4, %xmm3 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: psubq %xmm1, %xmm0 +; SSE2-NEXT: movdqa 
%xmm3, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm3 +; SSE2-NEXT: psubq %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: psubq %xmm1, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] +; SSE2-NEXT: retq +; +; SSE42-LABEL: abd_ext_v4i32_undef: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa %xmm0, %xmm2 +; SSE42-NEXT: pxor %xmm4, %xmm4 +; SSE42-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero +; SSE42-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE42-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; SSE42-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE42-NEXT: psubq %xmm1, %xmm2 +; SSE42-NEXT: psubq %xmm0, %xmm3 +; SSE42-NEXT: pxor %xmm1, %xmm1 +; SSE42-NEXT: psubq %xmm2, %xmm1 +; SSE42-NEXT: movdqa %xmm2, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; SSE42-NEXT: psubq %xmm3, %xmm4 +; SSE42-NEXT: movdqa %xmm3, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm3 +; SSE42-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm2[0,2] +; SSE42-NEXT: movaps %xmm3, %xmm0 +; SSE42-NEXT: retq +; +; AVX1-LABEL: abd_ext_v4i32_undef: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX1-NEXT: vpsubq %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 +; AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm3 +; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-NEXT: vblendvpd %ymm1, %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; 
AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: abd_ext_v4i32_undef: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm1 +; AVX2-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: abd_ext_v4i32_undef: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX512-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpabsq %ymm0, %ymm0 +; AVX512-NEXT: vpmovqd %ymm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %aext = zext <4 x i32> %a to <4 x i64> + %bext = zext <4 x i32> %b to <4 x i64> + %sub = sub <4 x i64> %aext, %bext + %abs = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %sub, i1 true) + %trunc = trunc <4 x i64> %abs to <4 x i32> + ret <4 x i32> %trunc +} + +define <2 x i64> @abd_ext_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { +; SSE2-LABEL: abd_ext_v2i64: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: movq %xmm0, %rcx +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE2-NEXT: movq %xmm0, %rdx +; SSE2-NEXT: movq %xmm1, %rsi +; SSE2-NEXT: xorl %edi, %edi +; SSE2-NEXT: subq %rsi, %rcx +; SSE2-NEXT: movl $0, %esi +; SSE2-NEXT: sbbq %rsi, %rsi +; SSE2-NEXT: subq %rdx, %rax +; SSE2-NEXT: sbbq %rdi, %rdi +; SSE2-NEXT: sarq $63, %rdi +; SSE2-NEXT: xorq %rdi, %rax +; SSE2-NEXT: subq %rdi, %rax +; SSE2-NEXT: sarq $63, %rsi +; SSE2-NEXT: xorq %rsi, %rcx 
+; SSE2-NEXT: subq %rsi, %rcx +; SSE2-NEXT: movq %rcx, %xmm0 +; SSE2-NEXT: movq %rax, %xmm1 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: retq +; +; SSE42-LABEL: abd_ext_v2i64: +; SSE42: # %bb.0: +; SSE42-NEXT: movq %xmm0, %rax +; SSE42-NEXT: pextrq $1, %xmm0, %rcx +; SSE42-NEXT: movq %xmm1, %rdx +; SSE42-NEXT: pextrq $1, %xmm1, %rsi +; SSE42-NEXT: xorl %edi, %edi +; SSE42-NEXT: subq %rsi, %rcx +; SSE42-NEXT: movl $0, %esi +; SSE42-NEXT: sbbq %rsi, %rsi +; SSE42-NEXT: subq %rdx, %rax +; SSE42-NEXT: sbbq %rdi, %rdi +; SSE42-NEXT: sarq $63, %rdi +; SSE42-NEXT: xorq %rdi, %rax +; SSE42-NEXT: subq %rdi, %rax +; SSE42-NEXT: sarq $63, %rsi +; SSE42-NEXT: xorq %rsi, %rcx +; SSE42-NEXT: subq %rsi, %rcx +; SSE42-NEXT: movq %rcx, %xmm1 +; SSE42-NEXT: movq %rax, %xmm0 +; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE42-NEXT: retq +; +; AVX-LABEL: abd_ext_v2i64: +; AVX: # %bb.0: +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: vpextrq $1, %xmm0, %rcx +; AVX-NEXT: vmovq %xmm1, %rdx +; AVX-NEXT: vpextrq $1, %xmm1, %rsi +; AVX-NEXT: xorl %edi, %edi +; AVX-NEXT: subq %rsi, %rcx +; AVX-NEXT: movl $0, %esi +; AVX-NEXT: sbbq %rsi, %rsi +; AVX-NEXT: subq %rdx, %rax +; AVX-NEXT: sbbq %rdi, %rdi +; AVX-NEXT: sarq $63, %rdi +; AVX-NEXT: xorq %rdi, %rax +; AVX-NEXT: subq %rdi, %rax +; AVX-NEXT: sarq $63, %rsi +; AVX-NEXT: xorq %rsi, %rcx +; AVX-NEXT: subq %rsi, %rcx +; AVX-NEXT: vmovq %rcx, %xmm0 +; AVX-NEXT: vmovq %rax, %xmm1 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX-NEXT: retq + %aext = zext <2 x i64> %a to <2 x i128> + %bext = zext <2 x i64> %b to <2 x i128> + %sub = sub <2 x i128> %aext, %bext + %abs = call <2 x i128> @llvm.abs.v2i128(<2 x i128> %sub, i1 false) + %trunc = trunc <2 x i128> %abs to <2 x i64> + ret <2 x i64> %trunc +} + +define <2 x i64> @abd_ext_v2i64_undef(<2 x i64> %a, <2 x i64> %b) nounwind { +; SSE2-LABEL: abd_ext_v2i64_undef: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE2-NEXT: 
movq %xmm2, %rax +; SSE2-NEXT: movq %xmm0, %rcx +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE2-NEXT: movq %xmm0, %rdx +; SSE2-NEXT: movq %xmm1, %rsi +; SSE2-NEXT: xorl %edi, %edi +; SSE2-NEXT: subq %rsi, %rcx +; SSE2-NEXT: movl $0, %esi +; SSE2-NEXT: sbbq %rsi, %rsi +; SSE2-NEXT: subq %rdx, %rax +; SSE2-NEXT: sbbq %rdi, %rdi +; SSE2-NEXT: sarq $63, %rdi +; SSE2-NEXT: xorq %rdi, %rax +; SSE2-NEXT: subq %rdi, %rax +; SSE2-NEXT: sarq $63, %rsi +; SSE2-NEXT: xorq %rsi, %rcx +; SSE2-NEXT: subq %rsi, %rcx +; SSE2-NEXT: movq %rcx, %xmm0 +; SSE2-NEXT: movq %rax, %xmm1 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: retq +; +; SSE42-LABEL: abd_ext_v2i64_undef: +; SSE42: # %bb.0: +; SSE42-NEXT: movq %xmm0, %rax +; SSE42-NEXT: pextrq $1, %xmm0, %rcx +; SSE42-NEXT: movq %xmm1, %rdx +; SSE42-NEXT: pextrq $1, %xmm1, %rsi +; SSE42-NEXT: xorl %edi, %edi +; SSE42-NEXT: subq %rsi, %rcx +; SSE42-NEXT: movl $0, %esi +; SSE42-NEXT: sbbq %rsi, %rsi +; SSE42-NEXT: subq %rdx, %rax +; SSE42-NEXT: sbbq %rdi, %rdi +; SSE42-NEXT: sarq $63, %rdi +; SSE42-NEXT: xorq %rdi, %rax +; SSE42-NEXT: subq %rdi, %rax +; SSE42-NEXT: sarq $63, %rsi +; SSE42-NEXT: xorq %rsi, %rcx +; SSE42-NEXT: subq %rsi, %rcx +; SSE42-NEXT: movq %rcx, %xmm1 +; SSE42-NEXT: movq %rax, %xmm0 +; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE42-NEXT: retq +; +; AVX-LABEL: abd_ext_v2i64_undef: +; AVX: # %bb.0: +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: vpextrq $1, %xmm0, %rcx +; AVX-NEXT: vmovq %xmm1, %rdx +; AVX-NEXT: vpextrq $1, %xmm1, %rsi +; AVX-NEXT: xorl %edi, %edi +; AVX-NEXT: subq %rsi, %rcx +; AVX-NEXT: movl $0, %esi +; AVX-NEXT: sbbq %rsi, %rsi +; AVX-NEXT: subq %rdx, %rax +; AVX-NEXT: sbbq %rdi, %rdi +; AVX-NEXT: sarq $63, %rdi +; AVX-NEXT: xorq %rdi, %rax +; AVX-NEXT: subq %rdi, %rax +; AVX-NEXT: sarq $63, %rsi +; AVX-NEXT: xorq %rsi, %rcx +; AVX-NEXT: subq %rsi, %rcx +; AVX-NEXT: vmovq %rcx, %xmm0 +; AVX-NEXT: vmovq %rax, %xmm1 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 
= xmm1[0],xmm0[0] +; AVX-NEXT: retq + %aext = zext <2 x i64> %a to <2 x i128> + %bext = zext <2 x i64> %b to <2 x i128> + %sub = sub <2 x i128> %aext, %bext + %abs = call <2 x i128> @llvm.abs.v2i128(<2 x i128> %sub, i1 true) + %trunc = trunc <2 x i128> %abs to <2 x i64> + ret <2 x i64> %trunc +} + +; +; sub(umax(a,b),umin(a,b)) -> abdu(a,b) +; + +define <16 x i8> @abd_minmax_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { +; SSE-LABEL: abd_minmax_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pminub %xmm1, %xmm2 +; SSE-NEXT: pmaxub %xmm1, %xmm0 +; SSE-NEXT: psubb %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: abd_minmax_v16i8: +; AVX: # %bb.0: +; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsubb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: retq + %min = call <16 x i8> @llvm.umin.v16i8(<16 x i8> %a, <16 x i8> %b) + %max = call <16 x i8> @llvm.umax.v16i8(<16 x i8> %a, <16 x i8> %b) + %sub = sub <16 x i8> %max, %min + ret <16 x i8> %sub +} + +define <8 x i16> @abd_minmax_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { +; SSE2-LABEL: abd_minmax_v8i16: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: psubusw %xmm1, %xmm2 +; SSE2-NEXT: psubusw %xmm0, %xmm1 +; SSE2-NEXT: psubw %xmm0, %xmm2 +; SSE2-NEXT: paddw %xmm1, %xmm2 +; SSE2-NEXT: paddw %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE42-LABEL: abd_minmax_v8i16: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa %xmm0, %xmm2 +; SSE42-NEXT: pminuw %xmm1, %xmm2 +; SSE42-NEXT: pmaxuw %xmm1, %xmm0 +; SSE42-NEXT: psubw %xmm2, %xmm0 +; SSE42-NEXT: retq +; +; AVX-LABEL: abd_minmax_v8i16: +; AVX: # %bb.0: +; AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsubw %xmm2, %xmm0, %xmm0 +; AVX-NEXT: retq + %min = call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a, <8 x i16> %b) + %max = call <8 x i16> @llvm.umax.v8i16(<8 x i16> %a, <8 x i16> %b) + %sub = sub <8 x i16> %max, %min + ret <8 x i16> %sub +} + +define <4 x i32> 
@abd_minmax_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { +; SSE2-LABEL: abd_minmax_v4i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: pandn %xmm1, %xmm4 +; SSE2-NEXT: por %xmm5, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm3 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: psubd %xmm4, %xmm0 +; SSE2-NEXT: retq +; +; SSE42-LABEL: abd_minmax_v4i32: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa %xmm0, %xmm2 +; SSE42-NEXT: pminud %xmm1, %xmm2 +; SSE42-NEXT: pmaxud %xmm1, %xmm0 +; SSE42-NEXT: psubd %xmm2, %xmm0 +; SSE42-NEXT: retq +; +; AVX-LABEL: abd_minmax_v4i32: +; AVX: # %bb.0: +; AVX-NEXT: vpminud %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsubd %xmm2, %xmm0, %xmm0 +; AVX-NEXT: retq + %min = call <4 x i32> @llvm.umin.v4i32(<4 x i32> %a, <4 x i32> %b) + %max = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %a, <4 x i32> %b) + %sub = sub <4 x i32> %max, %min + ret <4 x i32> %sub +} + +define <2 x i64> @abd_minmax_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { +; SSE2-LABEL: abd_minmax_v2i64: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: movdqa %xmm3, %xmm6 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm4 +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: pand %xmm4, %xmm5 
+; SSE2-NEXT: pandn %xmm1, %xmm4 +; SSE2-NEXT: por %xmm5, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; SSE2-NEXT: pand %xmm6, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm3 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: psubq %xmm4, %xmm0 +; SSE2-NEXT: retq +; +; SSE42-LABEL: abd_minmax_v2i64: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa %xmm0, %xmm2 +; SSE42-NEXT: movdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] +; SSE42-NEXT: movdqa %xmm0, %xmm3 +; SSE42-NEXT: pxor %xmm4, %xmm3 +; SSE42-NEXT: pxor %xmm1, %xmm4 +; SSE42-NEXT: movdqa %xmm4, %xmm0 +; SSE42-NEXT: pcmpgtq %xmm3, %xmm0 +; SSE42-NEXT: movdqa %xmm1, %xmm5 +; SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm5 +; SSE42-NEXT: pcmpgtq %xmm4, %xmm3 +; SSE42-NEXT: movdqa %xmm3, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; SSE42-NEXT: psubq %xmm5, %xmm1 +; SSE42-NEXT: movdqa %xmm1, %xmm0 +; SSE42-NEXT: retq +; +; AVX1-LABEL: abd_minmax_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm1, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpsubq %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: abd_minmax_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 +; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm4 +; AVX2-NEXT: vblendvpd %xmm4, %xmm0, %xmm1, %xmm4 +; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpsubq %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: abd_minmax_v2i64: 
+; AVX512: # %bb.0: +; AVX512-NEXT: vpminuq %xmm1, %xmm0, %xmm2 +; AVX512-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsubq %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: retq + %min = call <2 x i64> @llvm.umin.v2i64(<2 x i64> %a, <2 x i64> %b) + %max = call <2 x i64> @llvm.umax.v2i64(<2 x i64> %a, <2 x i64> %b) + %sub = sub <2 x i64> %max, %min + ret <2 x i64> %sub +} + +declare <16 x i8> @llvm.abs.v16i8(<16 x i8>, i1) +declare <8 x i16> @llvm.abs.v8i16(<8 x i16>, i1) +declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1) +declare <2 x i64> @llvm.abs.v2i64(<2 x i64>, i1) +declare <4 x i64> @llvm.abs.v4i64(<4 x i64>, i1) +declare <8 x i64> @llvm.abs.v8i64(<8 x i64>, i1) +declare <16 x i64> @llvm.abs.v16i64(<16 x i64>, i1) +declare <2 x i128> @llvm.abs.v2i128(<2 x i128>, i1) + +declare <16 x i8> @llvm.umax.v16i8(<16 x i8>, <16 x i8>) +declare <8 x i16> @llvm.umax.v8i16(<8 x i16>, <8 x i16>) +declare <4 x i32> @llvm.umax.v4i32(<4 x i32>, <4 x i32>) +declare <2 x i64> @llvm.umax.v2i64(<2 x i64>, <2 x i64>) + +declare <16 x i8> @llvm.umin.v16i8(<16 x i8>, <16 x i8>) +declare <8 x i16> @llvm.umin.v8i16(<8 x i16>, <8 x i16>) +declare <4 x i32> @llvm.umin.v4i32(<4 x i32>, <4 x i32>) +declare <2 x i64> @llvm.umin.v2i64(<2 x i64>, <2 x i64>) diff --git a/llvm/test/CodeGen/X86/abdu-vector-256.ll b/llvm/test/CodeGen/X86/abdu-vector-256.ll new file mode 100644 index 0000000..fa9ccd0 --- /dev/null +++ b/llvm/test/CodeGen/X86/abdu-vector-256.ll @@ -0,0 +1,1614 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx | FileCheck %s --check-prefixes=AVX1 +; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2 +; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512 + +; +; trunc(abs(sub(zext(a),zext(b)))) -> abdu(a,b) +; + +define <32 x i8> @abd_ext_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { +; AVX1-LABEL: abd_ext_v32i8: +; AVX1: # %bb.0: +; 
AVX1-NEXT: pushq %rax +; AVX1-NEXT: vmovdqa %ymm0, %ymm14 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm14[8],xmm2[8],xmm14[9],xmm2[9],xmm14[10],xmm2[10],xmm14[11],xmm2[11],xmm14[12],xmm2[12],xmm14[13],xmm2[13],xmm14[14],xmm2[14],xmm14[15],xmm2[15] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-NEXT: vextractf128 $1, %ymm14, %xmm3 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm14[0],zero,xmm14[1],zero,xmm14[2],zero,xmm14[3],zero,xmm14[4],zero,xmm14[5],zero,xmm14[6],zero,xmm14[7],zero +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; AVX1-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,3,2,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero +; 
AVX1-NEXT: vpsubq %xmm13, %xmm5, %xmm5 +; AVX1-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[3,3,3,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm5, %xmm9, %xmm9 +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm5, %xmm12, %xmm5 +; AVX1-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm0, %xmm8, %xmm8 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; AVX1-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[2,3,2,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm12, %xmm7, %xmm7 +; AVX1-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[3,3,3,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm12, %xmm11, %xmm7 +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm11, %xmm6, %xmm6 +; AVX1-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm4[2,3,2,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,1,1] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm5, %xmm10, %xmm12 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; 
AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm5[2,3,2,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm10, %xmm11, %xmm11 +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,3,3,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm14[0],zero,zero,zero,zero,zero,zero,zero,xmm14[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm10 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsubq %xmm10, %xmm5, %xmm10 +; AVX1-NEXT: vpsrld $16, %xmm14, %xmm5 +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,zero,zero,zero,zero,xmm5[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsubq %xmm1, %xmm5, %xmm15 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm14 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm6, %xmm14, %xmm14 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3] +; AVX1-NEXT: vpmovzxwq 
{{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm5, %xmm1, %xmm13 +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsubq %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vpsrld $16, %xmm3, %xmm3 +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsubq %xmm0, %xmm3, %xmm3 +; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm1 +; AVX1-NEXT: vpsubq %xmm9, %xmm2, %xmm6 +; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm9 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm9, %ymm6 +; AVX1-NEXT: vblendvpd %ymm1, %ymm6, %ymm1, %ymm0 +; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm6 +; AVX1-NEXT: vpsubq %xmm8, %xmm2, %xmm8 +; AVX1-NEXT: vpsubq %xmm9, %xmm2, %xmm9 +; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 +; AVX1-NEXT: vblendvpd %ymm6, %ymm8, %ymm6, %ymm8 +; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm6 +; AVX1-NEXT: vpsubq %xmm7, %xmm2, %xmm9 +; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm7 +; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7 +; AVX1-NEXT: vblendvpd %ymm6, %ymm7, %ymm6, %ymm7 +; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-NEXT: vinsertf128 $1, %xmm12, %ymm1, %ymm6 +; AVX1-NEXT: vpsubq %xmm12, %xmm2, %xmm9 +; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm12 +; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm12, %ymm9 +; AVX1-NEXT: vblendvpd %ymm6, %ymm9, %ymm6, %ymm9 +; 
AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm11, %ymm6 +; AVX1-NEXT: vpsubq %xmm4, %xmm2, %xmm4 +; AVX1-NEXT: vpsubq %xmm11, %xmm2, %xmm11 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm11, %ymm4 +; AVX1-NEXT: vblendvpd %ymm6, %ymm4, %ymm6, %ymm4 +; AVX1-NEXT: vinsertf128 $1, %xmm15, %ymm10, %ymm6 +; AVX1-NEXT: vpsubq %xmm15, %xmm2, %xmm11 +; AVX1-NEXT: vpsubq %xmm10, %xmm2, %xmm10 +; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm10 +; AVX1-NEXT: vblendvpd %ymm6, %ymm10, %ymm6, %ymm6 +; AVX1-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm10 +; AVX1-NEXT: vpsubq %xmm13, %xmm2, %xmm1 +; AVX1-NEXT: vpsubq %xmm14, %xmm2, %xmm11 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm11, %ymm1 +; AVX1-NEXT: vblendvpd %ymm10, %ymm1, %ymm10, %ymm10 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm1 +; AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm3 +; AVX1-NEXT: vpsubq %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm1, %ymm2 +; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [255,255,255,255] +; AVX1-NEXT: vandpd %ymm1, %ymm9, %ymm3 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vandpd %ymm1, %ymm7, %ymm5 +; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm7 +; AVX1-NEXT: vpackusdw %xmm7, %xmm5, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vandpd %ymm1, %ymm2, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vandpd %ymm1, %ymm10, %ymm5 +; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm7 +; AVX1-NEXT: vpackusdw %xmm7, %xmm5, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandpd %ymm1, %ymm8, %ymm3 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vandpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vandpd %ymm1, %ymm6, 
%ymm3 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vandpd %ymm1, %ymm4, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpackusdw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: popq %rax +; AVX1-NEXT: retq +; +; AVX2-LABEL: abd_ext_v32i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm4 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero +; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm0 = 
xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero,xmm10[2],zero,zero,zero,xmm10[3],zero,zero,zero +; AVX2-NEXT: vpsubq %ymm10, %ymm3, %ymm3 +; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm10 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero,xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpsubq %ymm10, %ymm4, %ymm4 +; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero,xmm10[2],zero,zero,zero,xmm10[3],zero,zero,zero +; AVX2-NEXT: vpsubq %ymm10, %ymm5, %ymm5 +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero +; AVX2-NEXT: vpsubq %ymm9, %ymm2, %ymm2 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm9 = 
xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero,xmm10[2],zero,zero,zero,xmm10[3],zero,zero,zero +; AVX2-NEXT: vpsubq %ymm10, %ymm7, %ymm7 +; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero,xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero +; AVX2-NEXT: vpsubq %ymm9, %ymm8, %ymm8 +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX2-NEXT: vpsubq %ymm1, %ymm6, %ymm1 +; AVX2-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX2-NEXT: vpsubq %ymm3, %ymm6, %ymm9 +; AVX2-NEXT: vblendvpd %ymm3, %ymm9, %ymm3, %ymm3 +; AVX2-NEXT: vpsubq %ymm4, %ymm6, %ymm9 +; AVX2-NEXT: vblendvpd %ymm4, %ymm9, %ymm4, %ymm4 +; AVX2-NEXT: vpsubq %ymm5, %ymm6, %ymm9 +; AVX2-NEXT: vblendvpd %ymm5, %ymm9, %ymm5, %ymm5 +; AVX2-NEXT: vpsubq %ymm2, %ymm6, %ymm9 +; AVX2-NEXT: vblendvpd %ymm2, %ymm9, %ymm2, %ymm2 +; AVX2-NEXT: vpsubq %ymm7, %ymm6, %ymm9 +; AVX2-NEXT: vblendvpd %ymm7, %ymm9, %ymm7, %ymm7 +; AVX2-NEXT: vpsubq %ymm0, %ymm6, %ymm9 +; AVX2-NEXT: vblendvpd %ymm0, %ymm9, %ymm0, %ymm0 +; AVX2-NEXT: vpsubq %ymm8, %ymm6, %ymm9 +; AVX2-NEXT: vblendvpd %ymm8, %ymm9, %ymm8, %ymm8 +; AVX2-NEXT: vpsubq %ymm1, %ymm6, %ymm6 +; AVX2-NEXT: vblendvpd %ymm1, %ymm6, %ymm1, %ymm1 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [255,255,255,255] +; AVX2-NEXT: vandpd 
%ymm6, %ymm1, %ymm1 +; AVX2-NEXT: vandpd %ymm6, %ymm8, %ymm8 +; AVX2-NEXT: vpackusdw %ymm8, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vandpd %ymm6, %ymm0, %ymm0 +; AVX2-NEXT: vandpd %ymm6, %ymm7, %ymm7 +; AVX2-NEXT: vpackusdw %ymm7, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vandpd %ymm6, %ymm2, %ymm1 +; AVX2-NEXT: vandpd %ymm6, %ymm5, %ymm2 +; AVX2-NEXT: vpackusdw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vandpd %ymm6, %ymm4, %ymm2 +; AVX2-NEXT: vandpd %ymm6, %ymm3, %ymm3 +; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] +; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vpackuswb %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: retq +; +; AVX512-LABEL: abd_ext_v32i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm6 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero +; AVX512-NEXT: vpsubq %zmm6, %zmm3, %zmm3 +; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero +; AVX512-NEXT: vpsubq %zmm5, %zmm2, %zmm2 +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm5 = 
xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVX512-NEXT: vpsubq %zmm5, %zmm4, %zmm4 +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpabsq %zmm3, %zmm1 +; AVX512-NEXT: vpabsq %zmm2, %zmm2 +; AVX512-NEXT: vpabsq %zmm4, %zmm3 +; AVX512-NEXT: vpabsq %zmm0, %zmm0 +; AVX512-NEXT: vpmovqb %zmm0, %xmm0 +; AVX512-NEXT: vpmovqb %zmm3, %xmm3 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] +; AVX512-NEXT: vpmovqb %zmm2, %xmm2 +; AVX512-NEXT: vpmovqb %zmm1, %xmm1 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: retq + %aext = zext <32 x i8> %a to <32 x i64> + %bext = zext <32 x i8> %b to <32 x i64> + %sub = sub <32 x i64> %aext, %bext + %abs = call <32 x i64> @llvm.abs.v32i64(<32 x i64> %sub, i1 false) + %trunc = trunc <32 x i64> %abs to <32 x i8> + ret <32 x i8> %trunc +} + +define <32 x i8> @abd_ext_v32i8_undef(<32 x i8> %a, <32 x i8> %b) nounwind { +; AVX1-LABEL: abd_ext_v32i8_undef: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rax +; AVX1-NEXT: vmovdqa %ymm0, %ymm14 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm14[8],xmm2[8],xmm14[9],xmm2[9],xmm14[10],xmm2[10],xmm14[11],xmm2[11],xmm14[12],xmm2[12],xmm14[13],xmm2[13],xmm14[14],xmm2[14],xmm14[15],xmm2[15] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; 
AVX1-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-NEXT: vextractf128 $1, %ymm14, %xmm3 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm14[0],zero,xmm14[1],zero,xmm14[2],zero,xmm14[3],zero,xmm14[4],zero,xmm14[5],zero,xmm14[6],zero,xmm14[7],zero +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; AVX1-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,3,2,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm13, %xmm5, %xmm5 +; AVX1-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[3,3,3,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm5, %xmm9, %xmm9 +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm5, %xmm12, %xmm5 +; AVX1-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX1-NEXT: vpmovzxwq {{.*#+}} 
xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm0, %xmm8, %xmm8 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; AVX1-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[2,3,2,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm12, %xmm7, %xmm7 +; AVX1-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[3,3,3,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm12, %xmm11, %xmm7 +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm11, %xmm6, %xmm6 +; AVX1-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm4[2,3,2,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,1,1] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm5, %xmm10, %xmm12 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm5[2,3,2,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm10, %xmm11, %xmm11 +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,3,3,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm5 = 
xmm14[0],zero,zero,zero,zero,zero,zero,zero,xmm14[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm10 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsubq %xmm10, %xmm5, %xmm10 +; AVX1-NEXT: vpsrld $16, %xmm14, %xmm5 +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,zero,zero,zero,zero,xmm5[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsubq %xmm1, %xmm5, %xmm15 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm14 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm6, %xmm14, %xmm14 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm5, %xmm1, %xmm13 +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsubq %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vpsrld $16, %xmm3, %xmm3 +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm3 = 
xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsubq %xmm0, %xmm3, %xmm3 +; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm1 +; AVX1-NEXT: vpsubq %xmm9, %xmm2, %xmm6 +; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm9 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm9, %ymm6 +; AVX1-NEXT: vblendvpd %ymm1, %ymm6, %ymm1, %ymm0 +; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm6 +; AVX1-NEXT: vpsubq %xmm8, %xmm2, %xmm8 +; AVX1-NEXT: vpsubq %xmm9, %xmm2, %xmm9 +; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 +; AVX1-NEXT: vblendvpd %ymm6, %ymm8, %ymm6, %ymm8 +; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm6 +; AVX1-NEXT: vpsubq %xmm7, %xmm2, %xmm9 +; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm7 +; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7 +; AVX1-NEXT: vblendvpd %ymm6, %ymm7, %ymm6, %ymm7 +; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-NEXT: vinsertf128 $1, %xmm12, %ymm1, %ymm6 +; AVX1-NEXT: vpsubq %xmm12, %xmm2, %xmm9 +; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm12 +; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm12, %ymm9 +; AVX1-NEXT: vblendvpd %ymm6, %ymm9, %ymm6, %ymm9 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm11, %ymm6 +; AVX1-NEXT: vpsubq %xmm4, %xmm2, %xmm4 +; AVX1-NEXT: vpsubq %xmm11, %xmm2, %xmm11 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm11, %ymm4 +; AVX1-NEXT: vblendvpd %ymm6, %ymm4, %ymm6, %ymm4 +; AVX1-NEXT: vinsertf128 $1, %xmm15, %ymm10, %ymm6 +; AVX1-NEXT: vpsubq %xmm15, %xmm2, %xmm11 +; AVX1-NEXT: vpsubq %xmm10, %xmm2, %xmm10 +; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm10 +; AVX1-NEXT: vblendvpd %ymm6, %ymm10, %ymm6, %ymm6 +; 
AVX1-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm10 +; AVX1-NEXT: vpsubq %xmm13, %xmm2, %xmm1 +; AVX1-NEXT: vpsubq %xmm14, %xmm2, %xmm11 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm11, %ymm1 +; AVX1-NEXT: vblendvpd %ymm10, %ymm1, %ymm10, %ymm10 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm1 +; AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm3 +; AVX1-NEXT: vpsubq %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm1, %ymm2 +; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [255,255,255,255] +; AVX1-NEXT: vandpd %ymm1, %ymm9, %ymm3 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vandpd %ymm1, %ymm7, %ymm5 +; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm7 +; AVX1-NEXT: vpackusdw %xmm7, %xmm5, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vandpd %ymm1, %ymm2, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vandpd %ymm1, %ymm10, %ymm5 +; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm7 +; AVX1-NEXT: vpackusdw %xmm7, %xmm5, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandpd %ymm1, %ymm8, %ymm3 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vandpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vandpd %ymm1, %ymm6, %ymm3 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vandpd %ymm1, %ymm4, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpackusdw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: popq %rax +; AVX1-NEXT: retq +; +; AVX2-LABEL: abd_ext_v32i8_undef: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbw 
{{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm4 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero +; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm6 = 
xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero,xmm10[2],zero,zero,zero,xmm10[3],zero,zero,zero +; AVX2-NEXT: vpsubq %ymm10, %ymm3, %ymm3 +; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm10 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero,xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpsubq %ymm10, %ymm4, %ymm4 +; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero,xmm10[2],zero,zero,zero,xmm10[3],zero,zero,zero +; AVX2-NEXT: vpsubq %ymm10, %ymm5, %ymm5 +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero +; AVX2-NEXT: vpsubq %ymm9, %ymm2, %ymm2 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero,xmm10[2],zero,zero,zero,xmm10[3],zero,zero,zero +; AVX2-NEXT: vpsubq %ymm10, %ymm7, %ymm7 +; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm1 = 
xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero,xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero +; AVX2-NEXT: vpsubq %ymm9, %ymm8, %ymm8 +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX2-NEXT: vpsubq %ymm1, %ymm6, %ymm1 +; AVX2-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX2-NEXT: vpsubq %ymm3, %ymm6, %ymm9 +; AVX2-NEXT: vblendvpd %ymm3, %ymm9, %ymm3, %ymm3 +; AVX2-NEXT: vpsubq %ymm4, %ymm6, %ymm9 +; AVX2-NEXT: vblendvpd %ymm4, %ymm9, %ymm4, %ymm4 +; AVX2-NEXT: vpsubq %ymm5, %ymm6, %ymm9 +; AVX2-NEXT: vblendvpd %ymm5, %ymm9, %ymm5, %ymm5 +; AVX2-NEXT: vpsubq %ymm2, %ymm6, %ymm9 +; AVX2-NEXT: vblendvpd %ymm2, %ymm9, %ymm2, %ymm2 +; AVX2-NEXT: vpsubq %ymm7, %ymm6, %ymm9 +; AVX2-NEXT: vblendvpd %ymm7, %ymm9, %ymm7, %ymm7 +; AVX2-NEXT: vpsubq %ymm0, %ymm6, %ymm9 +; AVX2-NEXT: vblendvpd %ymm0, %ymm9, %ymm0, %ymm0 +; AVX2-NEXT: vpsubq %ymm8, %ymm6, %ymm9 +; AVX2-NEXT: vblendvpd %ymm8, %ymm9, %ymm8, %ymm8 +; AVX2-NEXT: vpsubq %ymm1, %ymm6, %ymm6 +; AVX2-NEXT: vblendvpd %ymm1, %ymm6, %ymm1, %ymm1 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [255,255,255,255] +; AVX2-NEXT: vandpd %ymm6, %ymm1, %ymm1 +; AVX2-NEXT: vandpd %ymm6, %ymm8, %ymm8 +; AVX2-NEXT: vpackusdw %ymm8, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vandpd %ymm6, %ymm0, %ymm0 +; AVX2-NEXT: vandpd %ymm6, %ymm7, %ymm7 +; AVX2-NEXT: vpackusdw %ymm7, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vandpd %ymm6, %ymm2, %ymm1 +; AVX2-NEXT: 
vandpd %ymm6, %ymm5, %ymm2 +; AVX2-NEXT: vpackusdw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vandpd %ymm6, %ymm4, %ymm2 +; AVX2-NEXT: vandpd %ymm6, %ymm3, %ymm3 +; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] +; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vpackuswb %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: retq +; +; AVX512-LABEL: abd_ext_v32i8_undef: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm6 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero +; AVX512-NEXT: vpsubq %zmm6, %zmm3, %zmm3 +; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero +; AVX512-NEXT: vpsubq %zmm5, %zmm2, %zmm2 +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm5 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVX512-NEXT: vpsubq %zmm5, %zmm4, %zmm4 +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpabsq %zmm3, %zmm1 +; AVX512-NEXT: vpabsq %zmm2, %zmm2 +; AVX512-NEXT: 
vpabsq %zmm4, %zmm3 +; AVX512-NEXT: vpabsq %zmm0, %zmm0 +; AVX512-NEXT: vpmovqb %zmm0, %xmm0 +; AVX512-NEXT: vpmovqb %zmm3, %xmm3 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] +; AVX512-NEXT: vpmovqb %zmm2, %xmm2 +; AVX512-NEXT: vpmovqb %zmm1, %xmm1 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: retq + %aext = zext <32 x i8> %a to <32 x i64> + %bext = zext <32 x i8> %b to <32 x i64> + %sub = sub <32 x i64> %aext, %bext + %abs = call <32 x i64> @llvm.abs.v32i64(<32 x i64> %sub, i1 true) + %trunc = trunc <32 x i64> %abs to <32 x i8> + ret <32 x i8> %trunc +} + +define <16 x i16> @abd_ext_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { +; AVX1-LABEL: abd_ext_v16i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,1,1] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,3,2,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,3,2,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm9, 
%xmm2, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[3,3,3,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm9, %xmm3, %xmm3 +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm9, %xmm4, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[1,1,1,1] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm9, %xmm5, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,3,2,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm9, %xmm6, %xmm6 +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[3,3,3,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm9, %xmm7, %xmm7 +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm9, %xmm8, %xmm8 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm8, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm9 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm10 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm11 +; AVX1-NEXT: vpxor %xmm12, %xmm12, %xmm12 +; AVX1-NEXT: vpsubq %xmm3, %xmm12, %xmm3 +; AVX1-NEXT: vpsubq %xmm2, %xmm12, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT: vblendvpd %ymm11, %ymm2, %ymm11, %ymm2 +; AVX1-NEXT: vpsubq %xmm5, %xmm12, %xmm3 +; AVX1-NEXT: vpsubq %xmm4, %xmm12, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-NEXT: vblendvpd %ymm10, %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vpsubq %xmm7, %xmm12, %xmm4 +; AVX1-NEXT: vpsubq %xmm6, %xmm12, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 +; AVX1-NEXT: vblendvpd %ymm9, %ymm4, 
%ymm9, %ymm4 +; AVX1-NEXT: vpsubq %xmm0, %xmm12, %xmm0 +; AVX1-NEXT: vpsubq %xmm8, %xmm12, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 +; AVX1-NEXT: vblendvpd %ymm1, %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [65535,65535,65535,65535] +; AVX1-NEXT: vandpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vandpd %ymm1, %ymm4, %ymm4 +; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpackusdw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vandpd %ymm1, %ymm3, %ymm3 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vandpd %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: abd_ext_v16i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero +; AVX2-NEXT: vpsubq %ymm5, %ymm2, %ymm2 +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm5 = 
xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX2-NEXT: vpsubq %ymm5, %ymm3, %ymm3 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero +; AVX2-NEXT: vpsubq %ymm5, %ymm4, %ymm4 +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsubq %ymm2, %ymm1, %ymm5 +; AVX2-NEXT: vblendvpd %ymm2, %ymm5, %ymm2, %ymm2 +; AVX2-NEXT: vpsubq %ymm3, %ymm1, %ymm5 +; AVX2-NEXT: vblendvpd %ymm3, %ymm5, %ymm3, %ymm3 +; AVX2-NEXT: vpsubq %ymm4, %ymm1, %ymm5 +; AVX2-NEXT: vblendvpd %ymm4, %ymm5, %ymm4, %ymm4 +; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm5 +; AVX2-NEXT: vblendvpd %ymm0, %ymm5, %ymm0, %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] +; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm1[1,2,3],ymm4[4],ymm1[5,6,7],ymm4[8],ymm1[9,10,11],ymm4[12],ymm1[13,14,15] +; AVX2-NEXT: vpackusdw %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm1[1,2,3],ymm3[4],ymm1[5,6,7],ymm3[8],ymm1[9,10,11],ymm3[12],ymm1[13,14,15] +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] +; AVX2-NEXT: vpackusdw %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: retq +; +; AVX512-LABEL: abd_ext_v16i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm2 = 
xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero +; AVX512-NEXT: vpsubq %zmm3, %zmm2, %zmm2 +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpabsq %zmm2, %zmm1 +; AVX512-NEXT: vpabsq %zmm0, %zmm0 +; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vpmovqw %zmm1, %xmm1 +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: retq + %aext = zext <16 x i16> %a to <16 x i64> + %bext = zext <16 x i16> %b to <16 x i64> + %sub = sub <16 x i64> %aext, %bext + %abs = call <16 x i64> @llvm.abs.v16i64(<16 x i64> %sub, i1 false) + %trunc = trunc <16 x i64> %abs to <16 x i16> + ret <16 x i16> %trunc +} + +define <16 x i16> @abd_ext_v16i16_undef(<16 x i16> %a, <16 x i16> %b) nounwind { +; AVX1-LABEL: abd_ext_v16i16_undef: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-NEXT: 
vpshufd {{.*#+}} xmm5 = xmm0[1,1,1,1] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,3,2,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,3,2,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm9, %xmm2, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[3,3,3,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm9, %xmm3, %xmm3 +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm9, %xmm4, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[1,1,1,1] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm9, %xmm5, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,3,2,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm9, %xmm6, %xmm6 +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[3,3,3,3] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm9, %xmm7, %xmm7 +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX1-NEXT: vpsubq %xmm9, %xmm8, %xmm8 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm8, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm9 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm10 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm11 +; AVX1-NEXT: vpxor %xmm12, %xmm12, %xmm12 +; AVX1-NEXT: vpsubq %xmm3, %xmm12, %xmm3 +; AVX1-NEXT: vpsubq %xmm2, %xmm12, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT: vblendvpd %ymm11, %ymm2, %ymm11, %ymm2 +; AVX1-NEXT: vpsubq %xmm5, %xmm12, %xmm3 +; AVX1-NEXT: vpsubq %xmm4, %xmm12, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-NEXT: vblendvpd %ymm10, %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vpsubq %xmm7, %xmm12, %xmm4 +; AVX1-NEXT: vpsubq %xmm6, %xmm12, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 +; AVX1-NEXT: vblendvpd %ymm9, %ymm4, %ymm9, %ymm4 +; AVX1-NEXT: vpsubq %xmm0, %xmm12, %xmm0 +; AVX1-NEXT: vpsubq %xmm8, %xmm12, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 +; AVX1-NEXT: vblendvpd %ymm1, %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [65535,65535,65535,65535] +; AVX1-NEXT: vandpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vandpd %ymm1, %ymm4, %ymm4 +; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpackusdw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vandpd %ymm1, %ymm3, %ymm3 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vandpd %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: abd_ext_v16i16_undef: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm2 = 
xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero +; AVX2-NEXT: vpsubq %ymm5, %ymm2, %ymm2 +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm5 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX2-NEXT: vpsubq %ymm5, %ymm3, %ymm3 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero +; AVX2-NEXT: vpsubq %ymm5, %ymm4, %ymm4 +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsubq %ymm2, %ymm1, %ymm5 +; AVX2-NEXT: vblendvpd %ymm2, %ymm5, %ymm2, %ymm2 +; AVX2-NEXT: vpsubq %ymm3, %ymm1, %ymm5 +; AVX2-NEXT: vblendvpd %ymm3, %ymm5, %ymm3, %ymm3 +; AVX2-NEXT: vpsubq %ymm4, %ymm1, %ymm5 +; AVX2-NEXT: vblendvpd %ymm4, %ymm5, %ymm4, %ymm4 +; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm5 +; AVX2-NEXT: vblendvpd %ymm0, %ymm5, %ymm0, %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] +; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = 
ymm4[0],ymm1[1,2,3],ymm4[4],ymm1[5,6,7],ymm4[8],ymm1[9,10,11],ymm4[12],ymm1[13,14,15] +; AVX2-NEXT: vpackusdw %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm1[1,2,3],ymm3[4],ymm1[5,6,7],ymm3[8],ymm1[9,10,11],ymm3[12],ymm1[13,14,15] +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] +; AVX2-NEXT: vpackusdw %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: retq +; +; AVX512-LABEL: abd_ext_v16i16_undef: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero +; AVX512-NEXT: vpsubq %zmm3, %zmm2, %zmm2 +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpabsq %zmm2, %zmm1 +; AVX512-NEXT: vpabsq %zmm0, %zmm0 +; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vpmovqw %zmm1, %xmm1 +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: 
retq + %aext = zext <16 x i16> %a to <16 x i64> + %bext = zext <16 x i16> %b to <16 x i64> + %sub = sub <16 x i64> %aext, %bext + %abs = call <16 x i64> @llvm.abs.v16i64(<16 x i64> %sub, i1 true) + %trunc = trunc <16 x i64> %abs to <16 x i16> + ret <16 x i16> %trunc +} + +define <8 x i32> @abd_ext_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { +; AVX1-LABEL: abd_ext_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX1-NEXT: vpsubq %xmm7, %xmm4, %xmm4 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsubq %xmm7, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero +; AVX1-NEXT: vpsubq %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm6 +; AVX1-NEXT: vpsubq %xmm4, %xmm3, %xmm4 +; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-NEXT: vblendvpd %ymm6, %ymm2, %ymm6, %ymm2 +; AVX1-NEXT: vpsubq %xmm5, %xmm3, %xmm4 +; AVX1-NEXT: vpsubq %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vblendvpd %ymm1, %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm2[2,3] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] +; AVX1-NEXT: retq +; +; AVX2-LABEL: abd_ext_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, 
%ymm0, %xmm2 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; AVX2-NEXT: vpsubq %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsubq %ymm2, %ymm1, %ymm3 +; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm1 +; AVX2-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm2[2,3] +; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] +; AVX2-NEXT: retq +; +; AVX512-LABEL: abd_ext_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero +; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero +; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpabsq %zmm0, %zmm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: retq + %aext = zext <8 x i32> %a to <8 x i64> + %bext = zext <8 x i32> %b to <8 x i64> + %sub = sub <8 x i64> %aext, %bext + %abs = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %sub, i1 false) + %trunc = trunc <8 x i64> %abs to <8 x i32> + ret <8 x i32> %trunc +} + +define <8 x i32> @abd_ext_v8i32_undef(<8 x i32> %a, <8 x i32> %b) nounwind { +; AVX1-LABEL: abd_ext_v8i32_undef: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX1-NEXT: vpunpckhdq 
{{.*#+}} xmm5 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX1-NEXT: vpsubq %xmm7, %xmm4, %xmm4 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsubq %xmm7, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero +; AVX1-NEXT: vpsubq %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm6 +; AVX1-NEXT: vpsubq %xmm4, %xmm3, %xmm4 +; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-NEXT: vblendvpd %ymm6, %ymm2, %ymm6, %ymm2 +; AVX1-NEXT: vpsubq %xmm5, %xmm3, %xmm4 +; AVX1-NEXT: vpsubq %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vblendvpd %ymm1, %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm2[2,3] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] +; AVX1-NEXT: retq +; +; AVX2-LABEL: abd_ext_v8i32_undef: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; AVX2-NEXT: vpsubq %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsubq 
%ymm2, %ymm1, %ymm3 +; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm1 +; AVX2-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm2[2,3] +; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] +; AVX2-NEXT: retq +; +; AVX512-LABEL: abd_ext_v8i32_undef: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero +; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero +; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpabsq %zmm0, %zmm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: retq + %aext = zext <8 x i32> %a to <8 x i64> + %bext = zext <8 x i32> %b to <8 x i64> + %sub = sub <8 x i64> %aext, %bext + %abs = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %sub, i1 true) + %trunc = trunc <8 x i64> %abs to <8 x i32> + ret <8 x i32> %trunc +} + +define <4 x i64> @abd_ext_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { +; AVX1-LABEL: abd_ext_v4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vpextrq $1, %xmm0, %rcx +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rdx +; AVX1-NEXT: vpextrq $1, %xmm0, %rsi +; AVX1-NEXT: vmovq %xmm1, %r8 +; AVX1-NEXT: vpextrq $1, %xmm1, %r9 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %r10 +; AVX1-NEXT: vpextrq $1, %xmm0, %rdi +; AVX1-NEXT: xorl %r11d, %r11d +; AVX1-NEXT: subq %rdi, %rsi +; AVX1-NEXT: movl $0, %edi +; AVX1-NEXT: sbbq %rdi, %rdi +; AVX1-NEXT: subq %r10, %rdx +; AVX1-NEXT: movl $0, %r10d +; AVX1-NEXT: sbbq %r10, %r10 +; AVX1-NEXT: subq %r9, %rcx +; AVX1-NEXT: movl $0, %r9d +; AVX1-NEXT: sbbq %r9, %r9 +; AVX1-NEXT: subq %r8, %rax +; AVX1-NEXT: sbbq %r11, %r11 +; AVX1-NEXT: sarq $63, %r11 +; AVX1-NEXT: 
xorq %r11, %rax +; AVX1-NEXT: subq %r11, %rax +; AVX1-NEXT: sarq $63, %r9 +; AVX1-NEXT: xorq %r9, %rcx +; AVX1-NEXT: subq %r9, %rcx +; AVX1-NEXT: sarq $63, %r10 +; AVX1-NEXT: xorq %r10, %rdx +; AVX1-NEXT: subq %r10, %rdx +; AVX1-NEXT: sarq $63, %rdi +; AVX1-NEXT: xorq %rdi, %rsi +; AVX1-NEXT: subq %rdi, %rsi +; AVX1-NEXT: vmovq %rsi, %xmm0 +; AVX1-NEXT: vmovq %rdx, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vmovq %rcx, %xmm1 +; AVX1-NEXT: vmovq %rax, %xmm2 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: abd_ext_v4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vpextrq $1, %xmm0, %rcx +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rdx +; AVX2-NEXT: vpextrq $1, %xmm0, %rsi +; AVX2-NEXT: vmovq %xmm1, %r8 +; AVX2-NEXT: vpextrq $1, %xmm1, %r9 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %r10 +; AVX2-NEXT: vpextrq $1, %xmm0, %rdi +; AVX2-NEXT: xorl %r11d, %r11d +; AVX2-NEXT: subq %rdi, %rsi +; AVX2-NEXT: movl $0, %edi +; AVX2-NEXT: sbbq %rdi, %rdi +; AVX2-NEXT: subq %r10, %rdx +; AVX2-NEXT: movl $0, %r10d +; AVX2-NEXT: sbbq %r10, %r10 +; AVX2-NEXT: subq %r9, %rcx +; AVX2-NEXT: movl $0, %r9d +; AVX2-NEXT: sbbq %r9, %r9 +; AVX2-NEXT: subq %r8, %rax +; AVX2-NEXT: sbbq %r11, %r11 +; AVX2-NEXT: sarq $63, %r11 +; AVX2-NEXT: xorq %r11, %rax +; AVX2-NEXT: subq %r11, %rax +; AVX2-NEXT: sarq $63, %r9 +; AVX2-NEXT: xorq %r9, %rcx +; AVX2-NEXT: subq %r9, %rcx +; AVX2-NEXT: sarq $63, %r10 +; AVX2-NEXT: xorq %r10, %rdx +; AVX2-NEXT: subq %r10, %rdx +; AVX2-NEXT: sarq $63, %rdi +; AVX2-NEXT: xorq %rdi, %rsi +; AVX2-NEXT: subq %rdi, %rsi +; AVX2-NEXT: vmovq %rsi, %xmm0 +; AVX2-NEXT: vmovq %rdx, %xmm1 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX2-NEXT: vmovq %rcx, %xmm1 +; AVX2-NEXT: vmovq %rax, %xmm2 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; 
AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: abd_ext_v4i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rdx +; AVX512-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512-NEXT: vmovq %xmm1, %r8 +; AVX512-NEXT: vpextrq $1, %xmm1, %r9 +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %r10 +; AVX512-NEXT: vpextrq $1, %xmm0, %rdi +; AVX512-NEXT: xorl %r11d, %r11d +; AVX512-NEXT: subq %rdi, %rsi +; AVX512-NEXT: movl $0, %edi +; AVX512-NEXT: sbbq %rdi, %rdi +; AVX512-NEXT: subq %r10, %rdx +; AVX512-NEXT: movl $0, %r10d +; AVX512-NEXT: sbbq %r10, %r10 +; AVX512-NEXT: subq %r9, %rcx +; AVX512-NEXT: movl $0, %r9d +; AVX512-NEXT: sbbq %r9, %r9 +; AVX512-NEXT: subq %r8, %rax +; AVX512-NEXT: sbbq %r11, %r11 +; AVX512-NEXT: sarq $63, %r11 +; AVX512-NEXT: xorq %r11, %rax +; AVX512-NEXT: subq %r11, %rax +; AVX512-NEXT: sarq $63, %r9 +; AVX512-NEXT: xorq %r9, %rcx +; AVX512-NEXT: subq %r9, %rcx +; AVX512-NEXT: sarq $63, %r10 +; AVX512-NEXT: xorq %r10, %rdx +; AVX512-NEXT: subq %r10, %rdx +; AVX512-NEXT: sarq $63, %rdi +; AVX512-NEXT: xorq %rdi, %rsi +; AVX512-NEXT: subq %rdi, %rsi +; AVX512-NEXT: vmovq %rsi, %xmm0 +; AVX512-NEXT: vmovq %rdx, %xmm1 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512-NEXT: vmovq %rcx, %xmm1 +; AVX512-NEXT: vmovq %rax, %xmm2 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: retq + %aext = zext <4 x i64> %a to <4 x i128> + %bext = zext <4 x i64> %b to <4 x i128> + %sub = sub <4 x i128> %aext, %bext + %abs = call <4 x i128> @llvm.abs.v4i128(<4 x i128> %sub, i1 false) + %trunc = trunc <4 x i128> %abs to <4 x i64> + ret <4 x i64> %trunc +} + +define <4 x i64> @abd_ext_v4i64_undef(<4 x i64> %a, <4 x i64> %b) nounwind { +; AVX1-LABEL: abd_ext_v4i64_undef: +; AVX1: # %bb.0: +; AVX1-NEXT: 
vmovq %xmm0, %rax +; AVX1-NEXT: vpextrq $1, %xmm0, %rcx +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rdx +; AVX1-NEXT: vpextrq $1, %xmm0, %rsi +; AVX1-NEXT: vmovq %xmm1, %r8 +; AVX1-NEXT: vpextrq $1, %xmm1, %r9 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %r10 +; AVX1-NEXT: vpextrq $1, %xmm0, %rdi +; AVX1-NEXT: xorl %r11d, %r11d +; AVX1-NEXT: subq %rdi, %rsi +; AVX1-NEXT: movl $0, %edi +; AVX1-NEXT: sbbq %rdi, %rdi +; AVX1-NEXT: subq %r10, %rdx +; AVX1-NEXT: movl $0, %r10d +; AVX1-NEXT: sbbq %r10, %r10 +; AVX1-NEXT: subq %r9, %rcx +; AVX1-NEXT: movl $0, %r9d +; AVX1-NEXT: sbbq %r9, %r9 +; AVX1-NEXT: subq %r8, %rax +; AVX1-NEXT: sbbq %r11, %r11 +; AVX1-NEXT: sarq $63, %r11 +; AVX1-NEXT: xorq %r11, %rax +; AVX1-NEXT: subq %r11, %rax +; AVX1-NEXT: sarq $63, %r9 +; AVX1-NEXT: xorq %r9, %rcx +; AVX1-NEXT: subq %r9, %rcx +; AVX1-NEXT: sarq $63, %r10 +; AVX1-NEXT: xorq %r10, %rdx +; AVX1-NEXT: subq %r10, %rdx +; AVX1-NEXT: sarq $63, %rdi +; AVX1-NEXT: xorq %rdi, %rsi +; AVX1-NEXT: subq %rdi, %rsi +; AVX1-NEXT: vmovq %rsi, %xmm0 +; AVX1-NEXT: vmovq %rdx, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vmovq %rcx, %xmm1 +; AVX1-NEXT: vmovq %rax, %xmm2 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: abd_ext_v4i64_undef: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vpextrq $1, %xmm0, %rcx +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rdx +; AVX2-NEXT: vpextrq $1, %xmm0, %rsi +; AVX2-NEXT: vmovq %xmm1, %r8 +; AVX2-NEXT: vpextrq $1, %xmm1, %r9 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %r10 +; AVX2-NEXT: vpextrq $1, %xmm0, %rdi +; AVX2-NEXT: xorl %r11d, %r11d +; AVX2-NEXT: subq %rdi, %rsi +; AVX2-NEXT: movl $0, %edi +; AVX2-NEXT: sbbq %rdi, %rdi +; AVX2-NEXT: subq %r10, %rdx +; AVX2-NEXT: movl $0, %r10d +; AVX2-NEXT: sbbq %r10, 
%r10 +; AVX2-NEXT: subq %r9, %rcx +; AVX2-NEXT: movl $0, %r9d +; AVX2-NEXT: sbbq %r9, %r9 +; AVX2-NEXT: subq %r8, %rax +; AVX2-NEXT: sbbq %r11, %r11 +; AVX2-NEXT: sarq $63, %r11 +; AVX2-NEXT: xorq %r11, %rax +; AVX2-NEXT: subq %r11, %rax +; AVX2-NEXT: sarq $63, %r9 +; AVX2-NEXT: xorq %r9, %rcx +; AVX2-NEXT: subq %r9, %rcx +; AVX2-NEXT: sarq $63, %r10 +; AVX2-NEXT: xorq %r10, %rdx +; AVX2-NEXT: subq %r10, %rdx +; AVX2-NEXT: sarq $63, %rdi +; AVX2-NEXT: xorq %rdi, %rsi +; AVX2-NEXT: subq %rdi, %rsi +; AVX2-NEXT: vmovq %rsi, %xmm0 +; AVX2-NEXT: vmovq %rdx, %xmm1 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX2-NEXT: vmovq %rcx, %xmm1 +; AVX2-NEXT: vmovq %rax, %xmm2 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: abd_ext_v4i64_undef: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rdx +; AVX512-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512-NEXT: vmovq %xmm1, %r8 +; AVX512-NEXT: vpextrq $1, %xmm1, %r9 +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %r10 +; AVX512-NEXT: vpextrq $1, %xmm0, %rdi +; AVX512-NEXT: xorl %r11d, %r11d +; AVX512-NEXT: subq %rdi, %rsi +; AVX512-NEXT: movl $0, %edi +; AVX512-NEXT: sbbq %rdi, %rdi +; AVX512-NEXT: subq %r10, %rdx +; AVX512-NEXT: movl $0, %r10d +; AVX512-NEXT: sbbq %r10, %r10 +; AVX512-NEXT: subq %r9, %rcx +; AVX512-NEXT: movl $0, %r9d +; AVX512-NEXT: sbbq %r9, %r9 +; AVX512-NEXT: subq %r8, %rax +; AVX512-NEXT: sbbq %r11, %r11 +; AVX512-NEXT: sarq $63, %r11 +; AVX512-NEXT: xorq %r11, %rax +; AVX512-NEXT: subq %r11, %rax +; AVX512-NEXT: sarq $63, %r9 +; AVX512-NEXT: xorq %r9, %rcx +; AVX512-NEXT: subq %r9, %rcx +; AVX512-NEXT: sarq $63, %r10 +; AVX512-NEXT: xorq %r10, %rdx +; AVX512-NEXT: subq %r10, %rdx +; AVX512-NEXT: sarq $63, %rdi +; AVX512-NEXT: xorq %rdi, %rsi +; AVX512-NEXT: 
subq %rdi, %rsi +; AVX512-NEXT: vmovq %rsi, %xmm0 +; AVX512-NEXT: vmovq %rdx, %xmm1 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512-NEXT: vmovq %rcx, %xmm1 +; AVX512-NEXT: vmovq %rax, %xmm2 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: retq + %aext = zext <4 x i64> %a to <4 x i128> + %bext = zext <4 x i64> %b to <4 x i128> + %sub = sub <4 x i128> %aext, %bext + %abs = call <4 x i128> @llvm.abs.v4i128(<4 x i128> %sub, i1 true) + %trunc = trunc <4 x i128> %abs to <4 x i64> + ret <4 x i64> %trunc +} + +; +; sub(umax(a,b),umin(a,b)) -> abdu(a,b) +; + +define <32 x i8> @abd_minmax_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { +; AVX1-LABEL: abd_minmax_v32i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpminub %xmm3, %xmm4, %xmm5 +; AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsubb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmaxub %xmm3, %xmm4, %xmm1 +; AVX1-NEXT: vpsubb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: abd_minmax_v32i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsubb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: abd_minmax_v32i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm2 +; AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpsubb %ymm2, %ymm0, %ymm0 +; AVX512-NEXT: retq + %min = call <32 x i8> @llvm.umin.v32i8(<32 x i8> %a, <32 x i8> %b) + %max = call <32 x i8> @llvm.umax.v32i8(<32 x i8> %a, <32 x i8> %b) + %sub = sub <32 x i8> %max, %min + ret <32 x i8> %sub +} + +define <16 x i16> @abd_minmax_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { +; AVX1-LABEL: abd_minmax_v16i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, 
%ymm1, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpminuw %xmm3, %xmm4, %xmm5 +; AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsubw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmaxuw %xmm3, %xmm4, %xmm1 +; AVX1-NEXT: vpsubw %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: abd_minmax_v16i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsubw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: abd_minmax_v16i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm2 +; AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpsubw %ymm2, %ymm0, %ymm0 +; AVX512-NEXT: retq + %min = call <16 x i16> @llvm.umin.v16i16(<16 x i16> %a, <16 x i16> %b) + %max = call <16 x i16> @llvm.umax.v16i16(<16 x i16> %a, <16 x i16> %b) + %sub = sub <16 x i16> %max, %min + ret <16 x i16> %sub +} + +define <8 x i32> @abd_minmax_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { +; AVX1-LABEL: abd_minmax_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpminud %xmm3, %xmm4, %xmm5 +; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmaxud %xmm3, %xmm4, %xmm1 +; AVX1-NEXT: vpsubd %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: abd_minmax_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: abd_minmax_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpminud %ymm1, %ymm0, %ymm2 +; AVX512-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpsubd %ymm2, %ymm0, %ymm0 +; AVX512-NEXT: retq + %min = call <8 x i32> @llvm.umin.v8i32(<8 x i32> %a, <8 x i32> %b) + %max = call <8 x i32> 
@llvm.umax.v8i32(<8 x i32> %a, <8 x i32> %b) + %sub = sub <8 x i32> %max, %min + ret <8 x i32> %sub +} + +define <4 x i64> @abd_minmax_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { +; AVX1-LABEL: abd_minmax_v4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 +; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm1, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 +; AVX1-NEXT: vpxor %xmm2, %xmm6, %xmm7 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm8 +; AVX1-NEXT: vpxor %xmm2, %xmm8, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm7, %xmm2, %xmm9 +; AVX1-NEXT: vblendvpd %xmm9, %xmm6, %xmm8, %xmm9 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpsubq %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm7, %xmm1 +; AVX1-NEXT: vblendvpd %xmm1, %xmm6, %xmm8, %xmm1 +; AVX1-NEXT: vpsubq %xmm9, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: abd_minmax_v4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3 +; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm2 +; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm4 +; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm1, %ymm4 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpsubq %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: abd_minmax_v4i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpminuq %ymm1, %ymm0, %ymm2 +; AVX512-NEXT: vpmaxuq %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpsubq %ymm2, %ymm0, %ymm0 +; AVX512-NEXT: retq + %min = call <4 x i64> @llvm.umin.v4i64(<4 x i64> %a, <4 x i64> %b) + %max = call <4 x i64> @llvm.umax.v4i64(<4 x i64> %a, <4 x i64> %b) + %sub = sub <4 x i64> %max, %min + ret <4 x i64> %sub +} 
+ +; +; abs(sub_nsw(a,b)) -> abds(a,b) +; + +define <32 x i8> @abd_subnsw_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { +; AVX1-LABEL: abd_subnsw_v32i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpsubb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpabsb %xmm0, %xmm0 +; AVX1-NEXT: vpabsb %xmm2, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: abd_subnsw_v32i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpabsb %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: abd_subnsw_v32i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsubb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpabsb %ymm0, %ymm0 +; AVX512-NEXT: retq + %sub = sub nsw <32 x i8> %a, %b + %abs = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %sub, i1 false) + ret <32 x i8> %abs +} + +define <16 x i16> @abd_subnsw_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { +; AVX1-LABEL: abd_subnsw_v16i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpsubw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpabsw %xmm0, %xmm0 +; AVX1-NEXT: vpabsw %xmm2, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: abd_subnsw_v16i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpabsw %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: abd_subnsw_v16i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsubw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpabsw %ymm0, %ymm0 +; AVX512-NEXT: retq + %sub = sub nsw <16 x i16> %a, %b + %abs = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %sub, i1 false) + ret <16 x i16> %abs +} + +define <8 x i32> @abd_subnsw_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { +; AVX1-LABEL: abd_subnsw_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, 
%xmm3 +; AVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpabsd %xmm0, %xmm0 +; AVX1-NEXT: vpabsd %xmm2, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: abd_subnsw_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpabsd %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: abd_subnsw_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpabsd %ymm0, %ymm0 +; AVX512-NEXT: retq + %sub = sub nsw <8 x i32> %a, %b + %abs = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %sub, i1 false) + ret <8 x i32> %abs +} + +define <4 x i64> @abd_subnsw_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { +; AVX1-LABEL: abd_subnsw_v4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsubq %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vblendvpd %ymm1, %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: abd_subnsw_v4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm1 +; AVX2-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: abd_subnsw_v4i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpabsq %ymm0, %ymm0 +; AVX512-NEXT: retq + %sub = sub nsw <4 x i64> %a, %b + %abs = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %sub, i1 false) + ret <4 x i64> %abs +} + +declare <32 x i8> @llvm.abs.v32i8(<32 x i8>, i1) +declare <16 x i16> @llvm.abs.v16i16(<16 x i16>, i1) +declare <8 x i32> @llvm.abs.v8i32(<8 x i32>, i1) +declare <4 x i64> @llvm.abs.v4i64(<4 x i64>, i1) +declare <8 x i64> 
@llvm.abs.v8i64(<8 x i64>, i1) +declare <16 x i64> @llvm.abs.v16i64(<16 x i64>, i1) +declare <32 x i64> @llvm.abs.v32i64(<32 x i64>, i1) +declare <4 x i128> @llvm.abs.v4i128(<4 x i128>, i1) + +declare <32 x i8> @llvm.umax.v32i8(<32 x i8>, <32 x i8>) +declare <16 x i16> @llvm.umax.v16i16(<16 x i16>, <16 x i16>) +declare <8 x i32> @llvm.umax.v8i32(<8 x i32>, <8 x i32>) +declare <4 x i64> @llvm.umax.v4i64(<4 x i64>, <4 x i64>) + +declare <32 x i8> @llvm.umin.v32i8(<32 x i8>, <32 x i8>) +declare <16 x i16> @llvm.umin.v16i16(<16 x i16>, <16 x i16>) +declare <8 x i32> @llvm.umin.v8i32(<8 x i32>, <8 x i32>) +declare <4 x i64> @llvm.umin.v4i64(<4 x i64>, <4 x i64>) diff --git a/llvm/test/CodeGen/X86/abdu-vector-512.ll b/llvm/test/CodeGen/X86/abdu-vector-512.ll new file mode 100644 index 0000000..ab04fd6 --- /dev/null +++ b/llvm/test/CodeGen/X86/abdu-vector-512.ll @@ -0,0 +1,630 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ + +; +; trunc(abs(sub(zext(a),zext(b)))) -> abdu(a,b) +; + +define <64 x i8> @abd_ext_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { +; AVX512-LABEL: abd_ext_v64i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm2 = 
xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero +; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm7 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero +; AVX512-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm10 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero,xmm9[4],zero,zero,zero,xmm9[5],zero,zero,zero,xmm9[6],zero,zero,zero,xmm9[7],zero,zero,zero +; AVX512-NEXT: vpsubq %zmm10, %zmm3, %zmm3 +; AVX512-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero,xmm9[4],zero,zero,zero,xmm9[5],zero,zero,zero,xmm9[6],zero,zero,zero,xmm9[7],zero,zero,zero +; AVX512-NEXT: vpsubq %zmm9, %zmm2, %zmm2 +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm9 +; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm9 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero,xmm9[4],zero,xmm9[5],zero,xmm9[6],zero,xmm9[7],zero,xmm9[8],zero,xmm9[9],zero,xmm9[10],zero,xmm9[11],zero,xmm9[12],zero,xmm9[13],zero,xmm9[14],zero,xmm9[15],zero +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm10 = 
xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero,xmm9[4],zero,zero,zero,xmm9[5],zero,zero,zero,xmm9[6],zero,zero,zero,xmm9[7],zero,zero,zero +; AVX512-NEXT: vpsubq %zmm10, %zmm5, %zmm5 +; AVX512-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero,xmm9[4],zero,zero,zero,xmm9[5],zero,zero,zero,xmm9[6],zero,zero,zero,xmm9[7],zero,zero,zero +; AVX512-NEXT: vpsubq %zmm9, %zmm4, %zmm4 +; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm10 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero,xmm9[4],zero,zero,zero,xmm9[5],zero,zero,zero,xmm9[6],zero,zero,zero,xmm9[7],zero,zero,zero +; AVX512-NEXT: vpsubq %zmm10, %zmm7, %zmm7 +; AVX512-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero,xmm9[4],zero,zero,zero,xmm9[5],zero,zero,zero,xmm9[6],zero,zero,zero,xmm9[7],zero,zero,zero +; AVX512-NEXT: vpsubq %zmm9, %zmm6, %zmm6 +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm9 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVX512-NEXT: vpsubq %zmm9, %zmm8, 
%zmm8 +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpabsq %zmm3, %zmm1 +; AVX512-NEXT: vpabsq %zmm2, %zmm2 +; AVX512-NEXT: vpabsq %zmm5, %zmm3 +; AVX512-NEXT: vpabsq %zmm4, %zmm4 +; AVX512-NEXT: vpabsq %zmm7, %zmm5 +; AVX512-NEXT: vpabsq %zmm6, %zmm6 +; AVX512-NEXT: vpabsq %zmm8, %zmm7 +; AVX512-NEXT: vpabsq %zmm0, %zmm0 +; AVX512-NEXT: vpmovqb %zmm0, %xmm0 +; AVX512-NEXT: vpmovqb %zmm7, %xmm7 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm7[0],xmm0[0] +; AVX512-NEXT: vpmovqb %zmm6, %xmm6 +; AVX512-NEXT: vpmovqb %zmm5, %xmm5 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0] +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm5, %ymm0 +; AVX512-NEXT: vpmovqb %zmm4, %xmm4 +; AVX512-NEXT: vpmovqb %zmm3, %xmm3 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; AVX512-NEXT: vpmovqb %zmm2, %xmm2 +; AVX512-NEXT: vpmovqb %zmm1, %xmm1 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512-NEXT: retq + %aext = zext <64 x i8> %a to <64 x i64> + %bext = zext <64 x i8> %b to <64 x i64> + %sub = sub <64 x i64> %aext, %bext + %abs = call <64 x i64> @llvm.abs.v64i64(<64 x i64> %sub, i1 false) + %trunc = trunc <64 x i64> %abs to <64 x i8> + ret <64 x i8> %trunc +} + +define <64 x i8> @abd_ext_v64i8_undef(<64 x i8> %a, <64 x i8> %b) nounwind { +; AVX512-LABEL: abd_ext_v64i8_undef: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512-NEXT: 
vpmovzxwq {{.*#+}} zmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero +; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm7 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero +; AVX512-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm6 = 
xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm10 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero,xmm9[4],zero,zero,zero,xmm9[5],zero,zero,zero,xmm9[6],zero,zero,zero,xmm9[7],zero,zero,zero +; AVX512-NEXT: vpsubq %zmm10, %zmm3, %zmm3 +; AVX512-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero,xmm9[4],zero,zero,zero,xmm9[5],zero,zero,zero,xmm9[6],zero,zero,zero,xmm9[7],zero,zero,zero +; AVX512-NEXT: vpsubq %zmm9, %zmm2, %zmm2 +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm9 +; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm9 = 
xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero,xmm9[4],zero,xmm9[5],zero,xmm9[6],zero,xmm9[7],zero,xmm9[8],zero,xmm9[9],zero,xmm9[10],zero,xmm9[11],zero,xmm9[12],zero,xmm9[13],zero,xmm9[14],zero,xmm9[15],zero +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm10 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero,xmm9[4],zero,zero,zero,xmm9[5],zero,zero,zero,xmm9[6],zero,zero,zero,xmm9[7],zero,zero,zero +; AVX512-NEXT: vpsubq %zmm10, %zmm5, %zmm5 +; AVX512-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero,xmm9[4],zero,zero,zero,xmm9[5],zero,zero,zero,xmm9[6],zero,zero,zero,xmm9[7],zero,zero,zero +; AVX512-NEXT: vpsubq %zmm9, %zmm4, %zmm4 +; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm10 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero,xmm9[4],zero,zero,zero,xmm9[5],zero,zero,zero,xmm9[6],zero,zero,zero,xmm9[7],zero,zero,zero +; AVX512-NEXT: vpsubq %zmm10, %zmm7, %zmm7 +; AVX512-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero,xmm9[4],zero,zero,zero,xmm9[5],zero,zero,zero,xmm9[6],zero,zero,zero,xmm9[7],zero,zero,zero +; AVX512-NEXT: vpsubq %zmm9, %zmm6, %zmm6 +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; 
AVX512-NEXT: vpmovzxwq {{.*#+}} zmm9 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVX512-NEXT: vpsubq %zmm9, %zmm8, %zmm8 +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpabsq %zmm3, %zmm1 +; AVX512-NEXT: vpabsq %zmm2, %zmm2 +; AVX512-NEXT: vpabsq %zmm5, %zmm3 +; AVX512-NEXT: vpabsq %zmm4, %zmm4 +; AVX512-NEXT: vpabsq %zmm7, %zmm5 +; AVX512-NEXT: vpabsq %zmm6, %zmm6 +; AVX512-NEXT: vpabsq %zmm8, %zmm7 +; AVX512-NEXT: vpabsq %zmm0, %zmm0 +; AVX512-NEXT: vpmovqb %zmm0, %xmm0 +; AVX512-NEXT: vpmovqb %zmm7, %xmm7 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm7[0],xmm0[0] +; AVX512-NEXT: vpmovqb %zmm6, %xmm6 +; AVX512-NEXT: vpmovqb %zmm5, %xmm5 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0] +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm5, %ymm0 +; AVX512-NEXT: vpmovqb %zmm4, %xmm4 +; AVX512-NEXT: vpmovqb %zmm3, %xmm3 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; AVX512-NEXT: vpmovqb %zmm2, %xmm2 +; AVX512-NEXT: vpmovqb %zmm1, %xmm1 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512-NEXT: retq + %aext = zext <64 x i8> %a to <64 x i64> + %bext = zext <64 x i8> %b to <64 x i64> + %sub = sub <64 x i64> %aext, %bext + %abs = call <64 x i64> @llvm.abs.v64i64(<64 x i64> %sub, i1 true) + %trunc = trunc <64 x i64> %abs to <64 x i8> + ret <64 x i8> %trunc +} + +define <32 x i16> @abd_ext_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { +; AVX512-LABEL: abd_ext_v32i16: +; AVX512: # %bb.0: +; AVX512-NEXT: 
vextracti128 $1, %ymm0, %xmm2 +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero +; AVX512-NEXT: vpsubq %zmm5, %zmm2, %zmm2 +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm5 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVX512-NEXT: vpsubq %zmm5, %zmm3, %zmm3 +; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero +; AVX512-NEXT: vpsubq %zmm5, %zmm4, %zmm4 +; AVX512-NEXT: vpmovzxwq 
{{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpabsq %zmm2, %zmm1 +; AVX512-NEXT: vpabsq %zmm3, %zmm2 +; AVX512-NEXT: vpabsq %zmm4, %zmm3 +; AVX512-NEXT: vpabsq %zmm0, %zmm0 +; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vpmovqw %zmm3, %xmm3 +; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; AVX512-NEXT: vpmovqw %zmm2, %xmm2 +; AVX512-NEXT: vpmovqw %zmm1, %xmm1 +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512-NEXT: retq + %aext = zext <32 x i16> %a to <32 x i64> + %bext = zext <32 x i16> %b to <32 x i64> + %sub = sub <32 x i64> %aext, %bext + %abs = call <32 x i64> @llvm.abs.v32i64(<32 x i64> %sub, i1 false) + %trunc = trunc <32 x i64> %abs to <32 x i16> + ret <32 x i16> %trunc +} + +define <32 x i16> @abd_ext_v32i16_undef(<32 x i16> %a, <32 x i16> %b) nounwind { +; AVX512-LABEL: abd_ext_v32i16_undef: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero +; AVX512-NEXT: vpsubq %zmm5, %zmm2, %zmm2 +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm5 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVX512-NEXT: vpsubq %zmm5, %zmm3, %zmm3 +; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero +; AVX512-NEXT: vpsubq %zmm5, %zmm4, %zmm4 +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpabsq %zmm2, %zmm1 +; AVX512-NEXT: vpabsq %zmm3, %zmm2 +; AVX512-NEXT: vpabsq %zmm4, %zmm3 +; AVX512-NEXT: vpabsq %zmm0, %zmm0 +; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vpmovqw %zmm3, %xmm3 +; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; AVX512-NEXT: vpmovqw %zmm2, %xmm2 +; AVX512-NEXT: vpmovqw %zmm1, %xmm1 +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512-NEXT: retq + %aext = zext <32 x i16> %a to <32 x i64> + %bext = zext <32 x i16> %b to <32 x i64> + %sub = sub <32 x i64> %aext, %bext + %abs 
= call <32 x i64> @llvm.abs.v32i64(<32 x i64> %sub, i1 true)
+ %trunc = trunc <32 x i64> %abs to <32 x i16>
+ ret <32 x i16> %trunc
+}
+
+define <16 x i32> @abd_ext_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
+; AVX512-LABEL: abd_ext_v16i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
+; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero
+; AVX512-NEXT: vpsubq %zmm3, %zmm2, %zmm2
+; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
+; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpabsq %zmm2, %zmm1
+; AVX512-NEXT: vpabsq %zmm0, %zmm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %aext = zext <16 x i32> %a to <16 x i64> ; unsigned widen: the i64 difference of two zero-extended i32 lanes cannot wrap
+ %bext = zext <16 x i32> %b to <16 x i64>
+ %sub = sub <16 x i64> %aext, %bext
+ %abs = call <16 x i64> @llvm.abs.v16i64(<16 x i64> %sub, i1 false) ; i1 false: an INT_MIN input is defined, not poison
+ %trunc = trunc <16 x i64> %abs to <16 x i32> ; |a-b| of two 32-bit unsigned values fits in 32 bits, so no bits are lost
+ ret <16 x i32> %trunc
+}
+
+define <16 x i32> @abd_ext_v16i32_undef(<16 x i32> %a, <16 x i32> %b) nounwind {
+; AVX512-LABEL: abd_ext_v16i32_undef:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
+; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero
+; AVX512-NEXT: vpsubq %zmm3, %zmm2, %zmm2
+; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
+; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpabsq %zmm2, %zmm1
+; AVX512-NEXT: vpabsq %zmm0, %zmm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %aext = zext <16 x i32> %a to <16 x i64> ; same zext/sub/abs/trunc pattern as abd_ext_v16i32
+ %bext = zext <16 x i32> %b to <16 x i64>
+ %sub = sub <16 x i64> %aext, %bext
+ %abs = call <16 x i64> @llvm.abs.v16i64(<16 x i64> %sub, i1 true) ; i1 true: an INT_MIN input would be poison (the only IR difference from abd_ext_v16i32)
+ %trunc = trunc <16 x i64> %abs to <16 x i32>
+ ret <16 x i32> %trunc
+}
+
+define <8 x i64> @abd_ext_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
+; AVX512-LABEL: abd_ext_v8i64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: pushq %rbp
+; AVX512-NEXT: pushq %r15
+; AVX512-NEXT: pushq %r14
+; AVX512-NEXT: pushq %r13
+; AVX512-NEXT: pushq %r12
+; AVX512-NEXT: pushq %rbx
+; AVX512-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512-NEXT: vmovq %xmm2, %rax
+; AVX512-NEXT: vpextrq $1, %xmm2, %rdx
+; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512-NEXT: vmovq %xmm2, %rsi
+; AVX512-NEXT: vpextrq $1, %xmm2, %rdi
+; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2
+; AVX512-NEXT: vmovq %xmm2, %r8
+; AVX512-NEXT: vpextrq $1, %xmm2, %r9
+; AVX512-NEXT: vpextrq $1, %xmm1, %rbx
+; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512-NEXT: vmovq %xmm2, %rbp
+; AVX512-NEXT: vpextrq $1, %xmm2, %r13
+; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512-NEXT: vmovq %xmm2, %r15
+; AVX512-NEXT: vpextrq $1, %xmm2, %r14
+; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2
+; AVX512-NEXT: vmovq %xmm2, %r11
+; AVX512-NEXT: vpextrq $1, %xmm2, %r10
+; AVX512-NEXT: xorl %r12d, %r12d
+; AVX512-NEXT: subq %r10, %r9
+; AVX512-NEXT: movl $0, %r10d
+; AVX512-NEXT: sbbq %r10, %r10
+; AVX512-NEXT: subq %r11, %r8
+; AVX512-NEXT: movl $0, %r11d
+; AVX512-NEXT: sbbq %r11, %r11
+; AVX512-NEXT: subq %r14, %rdi
+; AVX512-NEXT: movl $0, %r14d
+; AVX512-NEXT: sbbq %r14, %r14
+; AVX512-NEXT: subq %r15, %rsi
+; AVX512-NEXT: movl $0, %r15d
+; AVX512-NEXT: sbbq %r15, %r15
+; AVX512-NEXT: subq %r13, %rdx
+; AVX512-NEXT: movl $0, %r13d
+; AVX512-NEXT: sbbq %r13, %r13
+; AVX512-NEXT: subq %rbp, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movl $0, %ebp
+; AVX512-NEXT: sbbq %rbp, %rbp
+; AVX512-NEXT: subq %rbx, %rcx
+; AVX512-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movl $0, %eax
+; AVX512-NEXT: sbbq %rax, %rax
+; AVX512-NEXT: vmovq %xmm0, %rbx
+; AVX512-NEXT: vmovq %xmm1, %rcx
+; AVX512-NEXT: subq %rcx, %rbx
+; AVX512-NEXT: sbbq %r12, %r12
+; AVX512-NEXT: sarq $63, %r12
+; AVX512-NEXT: xorq %r12, %rbx
+; AVX512-NEXT: subq %r12, %rbx
+; AVX512-NEXT: sarq $63, %rax
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX512-NEXT: xorq %rax, %rcx
+; AVX512-NEXT: subq %rax, %rcx
+; AVX512-NEXT: movq %rcx, %rax
+; AVX512-NEXT: sarq $63, %rbp
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX512-NEXT: xorq %rbp, %rcx
+; AVX512-NEXT: subq %rbp, %rcx
+; AVX512-NEXT: sarq $63, %r13
+; AVX512-NEXT: xorq %r13, %rdx
+; AVX512-NEXT: subq %r13, %rdx
+; AVX512-NEXT: sarq $63, %r15
+; AVX512-NEXT: xorq %r15, %rsi
+; AVX512-NEXT: subq %r15, %rsi
+; AVX512-NEXT: sarq $63, %r14
+; AVX512-NEXT: xorq %r14, %rdi
+; AVX512-NEXT: subq %r14, %rdi
+; AVX512-NEXT: sarq $63, %r11
+; AVX512-NEXT: xorq %r11, %r8
+; AVX512-NEXT: subq %r11, %r8
+; AVX512-NEXT: sarq $63, %r10
+; AVX512-NEXT: xorq %r10, %r9
+; AVX512-NEXT: subq %r10, %r9
+; AVX512-NEXT: vmovq %r9, %xmm0
+; AVX512-NEXT: vmovq %r8, %xmm1
+; AVX512-NEXT: vmovq %rdi, %xmm2
+; AVX512-NEXT: vmovq %rsi, %xmm3
+; AVX512-NEXT: vmovq %rdx, %xmm4
+; AVX512-NEXT: vmovq %rcx, %xmm5
+; AVX512-NEXT: vmovq %rax, %xmm6
+; AVX512-NEXT: vmovq %rbx, %xmm7
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm4[0]
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm6[0]
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: popq %r12
+; AVX512-NEXT: popq %r13
+; AVX512-NEXT: popq %r14
+; AVX512-NEXT: popq %r15
+; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: retq
+ %aext = zext <8 x i64> %a to <8 x i128> ; widen to i128; the assertions above record a per-lane scalar sub/sbb/sar/xor sequence rather than vector ops
+ %bext = zext <8 x i64> %b to <8 x i128>
+ %sub = sub <8 x i128> %aext, %bext
+ %abs = call <8 x i128> @llvm.abs.v8i128(<8 x i128> %sub, i1 false) ; i1 false: an INT_MIN input is defined, not poison
+ %trunc = trunc <8 x i128> %abs to <8 x i64> ; |a-b| of two 64-bit unsigned values fits in 64 bits, so no bits are lost
+ ret <8 x i64> %trunc
+}
+
+define <8 x i64> @abd_ext_v8i64_undef(<8 x i64> %a, <8 x i64> %b) nounwind {
+; AVX512-LABEL: abd_ext_v8i64_undef:
+; AVX512: # %bb.0:
+; AVX512-NEXT: pushq %rbp
+; AVX512-NEXT: pushq %r15
+; AVX512-NEXT: pushq %r14
+; AVX512-NEXT: pushq %r13
+; AVX512-NEXT: pushq %r12
+; AVX512-NEXT: pushq %rbx
+; AVX512-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512-NEXT: vmovq %xmm2, %rax
+; AVX512-NEXT: vpextrq $1, %xmm2, %rdx
+; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512-NEXT: vmovq %xmm2, %rsi
+; AVX512-NEXT: vpextrq $1, %xmm2, %rdi
+; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2
+; AVX512-NEXT: vmovq %xmm2, %r8
+; AVX512-NEXT: vpextrq $1, %xmm2, %r9
+; AVX512-NEXT: vpextrq $1, %xmm1, %rbx
+; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512-NEXT: vmovq %xmm2, %rbp
+; AVX512-NEXT: vpextrq $1, %xmm2, %r13
+; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512-NEXT: vmovq %xmm2, %r15
+; AVX512-NEXT: vpextrq $1, %xmm2, %r14
+; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2
+; AVX512-NEXT: vmovq %xmm2, %r11
+; AVX512-NEXT: vpextrq $1, %xmm2, %r10
+; AVX512-NEXT: xorl %r12d, %r12d
+; AVX512-NEXT: subq %r10, %r9
+; AVX512-NEXT: movl $0, %r10d
+; AVX512-NEXT: sbbq %r10, %r10
+; AVX512-NEXT: subq %r11, %r8
+; AVX512-NEXT: movl $0, %r11d
+; AVX512-NEXT: sbbq %r11, %r11
+; AVX512-NEXT: subq %r14, %rdi
+; AVX512-NEXT: movl $0, %r14d
+; AVX512-NEXT: sbbq %r14, %r14
+; AVX512-NEXT: subq %r15, %rsi
+; AVX512-NEXT: movl $0, %r15d
+; AVX512-NEXT: sbbq %r15, %r15
+; AVX512-NEXT: subq %r13, %rdx
+; AVX512-NEXT: movl $0, %r13d
+; AVX512-NEXT: sbbq %r13, %r13
+; AVX512-NEXT: subq %rbp, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movl $0, %ebp
+; AVX512-NEXT: sbbq %rbp, %rbp
+; AVX512-NEXT: subq %rbx, %rcx
+; AVX512-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movl $0, %eax
+; AVX512-NEXT: sbbq %rax, %rax
+; AVX512-NEXT: vmovq %xmm0, %rbx
+; AVX512-NEXT: vmovq %xmm1, %rcx
+; AVX512-NEXT: subq %rcx, %rbx
+; AVX512-NEXT: sbbq %r12, %r12
+; AVX512-NEXT: sarq $63, %r12
+; AVX512-NEXT: xorq %r12, %rbx
+; AVX512-NEXT: subq %r12, %rbx
+; AVX512-NEXT: sarq $63, %rax
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX512-NEXT: xorq %rax, %rcx
+; AVX512-NEXT: subq %rax, %rcx
+; AVX512-NEXT: movq %rcx, %rax
+; AVX512-NEXT: sarq $63, %rbp
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX512-NEXT: xorq %rbp, %rcx
+; AVX512-NEXT: subq %rbp, %rcx
+; AVX512-NEXT: sarq $63, %r13
+; AVX512-NEXT: xorq %r13, %rdx
+; AVX512-NEXT: subq %r13, %rdx
+; AVX512-NEXT: sarq $63, %r15
+; AVX512-NEXT: xorq %r15, %rsi
+; AVX512-NEXT: subq %r15, %rsi
+; AVX512-NEXT: sarq $63, %r14
+; AVX512-NEXT: xorq %r14, %rdi
+; AVX512-NEXT: subq %r14, %rdi
+; AVX512-NEXT: sarq $63, %r11
+; AVX512-NEXT: xorq %r11, %r8
+; AVX512-NEXT: subq %r11, %r8
+; AVX512-NEXT: sarq $63, %r10
+; AVX512-NEXT: xorq %r10, %r9
+; AVX512-NEXT: subq %r10, %r9
+; AVX512-NEXT: vmovq %r9, %xmm0
+; AVX512-NEXT: vmovq %r8, %xmm1
+; AVX512-NEXT: vmovq %rdi, %xmm2
+; AVX512-NEXT: vmovq %rsi, %xmm3
+; AVX512-NEXT: vmovq %rdx, %xmm4
+; AVX512-NEXT: vmovq %rcx, %xmm5
+; AVX512-NEXT: vmovq %rax, %xmm6
+; AVX512-NEXT: vmovq %rbx, %xmm7
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm4[0]
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm6[0]
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: popq %r12
+; AVX512-NEXT: popq %r13
+; AVX512-NEXT: popq %r14
+; AVX512-NEXT: popq %r15
+; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: retq
+ %aext = zext <8 x i64> %a to <8 x i128> ; same zext/sub/abs/trunc pattern as abd_ext_v8i64
+ %bext = zext <8 x i64> %b to <8 x i128>
+ %sub = sub <8 x i128> %aext, %bext
+ %abs = call <8 x i128> @llvm.abs.v8i128(<8 x i128> %sub, i1 true) ; i1 true: an INT_MIN input would be poison (the only IR difference from abd_ext_v8i64)
+ %trunc = trunc <8 x i128> %abs to <8 x i64>
+ ret <8 x i64> %trunc
+}
+
+;
+; sub(umax(a,b),umin(a,b)) -> abdu(a,b)
+;
+
+define <64 x i8> @abd_minmax_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
+; AVX512BW-LABEL: abd_minmax_v64i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpminub %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT: vpmaxub %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsubb %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512DQ-LABEL: abd_minmax_v64i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpminub %ymm1, %ymm0, %ymm2
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; AVX512DQ-NEXT: vpminub %ymm3, %ymm4, %ymm5
+; AVX512DQ-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpsubb %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpmaxub %ymm3, %ymm4, %ymm1
+; AVX512DQ-NEXT: vpsubb %ymm5, %ymm1, %ymm1
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: retq
+ %min = call <64 x i8> @llvm.umin.v64i8(<64 x i8> %a, <64 x i8> %b) ; lane-wise unsigned minimum of the pair
+ %max = call <64 x i8> @llvm.umax.v64i8(<64 x i8> %a, <64 x i8> %b) ; lane-wise unsigned maximum of the same pair
+ %sub = sub <64 x i8> %max, %min ; max - min is the unsigned absolute difference and cannot wrap
+ ret <64 x i8> %sub
+}
+
+define <32 x i16> @abd_minmax_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
+; AVX512BW-LABEL: abd_minmax_v32i16:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpminuw %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsubw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512DQ-LABEL: abd_minmax_v32i16:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpminuw %ymm1, %ymm0, %ymm2
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; AVX512DQ-NEXT: vpminuw %ymm3, %ymm4, %ymm5
+; AVX512DQ-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpsubw %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpmaxuw %ymm3, %ymm4, %ymm1
+; AVX512DQ-NEXT: vpsubw %ymm5, %ymm1, %ymm1
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: retq
+ %min = call <32 x i16> @llvm.umin.v32i16(<32 x i16> %a, <32 x i16> %b) ; lane-wise unsigned minimum
+ %max = call <32 x i16> @llvm.umax.v32i16(<32 x i16> %a, <32 x i16> %b) ; lane-wise unsigned maximum
+ %sub = sub <32 x i16> %max, %min ; unsigned absolute difference, i16 lanes
+ ret <32 x i16> %sub
+}
+
+define <16 x i32> @abd_minmax_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
+; AVX512-LABEL: abd_minmax_v16i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm2
+; AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpsubd %zmm2, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %min = call <16 x i32> @llvm.umin.v16i32(<16 x i32> %a, <16 x i32> %b) ; lane-wise unsigned minimum
+ %max = call <16 x i32> @llvm.umax.v16i32(<16 x i32> %a, <16 x i32> %b) ; lane-wise unsigned maximum
+ %sub = sub <16 x i32> %max, %min ; unsigned absolute difference, i32 lanes
+ ret <16 x i32> %sub
+}
+
+define <8 x i64> @abd_minmax_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
+; AVX512-LABEL: abd_minmax_v8i64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpminuq %zmm1, %zmm0, %zmm2
+; AVX512-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpsubq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %min = call <8 x i64> @llvm.umin.v8i64(<8 x i64> %a, <8 x i64> %b) ; lane-wise unsigned minimum
+ %max = call <8 x i64> @llvm.umax.v8i64(<8 x i64> %a, <8 x i64> %b) ; lane-wise unsigned maximum
+ %sub = sub <8 x i64> %max, %min ; unsigned absolute difference, i64 lanes
+ ret <8 x i64> %sub
+}
+
+declare <64 x i8> @llvm.abs.v64i8(<64 x i8>, i1) ; llvm.abs: the i1 operand selects whether an INT_MIN input yields poison
+declare <32 x i16> @llvm.abs.v32i16(<32 x i16>, i1)
+declare <16 x i32> @llvm.abs.v16i32(<16 x i32>, i1)
+declare <8 x i64> @llvm.abs.v8i64(<8 x i64>, i1)
+declare <16 x i64> @llvm.abs.v16i64(<16 x i64>, i1)
+declare <32 x i64> @llvm.abs.v32i64(<32 x i64>, i1)
+declare <64 x i64> @llvm.abs.v64i64(<64 x i64>, i1)
+declare <8 x i128> @llvm.abs.v8i128(<8 x i128>, i1)
+
+declare <64 x i8> @llvm.umax.v64i8(<64 x i8>, <64 x i8>) ; llvm.umax: lane-wise unsigned maximum
+declare <32 x i16> @llvm.umax.v32i16(<32 x i16>, <32 x i16>)
+declare <16 x i32> @llvm.umax.v16i32(<16 x i32>, <16 x i32>)
+declare <8 x i64> @llvm.umax.v8i64(<8 x i64>, <8 x i64>)
+
+declare <64 x i8> @llvm.umin.v64i8(<64 x i8>, <64 x i8>) ; llvm.umin: lane-wise unsigned minimum
+declare <32 x i16> @llvm.umin.v32i16(<32 x i16>, <32 x i16>)
+declare <16 x i32> @llvm.umin.v16i32(<16 x i32>, <16 x i32>)
+declare <8 x i64> @llvm.umin.v8i64(<8 x i64>, <8 x i64>)
-- 
2.7.4