From: Craig Topper
Date: Mon, 26 Feb 2018 02:16:31 +0000 (+0000)
Subject: [X86] Remove VT.isSimple() check from detectAVGPattern.
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=79d189f5970e43cac372403cdf329d6ba04e6bf1;p=platform%2Fupstream%2Fllvm.git

[X86] Remove VT.isSimple() check from detectAVGPattern.

Which types are considered 'simple' is a function of the requirements of
all targets that LLVM supports. That shouldn't directly affect what types
we are able to handle. The remainder of this code checks that the number
of elements is a power of 2 and takes care of splitting down to a legal
size.

llvm-svn: 326063
---

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 2cce366..f2a88ef 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -34369,7 +34369,7 @@ static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
 static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
                                 const X86Subtarget &Subtarget,
                                 const SDLoc &DL) {
-  if (!VT.isVector() || !VT.isSimple())
+  if (!VT.isVector())
     return SDValue();
   EVT InVT = In.getValueType();
   unsigned NumElems = VT.getVectorNumElements();
diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll
index 8e1e5f3..08911fb 100644
--- a/llvm/test/CodeGen/X86/avg.ll
+++ b/llvm/test/CodeGen/X86/avg.ll
@@ -1651,3 +1651,375 @@ define <64 x i8> @avg_v64i8_3(<64 x i8> %a, <64 x i8> %b) nounwind {
   %res = trunc <64 x i16> %lshr to <64 x i8>
   ret <64 x i8> %res
 }
+
+define <512 x i8> @avg_v512i8_3(<512 x i8> %a, <512 x i8> %b) nounwind {
+; SSE2-LABEL: avg_v512i8_3:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: movdqa %xmm8, 496(%rdi)
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: movdqa %xmm8, 480(%rdi)
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: movdqa %xmm8, 464(%rdi)
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: movdqa %xmm8, 448(%rdi)
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: movdqa %xmm8, 432(%rdi)
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: movdqa %xmm8, 416(%rdi)
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: movdqa %xmm8, 400(%rdi)
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: movdqa %xmm8, 384(%rdi)
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: movdqa %xmm8, 368(%rdi)
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: movdqa %xmm8, 352(%rdi)
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: movdqa %xmm8, 336(%rdi)
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: movdqa %xmm8, 320(%rdi)
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: movdqa %xmm8, 304(%rdi)
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: movdqa %xmm8, 288(%rdi)
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: movdqa %xmm8, 272(%rdi) +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 +; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8 +; SSE2-NEXT: movdqa %xmm8, 256(%rdi) +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 +; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8 +; SSE2-NEXT: movdqa %xmm8, 240(%rdi) +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 +; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8 +; SSE2-NEXT: movdqa %xmm8, 224(%rdi) +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 +; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8 +; SSE2-NEXT: movdqa %xmm8, 208(%rdi) +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 +; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8 +; SSE2-NEXT: movdqa %xmm8, 192(%rdi) +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 +; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8 +; SSE2-NEXT: movdqa %xmm8, 176(%rdi) +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 +; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8 +; SSE2-NEXT: movdqa %xmm8, 160(%rdi) +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 +; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8 +; SSE2-NEXT: movdqa %xmm8, 144(%rdi) +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 +; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8 +; SSE2-NEXT: movdqa %xmm8, 128(%rdi) +; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm7 +; SSE2-NEXT: movdqa %xmm7, 112(%rdi) +; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm6 +; SSE2-NEXT: movdqa %xmm6, 96(%rdi) +; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm5 +; SSE2-NEXT: movdqa %xmm5, 80(%rdi) +; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm4 +; SSE2-NEXT: movdqa %xmm4, 64(%rdi) +; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm3 +; SSE2-NEXT: movdqa %xmm3, 48(%rdi) +; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 32(%rdi) +; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm1 +; SSE2-NEXT: movdqa %xmm1, 16(%rdi) +; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm0 +; SSE2-NEXT: movdqa %xmm0, (%rdi) +; SSE2-NEXT: movq %rdi, %rax +; SSE2-NEXT: retq +; +; AVX1-LABEL: avg_v512i8_3: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: movq %rsp, %rbp +; AVX1-NEXT: andq $-32, %rsp +; AVX1-NEXT: subq $128, %rsp +; AVX1-NEXT: vmovdqa 144(%rbp), %ymm8 +; AVX1-NEXT: vmovdqa 112(%rbp), %ymm9 +; AVX1-NEXT: vmovdqa 80(%rbp), %ymm10 +; AVX1-NEXT: vmovdqa 48(%rbp), %ymm11 +; AVX1-NEXT: vmovdqa 16(%rbp), %ymm12 +; AVX1-NEXT: vmovdqa 272(%rbp), %ymm13 +; AVX1-NEXT: vextractf128 $1, %ymm13, %xmm14 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm15 +; AVX1-NEXT: vpavgb %xmm14, %xmm15, %xmm14 +; AVX1-NEXT: vmovdqa 304(%rbp), %ymm15 +; AVX1-NEXT: vpavgb %xmm13, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm0 +; AVX1-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill +; AVX1-NEXT: vextractf128 $1, %ymm15, %xmm14 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vpavgb %xmm14, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa 336(%rbp), %ymm14 +; AVX1-NEXT: vpavgb %xmm15, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill +; AVX1-NEXT: vextractf128 $1, %ymm14, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 +; AVX1-NEXT: vpavgb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovdqa 368(%rbp), %ymm1 +; AVX1-NEXT: vpavgb %xmm14, %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT: vmovaps %ymm0, (%rsp) # 32-byte Spill +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 +; AVX1-NEXT: vpavgb %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vmovdqa 400(%rbp), %ymm2 +; AVX1-NEXT: vpavgb %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm3 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0 +; 
AVX1-NEXT: vextractf128 $1, %ymm4, %xmm1 +; AVX1-NEXT: vpavgb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovdqa 432(%rbp), %ymm1 +; AVX1-NEXT: vpavgb %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm4 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm2 +; AVX1-NEXT: vpavgb %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vmovdqa 464(%rbp), %ymm2 +; AVX1-NEXT: vpavgb %xmm1, %xmm5, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm5 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm1 +; AVX1-NEXT: vpavgb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovdqa 496(%rbp), %ymm1 +; AVX1-NEXT: vpavgb %xmm2, %xmm6, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm6 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm2 +; AVX1-NEXT: vpavgb %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vmovdqa 528(%rbp), %ymm2 +; AVX1-NEXT: vpavgb %xmm1, %xmm7, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm7 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm12, %xmm1 +; AVX1-NEXT: vpavgb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovdqa 560(%rbp), %ymm1 +; AVX1-NEXT: vpavgb %xmm2, %xmm12, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm12 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm11, %xmm2 +; AVX1-NEXT: vpavgb %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vmovdqa 592(%rbp), %ymm2 +; AVX1-NEXT: vpavgb %xmm1, %xmm11, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm11 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm10, %xmm1 +; AVX1-NEXT: vpavgb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovdqa 624(%rbp), %ymm1 +; AVX1-NEXT: vpavgb %xmm2, %xmm10, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm10 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm2 +; AVX1-NEXT: vpavgb %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vmovdqa 656(%rbp), %ymm2 +; AVX1-NEXT: vpavgb %xmm1, %xmm9, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm9 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm1 +; AVX1-NEXT: vpavgb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovdqa 176(%rbp), %ymm1 +; AVX1-NEXT: vpavgb %xmm2, %xmm8, %xmm2 +; AVX1-NEXT: vmovdqa 688(%rbp), %ymm8 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm13 +; AVX1-NEXT: vpavgb %xmm2, %xmm13, %xmm2 +; AVX1-NEXT: vpavgb %xmm8, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa 208(%rbp), %ymm8 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm13 +; AVX1-NEXT: vmovdqa 720(%rbp), %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm15 +; AVX1-NEXT: vpavgb %xmm1, %xmm15, %xmm1 +; AVX1-NEXT: vpavgb %xmm2, %xmm8, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vmovdqa 240(%rbp), %ymm15 +; AVX1-NEXT: vmovdqa 752(%rbp), %ymm8 +; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm15, %xmm14 +; AVX1-NEXT: vpavgb %xmm2, %xmm14, %xmm2 +; AVX1-NEXT: vpavgb %xmm8, %xmm15, %xmm8 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm8, %ymm2 +; AVX1-NEXT: vmovaps %ymm2, 480(%rdi) +; AVX1-NEXT: vmovaps %ymm1, 448(%rdi) +; AVX1-NEXT: vmovaps %ymm13, 416(%rdi) +; AVX1-NEXT: vmovaps %ymm0, 384(%rdi) +; AVX1-NEXT: vmovaps %ymm9, 352(%rdi) +; AVX1-NEXT: vmovaps %ymm10, 320(%rdi) +; AVX1-NEXT: vmovaps %ymm11, 288(%rdi) +; AVX1-NEXT: vmovaps %ymm12, 256(%rdi) +; AVX1-NEXT: vmovaps %ymm7, 224(%rdi) +; AVX1-NEXT: vmovaps %ymm6, 192(%rdi) +; 
AVX1-NEXT: vmovaps %ymm5, 160(%rdi) +; AVX1-NEXT: vmovaps %ymm4, 128(%rdi) +; AVX1-NEXT: vmovaps %ymm3, 96(%rdi) +; AVX1-NEXT: vmovaps (%rsp), %ymm0 # 32-byte Reload +; AVX1-NEXT: vmovaps %ymm0, 64(%rdi) +; AVX1-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload +; AVX1-NEXT: vmovaps %ymm0, 32(%rdi) +; AVX1-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload +; AVX1-NEXT: vmovaps %ymm0, (%rdi) +; AVX1-NEXT: movq %rdi, %rax +; AVX1-NEXT: movq %rbp, %rsp +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: avg_v512i8_3: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $32, %rsp +; AVX2-NEXT: vmovdqa 240(%rbp), %ymm8 +; AVX2-NEXT: vmovdqa 208(%rbp), %ymm9 +; AVX2-NEXT: vmovdqa 176(%rbp), %ymm10 +; AVX2-NEXT: vmovdqa 144(%rbp), %ymm11 +; AVX2-NEXT: vmovdqa 112(%rbp), %ymm12 +; AVX2-NEXT: vmovdqa 80(%rbp), %ymm13 +; AVX2-NEXT: vmovdqa 48(%rbp), %ymm14 +; AVX2-NEXT: vmovdqa 16(%rbp), %ymm15 +; AVX2-NEXT: vpavgb 272(%rbp), %ymm0, %ymm0 +; AVX2-NEXT: vpavgb 304(%rbp), %ymm1, %ymm1 +; AVX2-NEXT: vpavgb 336(%rbp), %ymm2, %ymm2 +; AVX2-NEXT: vpavgb 368(%rbp), %ymm3, %ymm3 +; AVX2-NEXT: vpavgb 400(%rbp), %ymm4, %ymm4 +; AVX2-NEXT: vpavgb 432(%rbp), %ymm5, %ymm5 +; AVX2-NEXT: vpavgb 464(%rbp), %ymm6, %ymm6 +; AVX2-NEXT: vpavgb 496(%rbp), %ymm7, %ymm7 +; AVX2-NEXT: vpavgb 528(%rbp), %ymm15, %ymm15 +; AVX2-NEXT: vpavgb 560(%rbp), %ymm14, %ymm14 +; AVX2-NEXT: vpavgb 592(%rbp), %ymm13, %ymm13 +; AVX2-NEXT: vpavgb 624(%rbp), %ymm12, %ymm12 +; AVX2-NEXT: vpavgb 656(%rbp), %ymm11, %ymm11 +; AVX2-NEXT: vpavgb 688(%rbp), %ymm10, %ymm10 +; AVX2-NEXT: vpavgb 720(%rbp), %ymm9, %ymm9 +; AVX2-NEXT: vpavgb 752(%rbp), %ymm8, %ymm8 +; AVX2-NEXT: vmovdqa %ymm8, 480(%rdi) +; AVX2-NEXT: vmovdqa %ymm9, 448(%rdi) +; AVX2-NEXT: vmovdqa %ymm10, 416(%rdi) +; AVX2-NEXT: vmovdqa %ymm11, 384(%rdi) +; AVX2-NEXT: vmovdqa %ymm12, 352(%rdi) +; AVX2-NEXT: vmovdqa %ymm13, 320(%rdi) +; AVX2-NEXT: vmovdqa %ymm14, 288(%rdi) +; AVX2-NEXT: vmovdqa %ymm15, 256(%rdi) +; AVX2-NEXT: vmovdqa %ymm7, 224(%rdi) +; AVX2-NEXT: vmovdqa %ymm6, 192(%rdi) +; AVX2-NEXT: vmovdqa %ymm5, 160(%rdi) +; AVX2-NEXT: vmovdqa %ymm4, 128(%rdi) +; AVX2-NEXT: vmovdqa %ymm3, 96(%rdi) +; AVX2-NEXT: vmovdqa %ymm2, 64(%rdi) +; AVX2-NEXT: vmovdqa %ymm1, 32(%rdi) +; AVX2-NEXT: vmovdqa %ymm0, (%rdi) +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: movq %rbp, %rsp +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: avg_v512i8_3: +; AVX512F: # %bb.0: +; AVX512F-NEXT: pushq %rbp +; AVX512F-NEXT: movq %rsp, %rbp +; AVX512F-NEXT: andq $-32, %rsp +; AVX512F-NEXT: subq $32, %rsp +; AVX512F-NEXT: vmovdqa 240(%rbp), %ymm8 +; AVX512F-NEXT: vmovdqa 208(%rbp), %ymm9 +; AVX512F-NEXT: vmovdqa 176(%rbp), %ymm10 +; AVX512F-NEXT: vmovdqa 144(%rbp), %ymm11 +; AVX512F-NEXT: vmovdqa 112(%rbp), %ymm12 +; AVX512F-NEXT: vmovdqa 80(%rbp), %ymm13 +; AVX512F-NEXT: vmovdqa 48(%rbp), %ymm14 +; AVX512F-NEXT: vmovdqa 16(%rbp), %ymm15 +; AVX512F-NEXT: vpavgb 272(%rbp), %ymm0, %ymm0 +; AVX512F-NEXT: vpavgb 304(%rbp), %ymm1, %ymm1 +; AVX512F-NEXT: vpavgb 336(%rbp), %ymm2, %ymm2 +; AVX512F-NEXT: vpavgb 368(%rbp), %ymm3, %ymm3 +; AVX512F-NEXT: vpavgb 400(%rbp), %ymm4, %ymm4 +; AVX512F-NEXT: vpavgb 432(%rbp), %ymm5, %ymm5 +; AVX512F-NEXT: vpavgb 464(%rbp), %ymm6, %ymm6 +; AVX512F-NEXT: vpavgb 496(%rbp), %ymm7, %ymm7 +; AVX512F-NEXT: vpavgb 528(%rbp), %ymm15, %ymm15 +; AVX512F-NEXT: vpavgb 560(%rbp), %ymm14, %ymm14 +; AVX512F-NEXT: vpavgb 592(%rbp), %ymm13, %ymm13 +; 
AVX512F-NEXT: vpavgb 624(%rbp), %ymm12, %ymm12 +; AVX512F-NEXT: vpavgb 656(%rbp), %ymm11, %ymm11 +; AVX512F-NEXT: vpavgb 688(%rbp), %ymm10, %ymm10 +; AVX512F-NEXT: vpavgb 720(%rbp), %ymm9, %ymm9 +; AVX512F-NEXT: vpavgb 752(%rbp), %ymm8, %ymm8 +; AVX512F-NEXT: vmovdqa %ymm8, 480(%rdi) +; AVX512F-NEXT: vmovdqa %ymm9, 448(%rdi) +; AVX512F-NEXT: vmovdqa %ymm10, 416(%rdi) +; AVX512F-NEXT: vmovdqa %ymm11, 384(%rdi) +; AVX512F-NEXT: vmovdqa %ymm12, 352(%rdi) +; AVX512F-NEXT: vmovdqa %ymm13, 320(%rdi) +; AVX512F-NEXT: vmovdqa %ymm14, 288(%rdi) +; AVX512F-NEXT: vmovdqa %ymm15, 256(%rdi) +; AVX512F-NEXT: vmovdqa %ymm7, 224(%rdi) +; AVX512F-NEXT: vmovdqa %ymm6, 192(%rdi) +; AVX512F-NEXT: vmovdqa %ymm5, 160(%rdi) +; AVX512F-NEXT: vmovdqa %ymm4, 128(%rdi) +; AVX512F-NEXT: vmovdqa %ymm3, 96(%rdi) +; AVX512F-NEXT: vmovdqa %ymm2, 64(%rdi) +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdi) +; AVX512F-NEXT: vmovdqa %ymm0, (%rdi) +; AVX512F-NEXT: movq %rdi, %rax +; AVX512F-NEXT: movq %rbp, %rsp +; AVX512F-NEXT: popq %rbp +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: avg_v512i8_3: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: pushq %rbp +; AVX512BW-NEXT: movq %rsp, %rbp +; AVX512BW-NEXT: andq $-64, %rsp +; AVX512BW-NEXT: subq $64, %rsp +; AVX512BW-NEXT: vpavgb 16(%rbp), %zmm0, %zmm0 +; AVX512BW-NEXT: vpavgb 80(%rbp), %zmm1, %zmm1 +; AVX512BW-NEXT: vpavgb 144(%rbp), %zmm2, %zmm2 +; AVX512BW-NEXT: vpavgb 208(%rbp), %zmm3, %zmm3 +; AVX512BW-NEXT: vpavgb 272(%rbp), %zmm4, %zmm4 +; AVX512BW-NEXT: vpavgb 336(%rbp), %zmm5, %zmm5 +; AVX512BW-NEXT: vpavgb 400(%rbp), %zmm6, %zmm6 +; AVX512BW-NEXT: vpavgb 464(%rbp), %zmm7, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm7, 448(%rdi) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdi) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rdi) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 256(%rdi) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdi) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rdi) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rdi) +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdi) +; AVX512BW-NEXT: movq %rdi, %rax +; AVX512BW-NEXT: movq %rbp, %rsp +; AVX512BW-NEXT: popq %rbp +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %za = zext <512 x i8> %a to <512 x i16> + %zb = zext <512 x i8> %b to <512 x i16> + %add = add nuw nsw <512 x i16> %za, %zb + %add1 = add nuw nsw <512 x i16> %add, + %lshr = lshr <512 x i16> %add1, + %res = trunc <512 x i16> %lshr to <512 x i8> + ret <512 x i8> %res +} diff --git a/llvm/test/CodeGen/X86/bitcast-setcc-128.ll b/llvm/test/CodeGen/X86/bitcast-setcc-128.ll index 26ba69a..2deb32d 100644 --- a/llvm/test/CodeGen/X86/bitcast-setcc-128.ll +++ b/llvm/test/CodeGen/X86/bitcast-setcc-128.ll @@ -641,3 +641,179 @@ define i8 @v8i8(<8 x i8> %a, <8 x i8> %b) { %res = bitcast <8 x i1> %x to i8 ret i8 %res } + +define i64 @v16i8_widened_with_zeroes(<16 x i8> %a, <16 x i8> %b) { +; SSE2-SSSE3-LABEL: v16i8_widened_with_zeroes: +; SSE2-SSSE3: # %bb.0: # %entry +; SSE2-SSSE3-NEXT: pcmpeqb %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: andl $1, %eax +; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: leal (%rcx,%rax,2), %eax +; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: leal (%rax,%rcx,4), %eax +; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: leal (%rax,%rcx,8), %eax +; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE2-SSSE3-NEXT: andl $1, 
%ecx +; SSE2-SSSE3-NEXT: shll $4, %ecx +; SSE2-SSSE3-NEXT: orl %eax, %ecx +; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: andl $1, %eax +; SSE2-SSSE3-NEXT: shll $5, %eax +; SSE2-SSSE3-NEXT: orl %ecx, %eax +; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: shll $6, %ecx +; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx +; SSE2-SSSE3-NEXT: andl $1, %edx +; SSE2-SSSE3-NEXT: shll $7, %edx +; SSE2-SSSE3-NEXT: orl %ecx, %edx +; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: shll $8, %ecx +; SSE2-SSSE3-NEXT: orl %edx, %ecx +; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx +; SSE2-SSSE3-NEXT: andl $1, %edx +; SSE2-SSSE3-NEXT: shll $9, %edx +; SSE2-SSSE3-NEXT: orl %ecx, %edx +; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: shll $10, %ecx +; SSE2-SSSE3-NEXT: orl %edx, %ecx +; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx +; SSE2-SSSE3-NEXT: andl $1, %edx +; SSE2-SSSE3-NEXT: shll $11, %edx +; SSE2-SSSE3-NEXT: orl %ecx, %edx +; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: shll $12, %ecx +; SSE2-SSSE3-NEXT: orl %edx, %ecx +; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx +; SSE2-SSSE3-NEXT: andl $1, %edx +; SSE2-SSSE3-NEXT: shll $13, %edx +; SSE2-SSSE3-NEXT: orl %ecx, %edx +; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: shll $14, %ecx +; SSE2-SSSE3-NEXT: orl %edx, %ecx +; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx +; SSE2-SSSE3-NEXT: shll $15, %edx +; SSE2-SSSE3-NEXT: orl %ecx, %edx +; SSE2-SSSE3-NEXT: orl %eax, %edx +; SSE2-SSSE3-NEXT: movw %dx, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %ecx +; SSE2-SSSE3-NEXT: movw $0, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %edx +; SSE2-SSSE3-NEXT: movl %edx, %eax +; SSE2-SSSE3-NEXT: shll $16, %eax +; SSE2-SSSE3-NEXT: orl %eax, %edx +; SSE2-SSSE3-NEXT: shlq $32, %rdx +; SSE2-SSSE3-NEXT: orl %ecx, %eax +; SSE2-SSSE3-NEXT: orq %rdx, %rax +; SSE2-SSSE3-NEXT: retq +; +; AVX12-LABEL: v16i8_widened_with_zeroes: +; AVX12: # %bb.0: # %entry +; AVX12-NEXT: pushq %rbp +; AVX12-NEXT: .cfi_def_cfa_offset 16 +; AVX12-NEXT: .cfi_offset %rbp, -16 +; AVX12-NEXT: movq %rsp, %rbp +; AVX12-NEXT: .cfi_def_cfa_register %rbp +; AVX12-NEXT: andq $-32, %rsp +; AVX12-NEXT: subq $64, %rsp +; AVX12-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vpextrb $1, %xmm0, %eax +; AVX12-NEXT: andl $1, %eax +; AVX12-NEXT: vpextrb $0, %xmm0, %ecx +; AVX12-NEXT: andl $1, %ecx +; AVX12-NEXT: leal (%rcx,%rax,2), %eax +; AVX12-NEXT: vpextrb $2, %xmm0, %ecx +; AVX12-NEXT: andl $1, %ecx +; AVX12-NEXT: leal (%rax,%rcx,4), %eax +; AVX12-NEXT: vpextrb $3, %xmm0, %ecx +; AVX12-NEXT: andl $1, %ecx +; AVX12-NEXT: leal (%rax,%rcx,8), %eax +; AVX12-NEXT: vpextrb $4, %xmm0, %ecx +; AVX12-NEXT: andl $1, %ecx +; AVX12-NEXT: shll $4, %ecx +; AVX12-NEXT: orl %eax, %ecx +; AVX12-NEXT: vpextrb $5, %xmm0, %eax +; AVX12-NEXT: andl $1, %eax +; AVX12-NEXT: shll $5, %eax +; AVX12-NEXT: orl %ecx, %eax +; AVX12-NEXT: vpextrb $6, %xmm0, %ecx +; AVX12-NEXT: andl $1, %ecx +; AVX12-NEXT: shll $6, %ecx +; AVX12-NEXT: vpextrb $7, %xmm0, %edx +; AVX12-NEXT: andl $1, %edx +; AVX12-NEXT: shll $7, %edx +; AVX12-NEXT: orl %ecx, %edx +; AVX12-NEXT: vpextrb $8, %xmm0, %ecx +; AVX12-NEXT: andl $1, %ecx +; AVX12-NEXT: shll $8, %ecx +; AVX12-NEXT: orl %edx, %ecx +; AVX12-NEXT: vpextrb $9, 
%xmm0, %edx +; AVX12-NEXT: andl $1, %edx +; AVX12-NEXT: shll $9, %edx +; AVX12-NEXT: orl %ecx, %edx +; AVX12-NEXT: vpextrb $10, %xmm0, %ecx +; AVX12-NEXT: andl $1, %ecx +; AVX12-NEXT: shll $10, %ecx +; AVX12-NEXT: orl %edx, %ecx +; AVX12-NEXT: vpextrb $11, %xmm0, %edx +; AVX12-NEXT: andl $1, %edx +; AVX12-NEXT: shll $11, %edx +; AVX12-NEXT: orl %ecx, %edx +; AVX12-NEXT: vpextrb $12, %xmm0, %ecx +; AVX12-NEXT: andl $1, %ecx +; AVX12-NEXT: shll $12, %ecx +; AVX12-NEXT: orl %edx, %ecx +; AVX12-NEXT: vpextrb $13, %xmm0, %edx +; AVX12-NEXT: andl $1, %edx +; AVX12-NEXT: shll $13, %edx +; AVX12-NEXT: orl %ecx, %edx +; AVX12-NEXT: vpextrb $14, %xmm0, %ecx +; AVX12-NEXT: andl $1, %ecx +; AVX12-NEXT: shll $14, %ecx +; AVX12-NEXT: orl %edx, %ecx +; AVX12-NEXT: vpextrb $15, %xmm0, %edx +; AVX12-NEXT: andl $1, %edx +; AVX12-NEXT: shll $15, %edx +; AVX12-NEXT: orl %ecx, %edx +; AVX12-NEXT: orl %eax, %edx +; AVX12-NEXT: movl %edx, (%rsp) +; AVX12-NEXT: movl $0, {{[0-9]+}}(%rsp) +; AVX12-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; AVX12-NEXT: shlq $32, %rcx +; AVX12-NEXT: movl (%rsp), %eax +; AVX12-NEXT: orq %rcx, %rax +; AVX12-NEXT: movq %rbp, %rsp +; AVX12-NEXT: popq %rbp +; AVX12-NEXT: retq +; +; AVX512F-LABEL: v16i8_widened_with_zeroes: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: movzwl %ax, %eax +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: v16i8_widened_with_zeroes: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 +; AVX512BW-NEXT: kmovq %k0, %rax +; AVX512BW-NEXT: retq +entry: + %c = icmp eq <16 x i8> %a, %b + %d = shufflevector <16 x i1> %c, <16 x i1> zeroinitializer, <64 x i32> + %e = bitcast <64 x i1> %d to i64 + ret i64 %e +}
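
Note on the change: the commit message above argues that detectAVGPattern does not need VT.isSimple(), because the guards that remain already restrict the combine to vectors whose element count is a power of two, and later code splits an over-wide vector down to a legal width. That is exactly what the new avg_v512i8_3 test exercises: avg(a, b) = (a + b + 1) >> 1 computed in i16 still collapses to pavgb over legal-sized pieces even though <512 x i8> is wider than any simple MVT. As a rough illustration only, here is a simplified C++ sketch of that kind of guard; the function name looksLikeAVGCandidate, the exact element-type restriction, and the header choices are assumptions made for this example and are not the actual code in X86ISelLowering.cpp.

// Hedged sketch only: a simplified stand-in for the guards the commit
// message describes, not the real detectAVGPattern() implementation.
#include "llvm/CodeGen/ValueTypes.h"  // EVT, MVT
#include "llvm/Support/MathExtras.h"  // isPowerOf2_32

using namespace llvm;

// A truncating-average candidate only needs to be a vector of i8 or i16
// elements whose element count is a power of two. Whether the type is a
// "simple" MVT is irrelevant: a non-simple type such as v512i8 can be
// halved repeatedly until it reaches a width that is legal for PAVGB/PAVGW.
static bool looksLikeAVGCandidate(EVT VT) {
  if (!VT.isVector())
    return false;
  EVT EltVT = VT.getVectorElementType();
  if (EltVT != MVT::i8 && EltVT != MVT::i16)
    return false;
  // Power-of-two element counts split evenly down to a legal vector size.
  return isPowerOf2_32(VT.getVectorNumElements());
}

Under that reading, the <512 x i8> test covers the case the old VT.isSimple() bail-out used to reject: the pattern is still matched, and lowering handles the width by splitting, as the pavgb sequences in the SSE2/AVX check lines show.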