From e4fe866949951c8eb79c5ebdb0a6dab37cef37a9 Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Tue, 10 Aug 2010 17:06:05 -0400 Subject: [PATCH] Added ssse3 version of sixtap filters Improved decoder performance by 9% for the clip used. Change-Id: I8fc5609213b7bef10248372595dc85b29f9895b9 --- vp8/common/x86/subpixel_ssse3.asm | 957 +++++++++++++++++++++++++++++++++++ vp8/common/x86/subpixel_x86.h | 33 ++ vp8/common/x86/vp8_asm_stubs.c | 169 +++++++ vp8/common/x86/x86_systemdependent.c | 13 + vp8/vp8_common.mk | 1 + 5 files changed, 1173 insertions(+) create mode 100644 vp8/common/x86/subpixel_ssse3.asm diff --git a/vp8/common/x86/subpixel_ssse3.asm b/vp8/common/x86/subpixel_ssse3.asm new file mode 100644 index 0000000..f45c754 --- /dev/null +++ b/vp8/common/x86/subpixel_ssse3.asm @@ -0,0 +1,957 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +%define BLOCK_HEIGHT_WIDTH 4 +%define VP8_FILTER_WEIGHT 128 +%define VP8_FILTER_SHIFT 7 + + +;/************************************************************************************ +; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The +; input pixel array has output_height rows. This routine assumes that output_height is an +; even number. This function handles 8 pixels in horizontal direction, calculating ONE +; row each iteration to take advantage of the 128-bit operations. 
+;*************************************************************************************/ +;void vp8_filter_block1d8_h6_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; unsigned int vp8_filter_index +;) +; +; Horizontal pass for 8-pixel-wide rows. Each loop iteration loads 16 source +; bytes starting at src_ptr - 2 and stores 8 filtered bytes at output_ptr. +; vp8_filter_index * 16 selects one 16-byte row in each of the k0_k5, k1_k3 +; (+128) and k2_k4 (+256) tables. If the first dword of the selected k0_k5 +; row is zero, the 4-tap path at vp8_filter_block1d8_h4_ssse3 runs instead. +; xmm6/xmm7 are overwritten here, so the prolog saves them with SAVE_XMM and +; both epilogs (6-tap and 4-tap) restore them with RESTORE_XMM. +global sym(vp8_filter_block1d8_h6_ssse3) +sym(vp8_filter_block1d8_h6_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM + GET_GOT rbx + push rsi + push rdi + ; end prolog + + movsxd rdx, DWORD PTR arg(5) ;table index + xor rsi, rsi ;esi = 0 for the k0_k5 test below + shl rdx, 4 ;16 bytes per table row + + movdqa xmm7, [rd GLOBAL] ;rounding constant added before the shift + + lea rax, [k0_k5 GLOBAL] + add rax, rdx + mov rdi, arg(2) ;output_ptr + + cmp esi, DWORD PTR [rax] ;first dword of the k0_k5 row zero? + je vp8_filter_block1d8_h4_ssse3 ;then use the 4-tap path + + movdqa xmm4, XMMWORD PTR [rax] ;k0_k5 + movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 + movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 + + mov rsi, arg(0) ;src_ptr + movsxd rax, dword ptr arg(1) ;src_pixels_per_line + movsxd rcx, dword ptr arg(4) ;output_height + + movsxd rdx, dword ptr arg(3) ;output_pitch + + sub rdi, rdx ;pre-bias: the loop adds output_pitch before each store +;xmm3 free +filter_block1d8_h6_rowloop_ssse3: + movdqu xmm0, XMMWORD PTR [rsi - 2] + + movdqa xmm1, xmm0 + pshufb xmm0, [shuf1b GLOBAL] + + movdqa xmm2, xmm1 + pshufb xmm1, [shuf2b GLOBAL] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm5 + + pshufb xmm2, [shuf3b GLOBAL] + add rdi, rdx + pmaddubsw xmm2, xmm6 + + lea rsi, [rsi + rax] + dec rcx ;sets ZF for the jnz below; nothing in between writes flags + paddsw xmm0, xmm1 + paddsw xmm0, xmm7 + paddsw xmm0, xmm2 + psraw xmm0, 7 + packuswb xmm0, xmm0 + + movq MMWORD Ptr [rdi], xmm0 + jnz filter_block1d8_h6_rowloop_ssse3 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +vp8_filter_block1d8_h4_ssse3: + movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 + movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 + + movdqa xmm3, XMMWORD PTR [shuf2b GLOBAL] + movdqa xmm4, XMMWORD PTR [shuf3b GLOBAL] + + mov rsi, arg(0) ;src_ptr + + movsxd rax, dword ptr arg(1) ;src_pixels_per_line + movsxd rcx, dword ptr arg(4) ;output_height + + 
movsxd rdx, dword ptr arg(3) ;output_pitch + + sub rdi, rdx ;pre-bias: the loop adds output_pitch before each store +;xmm1 free (xmm3/xmm4 hold the shuf2b/shuf3b masks) +filter_block1d8_h4_rowloop_ssse3: + movdqu xmm0, XMMWORD PTR [rsi - 2] + + movdqa xmm2, xmm0 + pshufb xmm0, xmm3 ;[shuf2b GLOBAL] + pshufb xmm2, xmm4 ;[shuf3b GLOBAL] + + pmaddubsw xmm0, xmm5 + add rdi, rdx + pmaddubsw xmm2, xmm6 + + lea rsi, [rsi + rax] + dec rcx ;sets ZF for the jnz below; nothing in between writes flags + paddsw xmm0, xmm7 + paddsw xmm0, xmm2 + psraw xmm0, 7 + packuswb xmm0, xmm0 + + movq MMWORD Ptr [rdi], xmm0 + + jnz filter_block1d8_h4_rowloop_ssse3 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret +;void vp8_filter_block1d16_h6_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; unsigned int vp8_filter_index +;) +; +; Horizontal pass for 16-pixel-wide rows. Two 16-byte loads per row (at +; src_ptr - 2 and src_ptr + 6) each yield 8 filtered bytes; the halves are +; combined and written with one movdqa, which requires every output row +; address to be 16-byte aligned. +; The 4-tap shortcut test is commented out below, so the code at +; vp8_filter_block1d16_h4_ssse3 is not reached from this entry point as written. +; SAVE_XMM in the prolog is paired with RESTORE_XMM in each epilog. +global sym(vp8_filter_block1d16_h6_ssse3) +sym(vp8_filter_block1d16_h6_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM + GET_GOT rbx + push rsi + push rdi + ; end prolog + + movsxd rdx, DWORD PTR arg(5) ;table index + xor rsi, rsi + shl rdx, 4 ; + + lea rax, [k0_k5 GLOBAL] + add rax, rdx + + mov rdi, arg(2) ;output_ptr + movdqa xmm7, [rd GLOBAL] + +;; +;; cmp esi, DWORD PTR [rax] +;; je vp8_filter_block1d16_h4_ssse3 + + mov rsi, arg(0) ;src_ptr + + movdqa xmm4, XMMWORD PTR [rax] ;k0_k5 + movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 + movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 + + movsxd rax, dword ptr arg(1) ;src_pixels_per_line + movsxd rcx, dword ptr arg(4) ;output_height + movsxd rdx, dword ptr arg(3) ;output_pitch + +filter_block1d16_h6_rowloop_ssse3: + movdqu xmm0, XMMWORD PTR [rsi - 2] + + movdqa xmm1, xmm0 + pshufb xmm0, [shuf1b GLOBAL] + movdqa xmm2, xmm1 + pmaddubsw xmm0, xmm4 + pshufb xmm1, [shuf2b GLOBAL] + pshufb xmm2, [shuf3b GLOBAL] + pmaddubsw xmm1, xmm5 + + movdqu xmm3, XMMWORD PTR [rsi + 6] + + pmaddubsw xmm2, xmm6 + paddsw xmm0, xmm1 + movdqa xmm1, xmm3 + pshufb xmm3, [shuf1b GLOBAL] + paddsw xmm0, xmm7 + pmaddubsw xmm3, xmm4 + paddsw 
xmm0, xmm2 + movdqa xmm2, xmm1 + pshufb xmm1, [shuf2b GLOBAL] + pshufb xmm2, [shuf3b GLOBAL] + pmaddubsw xmm1, xmm5 + pmaddubsw xmm2, xmm6 + + psraw xmm0, 7 + packuswb xmm0, xmm0 + lea rsi, [rsi + rax] + paddsw xmm3, xmm1 + paddsw xmm3, xmm7 + paddsw xmm3, xmm2 + psraw xmm3, 7 + packuswb xmm3, xmm3 + + punpcklqdq xmm0, xmm3 + + movdqa XMMWORD Ptr [rdi], xmm0 + + add rdi, rdx + dec rcx + jnz filter_block1d16_h6_rowloop_ssse3 + + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +vp8_filter_block1d16_h4_ssse3: + movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 + movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 + + mov rsi, arg(0) ;src_ptr + movsxd rax, dword ptr arg(1) ;src_pixels_per_line + movsxd rcx, dword ptr arg(4) ;output_height + movsxd rdx, dword ptr arg(3) ;output_pitch + +filter_block1d16_h4_rowloop_ssse3: + movdqu xmm1, XMMWORD PTR [rsi - 2] + + movdqa xmm2, xmm1 + pshufb xmm1, [shuf2b GLOBAL] + pshufb xmm2, [shuf3b GLOBAL] + pmaddubsw xmm1, xmm5 + + movdqu xmm3, XMMWORD PTR [rsi + 6] + + pmaddubsw xmm2, xmm6 + movdqa xmm0, xmm3 + pshufb xmm3, [shuf3b GLOBAL] + pshufb xmm0, [shuf2b GLOBAL] + + paddsw xmm1, xmm7 + paddsw xmm1, xmm2 + + pmaddubsw xmm0, xmm5 + pmaddubsw xmm3, xmm6 + + psraw xmm1, 7 + packuswb xmm1, xmm1 + lea rsi, [rsi + rax] + paddsw xmm3, xmm0 + paddsw xmm3, xmm7 + psraw xmm3, 7 + packuswb xmm3, xmm3 + + punpcklqdq xmm1, xmm3 + + movdqa XMMWORD Ptr [rdi], xmm1 + + add rdi, rdx + dec rcx + jnz filter_block1d16_h4_rowloop_ssse3 + + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_filter_block1d4_h6_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; unsigned int vp8_filter_index +;) +; NOTE(review): this routine overwrites xmm6/xmm7 but has no SAVE_XMM/RESTORE_XMM +; pair; confirm whether that is acceptable on ABIs where they are callee-saved. +global sym(vp8_filter_block1d4_h6_ssse3) +sym(vp8_filter_block1d4_h6_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + 
+ movsxd rdx, DWORD PTR arg(5) ;table index + mov rsi, arg(0) ;src_ptr + shl rdx, 4 ; + + lea rax, [k0_k5 GLOBAL] + add rax, rdx + movdqa xmm7, [rd GLOBAL] + + + + + + movdqa xmm4, XMMWORD PTR [rax] ;k0_k5 + movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 + movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 + + mov rdi, arg(2) ;output_ptr + movsxd rax, dword ptr arg(1) ;src_pixels_per_line + movsxd rcx, dword ptr arg(4) ;output_height + + movsxd rdx, dword ptr arg(3) ;output_pitch + +;xmm3 free +filter_block1d4_h6_rowloop_ssse3: + movdqu xmm0, XMMWORD PTR [rsi - 2] + + movdqa xmm1, xmm0 + pshufb xmm0, [shuf1b GLOBAL] + + movdqa xmm2, xmm1 + pshufb xmm1, [shuf2b GLOBAL] + pmaddubsw xmm0, xmm4 + pshufb xmm2, [shuf3b GLOBAL] + pmaddubsw xmm1, xmm5 + +;-- + pmaddubsw xmm2, xmm6 + + lea rsi, [rsi + rax] +;-- + paddsw xmm0, xmm1 + paddsw xmm0, xmm7 + pxor xmm1, xmm1 + paddsw xmm0, xmm2 + psraw xmm0, 7 + packuswb xmm0, xmm0 + +; NOTE(review): the packed bytes are widened back to 16-bit words before the 8-byte store, so each row writes four words rather than four bytes; confirm the intended output format + punpcklbw xmm0, xmm1 + + movq MMWORD PTR [rdi], xmm0 + add rdi, rdx + dec rcx + jnz filter_block1d4_h6_rowloop_ssse3 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_filter_block1d16_v6_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; unsigned int vp8_filter_index +;) +; +; Vertical pass for 16-pixel-wide blocks, processed as two 8-byte halves per +; output row. A..F are six consecutive source rows starting at src_ptr; the +; pairs B/D, C/E and A/F are interleaved and multiplied by the k1_k3, k2_k4 +; and k0_k5 rows selected by vp8_filter_index * 16. +; If the first dword of the k0_k5 row is zero, the 4-tap path at +; vp8_filter_block1d16_v4_ssse3 runs (rows B..E only); that path writes each +; output row with one movdqa, which requires a 16-byte-aligned address. +; xmm6/xmm7 are overwritten, so the prolog saves them with SAVE_XMM and both +; epilogs restore them with RESTORE_XMM. +global sym(vp8_filter_block1d16_v6_ssse3) +sym(vp8_filter_block1d16_v6_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM + GET_GOT rbx + push rsi + push rdi + ; end prolog + + movsxd rdx, DWORD PTR arg(5) ;table index + xor rsi, rsi ;esi = 0 for the k0_k5 test below + shl rdx, 4 ; + + lea rax, [k0_k5 GLOBAL] + add rax, rdx + + cmp esi, DWORD PTR [rax] + je vp8_filter_block1d16_v4_ssse3 + + movdqa xmm5, XMMWORD PTR [rax] ;k0_k5 + movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4 + movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3 + + mov rsi, arg(0) ;src_ptr + movsxd rdx, DWORD PTR arg(1) ;pixels_per_line + mov rdi, arg(2) ;output_ptr + +%if ABI_IS_32BIT=0 + movsxd r8, DWORD PTR 
arg(3) ;out_pitch +%endif + mov rax, rsi + movsxd rcx, DWORD PTR arg(4) ;output_height + add rax, rdx ;rax = src_ptr + pitch, used to reach rows D and F + + +vp8_filter_block1d16_v6_ssse3_loop: + movq xmm1, MMWORD PTR [rsi] ;A + movq xmm2, MMWORD PTR [rsi + rdx] ;B + movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C + movq xmm4, MMWORD PTR [rax + rdx * 2] ;D + movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E + + punpcklbw xmm2, xmm4 ;B D + punpcklbw xmm3, xmm0 ;C E + + movq xmm0, MMWORD PTR [rax + rdx * 4] ;F + + pmaddubsw xmm3, xmm6 + punpcklbw xmm1, xmm0 ;A F + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm5 + + paddsw xmm2, xmm3 + paddsw xmm2, xmm1 + paddsw xmm2, [rd GLOBAL] + psraw xmm2, 7 + packuswb xmm2, xmm2 + + movq MMWORD PTR [rdi], xmm2 ;store the results + + movq xmm1, MMWORD PTR [rsi + 8] ;A + movq xmm2, MMWORD PTR [rsi + rdx + 8] ;B + movq xmm3, MMWORD PTR [rsi + rdx * 2 + 8] ;C + movq xmm4, MMWORD PTR [rax + rdx * 2 + 8] ;D + movq xmm0, MMWORD PTR [rsi + rdx * 4 + 8] ;E + + punpcklbw xmm2, xmm4 ;B D + punpcklbw xmm3, xmm0 ;C E + + movq xmm0, MMWORD PTR [rax + rdx * 4 + 8] ;F + pmaddubsw xmm3, xmm6 + punpcklbw xmm1, xmm0 ;A F + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm5 + + add rsi, rdx + add rax, rdx +;-- +;-- + paddsw xmm2, xmm3 + paddsw xmm2, xmm1 + paddsw xmm2, [rd GLOBAL] + psraw xmm2, 7 + packuswb xmm2, xmm2 + + movq MMWORD PTR [rdi+8], xmm2 + +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(3) ;out_pitch +%else + add rdi, r8 +%endif + dec rcx + jnz vp8_filter_block1d16_v6_ssse3_loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +vp8_filter_block1d16_v4_ssse3: + movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4 + movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3 + + mov rsi, arg(0) ;src_ptr + movsxd rdx, DWORD PTR arg(1) ;pixels_per_line + mov rdi, arg(2) ;output_ptr + +%if ABI_IS_32BIT=0 + movsxd r8, DWORD PTR arg(3) ;out_pitch +%endif + mov rax, rsi + movsxd rcx, DWORD PTR arg(4) ;output_height + add rax, rdx + +vp8_filter_block1d16_v4_ssse3_loop: + movq xmm2, MMWORD PTR [rsi + rdx] ;B + movq 
xmm3, MMWORD PTR [rsi + rdx * 2] ;C + movq xmm4, MMWORD PTR [rax + rdx * 2] ;D + movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E + + punpcklbw xmm2, xmm4 ;B D + punpcklbw xmm3, xmm0 ;C E + + pmaddubsw xmm3, xmm6 + pmaddubsw xmm2, xmm7 + movq xmm5, MMWORD PTR [rsi + rdx + 8] ;B + movq xmm1, MMWORD PTR [rsi + rdx * 2 + 8] ;C + movq xmm4, MMWORD PTR [rax + rdx * 2 + 8] ;D + movq xmm0, MMWORD PTR [rsi + rdx * 4 + 8] ;E + + paddsw xmm2, [rd GLOBAL] + paddsw xmm2, xmm3 + psraw xmm2, 7 + packuswb xmm2, xmm2 + + punpcklbw xmm5, xmm4 ;B D + punpcklbw xmm1, xmm0 ;C E + + pmaddubsw xmm1, xmm6 + pmaddubsw xmm5, xmm7 + + movdqa xmm4, [rd GLOBAL] + add rsi, rdx + add rax, rdx +;-- +;-- + paddsw xmm5, xmm1 + paddsw xmm5, xmm4 + psraw xmm5, 7 + packuswb xmm5, xmm5 + + punpcklqdq xmm2, xmm5 + + movdqa XMMWORD PTR [rdi], xmm2 + +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(3) ;out_pitch +%else + add rdi, r8 +%endif + dec rcx + jnz vp8_filter_block1d16_v4_ssse3_loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_filter_block1d8_v6_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; unsigned int vp8_filter_index +;) +; +; Vertical pass for 8-pixel-wide blocks: one 8-byte store per output row, +; using the same A..F row pairing and table selection as the 16-wide +; version. If the first dword of the selected k0_k5 row is zero, the 4-tap +; path at vp8_filter_block1d8_v4_ssse3 runs instead. +; xmm6/xmm7 are overwritten, so the prolog saves them with SAVE_XMM and both +; epilogs restore them with RESTORE_XMM. +global sym(vp8_filter_block1d8_v6_ssse3) +sym(vp8_filter_block1d8_v6_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM + GET_GOT rbx + push rsi + push rdi + ; end prolog + + movsxd rdx, DWORD PTR arg(5) ;table index + xor rsi, rsi ;esi = 0 for the k0_k5 test below + shl rdx, 4 ; + + lea rax, [k0_k5 GLOBAL] + add rax, rdx + + movsxd rdx, DWORD PTR arg(1) ;pixels_per_line + mov rdi, arg(2) ;output_ptr +%if ABI_IS_32BIT=0 + movsxd r8, DWORD PTR arg(3) ; out_pitch +%endif + movsxd rcx, DWORD PTR arg(4) ;[output_height] + + cmp esi, DWORD PTR [rax] + je vp8_filter_block1d8_v4_ssse3 + + movdqa xmm5, XMMWORD PTR [rax] ;k0_k5 + movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4 + movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3 + + mov rsi, arg(0) ;src_ptr + + mov rax, rsi + add 
rax, rdx + +vp8_filter_block1d8_v6_ssse3_loop: + movq xmm1, MMWORD PTR [rsi] ;A + movq xmm2, MMWORD PTR [rsi + rdx] ;B + movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C + movq xmm4, MMWORD PTR [rax + rdx * 2] ;D + movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E + + punpcklbw xmm2, xmm4 ;B D + punpcklbw xmm3, xmm0 ;C E + + movq xmm0, MMWORD PTR [rax + rdx * 4] ;F + movdqa xmm4, [rd GLOBAL] + + pmaddubsw xmm3, xmm6 + punpcklbw xmm1, xmm0 ;A F + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm5 + add rsi, rdx + add rax, rdx +;-- +;-- + paddsw xmm2, xmm3 + paddsw xmm2, xmm1 + paddsw xmm2, xmm4 + psraw xmm2, 7 + packuswb xmm2, xmm2 + + movq MMWORD PTR [rdi], xmm2 + +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(3) ;[out_pitch] +%else + add rdi, r8 +%endif + dec rcx + jnz vp8_filter_block1d8_v6_ssse3_loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +vp8_filter_block1d8_v4_ssse3: + movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4 + movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3 + movdqa xmm5, [rd GLOBAL] + + mov rsi, arg(0) ;src_ptr + + mov rax, rsi + add rax, rdx + +vp8_filter_block1d8_v4_ssse3_loop: + movq xmm2, MMWORD PTR [rsi + rdx] ;B + movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C + movq xmm4, MMWORD PTR [rax + rdx * 2] ;D + movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E + + punpcklbw xmm2, xmm4 ;B D + punpcklbw xmm3, xmm0 ;C E + + pmaddubsw xmm3, xmm6 + pmaddubsw xmm2, xmm7 + add rsi, rdx + add rax, rdx +;-- +;-- + paddsw xmm2, xmm3 + paddsw xmm2, xmm5 + psraw xmm2, 7 + packuswb xmm2, xmm2 + + movq MMWORD PTR [rdi], xmm2 + +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(3) ;[out_pitch] +%else + add rdi, r8 +%endif + dec rcx + jnz vp8_filter_block1d8_v4_ssse3_loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
+;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +; vp8_filter_block1d8_h6_ssse3_slow +; Reads arg(0) src_ptr, arg(1) output_ptr, arg(2) src_pixels_per_line, +; arg(4) output_height, arg(5) output_width and arg(6) vp8_filter; arg(3) is +; not read. Each row produces eight filtered pixels, widens them to 16-bit +; words and writes the 16 bytes with movdqa (16-byte-aligned address +; required); output_width is then added to the output pointer. +; xmm6/xmm7 are overwritten, so the prolog saves them with SAVE_XMM and the +; epilog restores them with RESTORE_XMM. +global sym(vp8_filter_block1d8_h6_ssse3_slow) +sym(vp8_filter_block1d8_h6_ssse3_slow): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rdx, arg(6) ;vp8_filter + mov rsi, arg(0) ;src_ptr + + mov rdi, arg(1) ;output_ptr + + movsxd rcx, dword ptr arg(4) ;output_height + movsxd rax, dword ptr arg(2) ;src_pixels_per_line + + movq xmm7, [rdx] + pxor xmm4, xmm4 + movdqa xmm5, XMMWORD PTR [shuf1 GLOBAL] + movdqa xmm6, XMMWORD PTR [shuf2 GLOBAL] + + movsxd rdx, dword ptr arg(5) ;output_width + + punpcklqdq xmm7, xmm7 ;copy filter constants to upper 8 bytes + +filter_block1d8_h6_rowloop3_slow: + movdqu xmm0, XMMWORD PTR [rsi - 2] + + lea rsi, [rsi + rax] + + movdqa xmm1, xmm0 + pshufb xmm0, XMMWORD PTR [shuf1 GLOBAL] + + movdqa xmm2, xmm1 + pmaddubsw xmm0, xmm7 + pshufb xmm1, XMMWORD PTR [shuf2 GLOBAL] + + movdqa xmm3, xmm2 + pmaddubsw xmm1, xmm7 + pshufb xmm2, XMMWORD PTR [shuf3 GLOBAL] + + pshufb xmm3, XMMWORD PTR [shuf4 GLOBAL] + + pmaddubsw xmm2, xmm7 + pmaddubsw xmm3, xmm7 +;4 cycles + + phaddsw xmm0, xmm1 + phaddsw xmm2, xmm3 +;7 cycles + phaddsw xmm0, xmm2 +;7 cycles + + + paddsw xmm0, [rd GLOBAL] + psraw xmm0, 7 + packuswb xmm0, xmm0 + +; + punpcklbw xmm0, xmm4 + + movdqa XMMWORD Ptr [rdi], xmm0 + add rdi, rdx + dec rcx + jnz filter_block1d8_h6_rowloop3_slow ; next row + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret +;void vp8_filter_block1d16_h6_ssse3_slow +;( +; unsigned char *src_ptr, +; unsigned short *output_ptr, +; unsigned int src_pixels_per_line, +; unsigned int 
pixel_step, +; unsigned int output_height, +; unsigned int output_width, +; short *vp8_filter +;) +; +; 16-pixel-wide variant of the _slow horizontal filter: two 16-byte loads per +; row (src_ptr - 2 and src_ptr + 6), each producing eight pixels that are +; widened to 16-bit words and stored with movdqa at [rdi] and [rdi+16]. +; pixel_step (arg(3)) is not read. SAVE_XMM in the prolog is paired with +; RESTORE_XMM in the epilog. +global sym(vp8_filter_block1d16_h6_ssse3_slow) +sym(vp8_filter_block1d16_h6_ssse3_slow): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM + GET_GOT rbx + push rsi + push rdi + ; end prolog + mov rdx, arg(6) ;vp8_filter + mov rsi, arg(0) ;src_ptr + + mov rdi, arg(1) ;output_ptr + + movsxd rcx, dword ptr arg(4) ;output_height + movsxd rax, dword ptr arg(2) ;src_pixels_per_line + + movq xmm7, [rdx] + pxor xmm4, xmm4 + movdqa xmm5, XMMWORD PTR [shuf1 GLOBAL] + movdqa xmm6, XMMWORD PTR [shuf2 GLOBAL] + + movsxd rdx, dword ptr arg(5) ;output_width + + punpcklqdq xmm7, xmm7 ;copy filter constants to upper 8 bytes + sub rdi, rdx ;pre-bias: the loop adds output_width before the stores + +filter_block1d16_h6_rowloop3_slow: + movdqu xmm0, XMMWORD PTR [rsi - 2] + + movdqa xmm1, xmm0 + pshufb xmm0, xmm5 + + movdqa xmm2, xmm1 + pmaddubsw xmm0, xmm7 + pshufb xmm1, xmm6 + + movdqa xmm3, xmm2 + pmaddubsw xmm1, xmm7 + pshufb xmm2, XMMWORD PTR [shuf3 GLOBAL] + movdqu xmm4, XMMWORD PTR [rsi + 6] + pshufb xmm3, XMMWORD PTR [shuf4 GLOBAL] + lea rsi, [rsi + rax] + pmaddubsw xmm2, xmm7 + phaddsw xmm0, xmm1 + + pmaddubsw xmm3, xmm7 + movdqa xmm1, xmm4 + pshufb xmm4, xmm5 + movdqa xmm5, xmm1 + pmaddubsw xmm4, xmm7 + pshufb xmm1, xmm6 + phaddsw xmm2, xmm3 + pmaddubsw xmm1, xmm7 + movdqa xmm3, xmm5 + pshufb xmm5, XMMWORD PTR [shuf3 GLOBAL] + add rdi, rdx + pmaddubsw xmm5, xmm7 + pshufb xmm3, XMMWORD PTR [shuf4 GLOBAL] + phaddsw xmm4, xmm1 + dec rcx ;sets ZF for the jnz at the loop end; nothing in between writes flags + phaddsw xmm0, xmm2 + pmaddubsw xmm3, xmm7 + + + paddsw xmm0, [rd GLOBAL] + psraw xmm0, 7 + packuswb xmm0, xmm0 + phaddsw xmm5, xmm3 + pxor xmm3, xmm3 + punpcklbw xmm0, xmm3 +;-- +;-- +;-- +;-- + + phaddsw xmm4, xmm5 + movdqa xmm5, XMMWORD PTR [shuf1 GLOBAL] ;reload shuf1: xmm5 was reused as scratch above + movdqa XMMWORD Ptr [rdi], xmm0 +;-- +;-- +;-- +;-- +;-- + paddsw xmm4, [rd GLOBAL] + psraw xmm4, 7 + packuswb xmm4, xmm4 +; + punpcklbw xmm4, xmm3 + + movdqa XMMWORD Ptr [rdi+16], xmm4 + + jnz filter_block1d16_h6_rowloop3_slow ; next 
row + + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +align 16 +shuf1: + db 0, 1, 2, 4, 3, 5, 128, 128, 1, 2, 3, 5, 4, 6, 128, 128 +shuf2: + db 2, 3, 4, 6, 5, 7, 128, 128, 3, 4, 5, 7, 6, 8, 128, 128 +shuf3: + db 4, 5, 6, 8, 7, 9, 128, 128, 5, 6, 7, 9, 8, 10, 128, 128 +shuf4: + db 6, 7, 8, 10, 9, 11, 128, 128, 7, 8, 9, 11, 10, 12, 128, 128 + +shuf1a: + db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 +shuf2a: + db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11 +shuf3a: + db 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12 + +shuf1b: + db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12 +shuf2b: + db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11 +shuf3b: + db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10 + +align 16 +rd: + times 8 dw 0x40 + +; Filter tap tables: eight 16-byte rows each, one row per filter index. +; k1_k3 and k2_k4 follow k0_k5 at +128 and +256 bytes, which is how the +; routines above address them from the k0_k5 row pointer. +align 16 +k0_k5: + times 8 db 0, 0 ;placeholder + times 8 db 0, 0 + times 8 db 2, 1 + times 8 db 0, 0 + times 8 db 3, 3 + times 8 db 0, 0 + times 8 db 1, 2 + times 8 db 0, 0 +k1_k3: + times 8 db 0, 0 ;placeholder + times 8 db -6, 12 + times 8 db -11, 36 + times 8 db -9, 50 + times 8 db -16, 77 + times 8 db -6, 93 + times 8 db -8, 108 + times 8 db -1, 123 +k2_k4: + times 8 db 128, 0 ;placeholder + times 8 db 123, -1 + times 8 db 108, -8 + times 8 db 93, -6 + times 8 db 77, -16 + times 8 db 50, -9 + times 8 db 36, -11 + times 8 db 12, -6 + diff --git a/vp8/common/x86/subpixel_x86.h b/vp8/common/x86/subpixel_x86.h index c406be7..e5c08b1 100644 --- a/vp8/common/x86/subpixel_x86.h +++ b/vp8/common/x86/subpixel_x86.h @@ -86,4 +86,37 @@ extern prototype_subpixel_predict(vp8_bilinear_predict8x8_sse2); #endif #endif +#if HAVE_SSSE3 +extern prototype_subpixel_predict(vp8_sixtap_predict16x16_ssse3); +extern prototype_subpixel_predict(vp8_sixtap_predict8x8_ssse3); +extern prototype_subpixel_predict(vp8_sixtap_predict8x4_ssse3); +extern prototype_subpixel_predict(vp8_sixtap_predict4x4_ssse3); +//extern prototype_subpixel_predict(vp8_bilinear_predict16x16_sse2); +//extern 
prototype_subpixel_predict(vp8_bilinear_predict8x8_sse2); + +#if !CONFIG_RUNTIME_CPU_DETECT +#undef vp8_subpix_sixtap16x16 +#define vp8_subpix_sixtap16x16 vp8_sixtap_predict16x16_ssse3 + +#undef vp8_subpix_sixtap8x8 +#define vp8_subpix_sixtap8x8 vp8_sixtap_predict8x8_ssse3 + +#undef vp8_subpix_sixtap8x4 +#define vp8_subpix_sixtap8x4 vp8_sixtap_predict8x4_ssse3 + +//#undef vp8_subpix_sixtap4x4 +//#define vp8_subpix_sixtap4x4 vp8_sixtap_predict4x4_ssse3 + + +//#undef vp8_subpix_bilinear16x16 +//#define vp8_subpix_bilinear16x16 vp8_bilinear_predict16x16_sse2 + +//#undef vp8_subpix_bilinear8x8 +//#define vp8_subpix_bilinear8x8 vp8_bilinear_predict8x8_sse2 + +#endif +#endif + + + #endif diff --git a/vp8/common/x86/vp8_asm_stubs.c b/vp8/common/x86/vp8_asm_stubs.c index 79650cf..2c99cb6 100644 --- a/vp8/common/x86/vp8_asm_stubs.c +++ b/vp8/common/x86/vp8_asm_stubs.c @@ -359,3 +359,172 @@ void vp8_sixtap_predict8x4_sse2 } #endif + +#if HAVE_SSSE3 + +/* Assembly filter passes from subpixel_ssse3.asm. The last argument is a +   filter table index (the callers below pass xoffset or yoffset), not a +   pointer to filter taps. */ +extern void vp8_filter_block1d8_h6_ssse3 +( + unsigned char *src_ptr, + unsigned int src_pixels_per_line, + unsigned char *output_ptr, + unsigned int output_pitch, + unsigned int output_height, + unsigned int vp8_filter_index +); + +extern void vp8_filter_block1d16_h6_ssse3 +( + unsigned char *src_ptr, + unsigned int src_pixels_per_line, + unsigned char *output_ptr, + unsigned int output_pitch, + unsigned int output_height, + unsigned int vp8_filter_index +); + +extern void vp8_filter_block1d16_v6_ssse3 +( + unsigned char *src_ptr, + unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + unsigned int vp8_filter_index +); + +extern void vp8_filter_block1d8_v6_ssse3 +( + unsigned char *src_ptr, + unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + unsigned int vp8_filter_index +); + +/* 16x16 six-tap prediction. +   - xoffset and yoffset both non-zero: the horizontal pass writes 21 rows of +     16 bytes into FData2, starting two rows above src_ptr, and the vertical +     pass then produces the 16 output rows from FData2. +   - only xoffset non-zero: the horizontal pass writes straight to dst_ptr. +   - xoffset == 0: only the vertical pass runs, starting two rows above +     src_ptr, with yoffset as the filter index. +   NOTE(review): with both offsets zero the vertical pass is called with +   filter index 0; presumably callers never pass (0, 0) - verify. */ +void vp8_sixtap_predict16x16_ssse3 +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned 
char *dst_ptr, + int dst_pitch + +) +{ + DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 24*24); + + if (xoffset) + { + if (yoffset) + { + vp8_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, FData2, 16, 21, xoffset); + vp8_filter_block1d16_v6_ssse3(FData2 , 16, dst_ptr, dst_pitch, 16, yoffset); + } + else + { + // First-pass only + vp8_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 16, xoffset); + } + } + else + { + // Second-pass only + vp8_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line) , src_pixels_per_line, dst_ptr, dst_pitch, 16, yoffset); + } +} + +/* 8x8 six-tap prediction. Same three cases as the 16x16 version; the +   two-pass case stages 13 rows of 8 bytes in FData2. */ +void vp8_sixtap_predict8x8_ssse3 +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 256); + + if (xoffset) + { + if (yoffset) + { + vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, FData2, 8, 13, xoffset); + vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, 8, yoffset); + } + else + { + vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 8, xoffset); + } + } + else + { + // Second-pass only + vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 8, yoffset); + } +} + + +/* 8x4 six-tap prediction (8 pixels wide, 4 rows). Same three cases as the +   16x16 version; the two-pass case stages 9 rows of 8 bytes in FData2. */ +void vp8_sixtap_predict8x4_ssse3 +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 256); + + if (xoffset) + { + if (yoffset) + { + vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, FData2, 8, 9, xoffset); + vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, 4, yoffset); + } + else + { + // First-pass only + vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 4, xoffset); + } + } + else + { + // Second-pass 
only + vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 4, yoffset); + } +} + +/* NOTE(review): every branch below is empty, so this function writes nothing +   to dst_ptr and FData2 is unused. The sixtap4x4 hook-ups elsewhere in this +   patch are commented out. */ +void vp8_sixtap_predict4x4_ssse3 +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 16*16); + + if (xoffset) + { + if (yoffset) + { + + } + else + { + } + } + else + { + } + +} + +#endif diff --git a/vp8/common/x86/x86_systemdependent.c b/vp8/common/x86/x86_systemdependent.c index 66738f8..2d8ced0 100644 --- a/vp8/common/x86/x86_systemdependent.c +++ b/vp8/common/x86/x86_systemdependent.c @@ -27,6 +27,7 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx) int mmx_enabled = flags & HAS_MMX; int xmm_enabled = flags & HAS_SSE; int wmt_enabled = flags & HAS_SSE2; + int SSSE3Enabled = flags & HAS_SSSE3; /* Note: * @@ -114,5 +115,17 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx) } #endif + +#if HAVE_SSSE3 + +/* Install the SSSE3 six-tap predictors when flags reports SSSE3; the 4x4 +   entry is left commented out. */ + if (SSSE3Enabled) + { + rtcd->subpix.sixtap16x16 = vp8_sixtap_predict16x16_ssse3; + rtcd->subpix.sixtap8x8 = vp8_sixtap_predict8x8_ssse3; + rtcd->subpix.sixtap8x4 = vp8_sixtap_predict8x4_ssse3; +// rtcd->subpix.sixtap4x4 = vp8_sixtap_predict4x4_ssse3; + } +#endif + #endif } diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk index 5b8a301..a8a252a 100644 --- a/vp8/vp8_common.mk +++ b/vp8/vp8_common.mk @@ -109,6 +109,7 @@ VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_sse2.asm VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/subpixel_sse2.asm VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/loopfilter_sse2.asm VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/iwalsh_sse2.asm +VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/subpixel_ssse3.asm ifeq ($(CONFIG_POSTPROC),yes) VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/postproc_mmx.asm VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/postproc_sse2.asm -- 2.7.4