From d1961e6fbfd5642cf2250e17e65d6499d8819e8e Mon Sep 17 00:00:00 2001 From: Yunqing Wang Date: Mon, 3 Feb 2014 12:40:18 -0800 Subject: [PATCH] Optimize bilinear sub-pixel filters in ssse3 This patch added ssse3 optimization of bilinear sub-pixel filters. The real time encoder was speeded up by ~1%. Change-Id: Ie82e98976f411183cb8c61ab8d2ba0276e55a338 --- vp9/common/x86/vp9_asm_stubs.c | 57 ++-- vp9/common/x86/vp9_subpixel_bilinear_ssse3.asm | 422 +++++++++++++++++++++++++ vp9/vp9_common.mk | 1 + 3 files changed, 451 insertions(+), 29 deletions(-) create mode 100644 vp9/common/x86/vp9_subpixel_bilinear_ssse3.asm diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c index 8ab5fb1..feb0098 100644 --- a/vp9/common/x86/vp9_asm_stubs.c +++ b/vp9/common/x86/vp9_asm_stubs.c @@ -23,8 +23,8 @@ typedef void filter8_1dfunction ( const short *filter ); -#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt1, opt2) \ -void vp9_convolve8_##name##_##opt1(const uint8_t *src, ptrdiff_t src_stride, \ +#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ +void vp9_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \ uint8_t *dst, ptrdiff_t dst_stride, \ const int16_t *filter_x, int x_step_q4, \ const int16_t *filter_y, int y_step_q4, \ @@ -32,7 +32,7 @@ void vp9_convolve8_##name##_##opt1(const uint8_t *src, ptrdiff_t src_stride, \ if (step_q4 == 16 && filter[3] != 128) { \ if (filter[0] || filter[1] || filter[2]) { \ while (w >= 16) { \ - vp9_filter_block1d16_##dir##8_##avg##opt1(src_start, src_stride, \ + vp9_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, \ dst, dst_stride, \ h, filter); \ src += 16; \ @@ -40,7 +40,7 @@ void vp9_convolve8_##name##_##opt1(const uint8_t *src, ptrdiff_t src_stride, \ w -= 16; \ } \ while (w >= 8) { \ - vp9_filter_block1d8_##dir##8_##avg##opt1(src_start, src_stride, \ + vp9_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, \ dst, dst_stride, \ h, filter); \ src += 8; \ @@ -48,7 +48,7 @@ void vp9_convolve8_##name##_##opt1(const uint8_t *src, ptrdiff_t src_stride, \ w -= 8; \ } \ while (w >= 4) { \ - vp9_filter_block1d4_##dir##8_##avg##opt1(src_start, src_stride, \ + vp9_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, \ dst, dst_stride, \ h, filter); \ src += 4; \ @@ -57,7 +57,7 @@ void vp9_convolve8_##name##_##opt1(const uint8_t *src, ptrdiff_t src_stride, \ } \ } else { \ while (w >= 16) { \ - vp9_filter_block1d16_##dir##2_##avg##opt2(src, src_stride, \ + vp9_filter_block1d16_##dir##2_##avg##opt(src, src_stride, \ dst, dst_stride, \ h, filter); \ src += 16; \ @@ -65,7 +65,7 @@ void vp9_convolve8_##name##_##opt1(const uint8_t *src, ptrdiff_t src_stride, \ w -= 16; \ } \ while (w >= 8) { \ - vp9_filter_block1d8_##dir##2_##avg##opt2(src, src_stride, \ + vp9_filter_block1d8_##dir##2_##avg##opt(src, src_stride, \ dst, dst_stride, \ h, filter); \ src += 8; \ @@ -73,7 +73,7 @@ void vp9_convolve8_##name##_##opt1(const uint8_t *src, ptrdiff_t src_stride, \ w -= 8; \ } \ while (w >= 4) { \ - vp9_filter_block1d4_##dir##2_##avg##opt2(src, src_stride, \ + vp9_filter_block1d4_##dir##2_##avg##opt(src, src_stride, \ dst, dst_stride, \ h, filter); \ src += 4; \ @@ -136,18 +136,18 @@ filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3; filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3; filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3; -filter8_1dfunction vp9_filter_block1d16_v2_sse2; -filter8_1dfunction vp9_filter_block1d16_h2_sse2; -filter8_1dfunction vp9_filter_block1d8_v2_sse2; -filter8_1dfunction vp9_filter_block1d8_h2_sse2; -filter8_1dfunction vp9_filter_block1d4_v2_sse2; -filter8_1dfunction vp9_filter_block1d4_h2_sse2; -filter8_1dfunction vp9_filter_block1d16_v2_avg_sse2; -filter8_1dfunction vp9_filter_block1d16_h2_avg_sse2; -filter8_1dfunction vp9_filter_block1d8_v2_avg_sse2; -filter8_1dfunction vp9_filter_block1d8_h2_avg_sse2; -filter8_1dfunction vp9_filter_block1d4_v2_avg_sse2; -filter8_1dfunction vp9_filter_block1d4_h2_avg_sse2; +filter8_1dfunction vp9_filter_block1d16_v2_ssse3; +filter8_1dfunction vp9_filter_block1d16_h2_ssse3; +filter8_1dfunction vp9_filter_block1d8_v2_ssse3; +filter8_1dfunction vp9_filter_block1d8_h2_ssse3; +filter8_1dfunction vp9_filter_block1d4_v2_ssse3; +filter8_1dfunction vp9_filter_block1d4_h2_ssse3; +filter8_1dfunction vp9_filter_block1d16_v2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d16_h2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d8_v2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d8_h2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d4_v2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d4_h2_avg_ssse3; // void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, @@ -169,11 +169,11 @@ filter8_1dfunction vp9_filter_block1d4_h2_avg_sse2; // const int16_t *filter_x, int x_step_q4, // const int16_t *filter_y, int y_step_q4, // int w, int h); -FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3, sse2); -FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3, sse2); -FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3, sse2); +FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3); +FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3); +FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3); FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, - ssse3, sse2); + ssse3); // void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, @@ -236,11 +236,10 @@ filter8_1dfunction vp9_filter_block1d4_h2_avg_sse2; // const int16_t *filter_x, int x_step_q4, // const int16_t *filter_y, int y_step_q4, // int w, int h); -FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2, sse2); -FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2, sse2); -FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2, sse2); -FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2, - sse2); +FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2); +FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2); +FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2); +FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2); // void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, diff --git a/vp9/common/x86/vp9_subpixel_bilinear_ssse3.asm b/vp9/common/x86/vp9_subpixel_bilinear_ssse3.asm new file mode 100644 index 0000000..b5e18fe --- /dev/null +++ b/vp9/common/x86/vp9_subpixel_bilinear_ssse3.asm @@ -0,0 +1,422 @@ +; +; Copyright (c) 2014 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "vpx_ports/x86_abi_support.asm" + +%macro GET_PARAM_4 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm3, [rdx] ;load filters + psrldq xmm3, 6 + packsswb xmm3, xmm3 + pshuflw xmm3, xmm3, 0b ;k3_k4 + + movq xmm2, rcx ;rounding + pshufd xmm2, xmm2, 0 + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro APPLY_FILTER_4 1 + punpcklbw xmm0, xmm1 + pmaddubsw xmm0, xmm3 + + paddsw xmm0, xmm2 ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack to byte + +%if %1 + movd xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movd [rdi], xmm0 + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +%macro GET_PARAM 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm7, [rdx] ;load filters + psrldq xmm7, 6 + packsswb xmm7, xmm7 + pshuflw xmm7, xmm7, 0b ;k3_k4 + punpcklwd xmm7, xmm7 + + movq xmm6, rcx ;rounding + pshufd xmm6, xmm6, 0 + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro APPLY_FILTER_8 1 + punpcklbw xmm0, xmm1 + pmaddubsw xmm0, xmm7 + + paddsw xmm0, xmm6 ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack back to byte + +%if %1 + movq xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movq [rdi], xmm0 ;store the result + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +%macro APPLY_FILTER_16 1 + punpcklbw xmm0, xmm1 + punpckhbw xmm2, xmm1 + pmaddubsw xmm0, xmm7 + pmaddubsw xmm2, xmm7 + + paddsw xmm0, xmm6 ;rounding + paddsw xmm2, xmm6 + psraw xmm0, 7 ;shift + psraw xmm2, 7 + packuswb xmm0, xmm2 ;pack back to byte + +%if %1 + movdqu xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movdqu [rdi], xmm0 ;store the result + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +global sym(vp9_filter_block1d4_v2_ssse3) PRIVATE +sym(vp9_filter_block1d4_v2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movd xmm0, [rsi] ;load src + movd xmm1, [rsi + rax] + + APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d8_v2_ssse3) PRIVATE +sym(vp9_filter_block1d8_v2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movq xmm0, [rsi] ;0 + movq xmm1, [rsi + rax] ;1 + + APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d16_v2_ssse3) PRIVATE +sym(vp9_filter_block1d16_v2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + rax] ;1 + movdqa xmm2, xmm0 + + APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d4_v2_avg_ssse3) PRIVATE +sym(vp9_filter_block1d4_v2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movd xmm0, [rsi] ;load src + movd xmm1, [rsi + rax] + + APPLY_FILTER_4 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d8_v2_avg_ssse3) PRIVATE +sym(vp9_filter_block1d8_v2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movq xmm0, [rsi] ;0 + movq xmm1, [rsi + rax] ;1 + + APPLY_FILTER_8 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d16_v2_avg_ssse3) PRIVATE +sym(vp9_filter_block1d16_v2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + rax] ;1 + movdqa xmm2, xmm0 + + APPLY_FILTER_16 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d4_h2_ssse3) PRIVATE +sym(vp9_filter_block1d4_h2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d8_h2_ssse3) PRIVATE +sym(vp9_filter_block1d8_h2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d16_h2_ssse3) PRIVATE +sym(vp9_filter_block1d16_h2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 1] + movdqa xmm2, xmm0 + + APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d4_h2_avg_ssse3) PRIVATE +sym(vp9_filter_block1d4_h2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_4 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d8_h2_avg_ssse3) PRIVATE +sym(vp9_filter_block1d8_h2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_8 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d16_h2_avg_ssse3) PRIVATE +sym(vp9_filter_block1d16_h2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 1] + movdqa xmm2, xmm0 + + APPLY_FILTER_16 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk index c691411..f9ba41b 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@ -76,6 +76,7 @@ VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_8t_sse2.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_bilinear_sse2.asm VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm +VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_bilinear_ssse3.asm ifeq ($(CONFIG_VP9_POSTPROC),yes) VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_postproc_mmx.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm -- 2.7.4