unsigned int output_height,
const short *filter);
+// Prototypes for the SSSE3 assembly kernels that apply an 8-tap 1-D filter
+// and average (pavgb) the filtered bytes with the bytes already stored at
+// output_ptr.  "v8" kernels filter vertically, "h8" kernels horizontally;
+// the number after "1d" is how many output bytes are written per row.
+// All six take the same argument list:
+//   src_ptr        first source byte handed to the kernel
+//   src_pitch      distance in bytes between source rows
+//   output_ptr     destination, read (for the average) and written
+//   out_pitch      distance in bytes between destination rows
+//   output_height  number of rows to produce
+//   filter         the eight 16-bit filter taps
+void vp9_filter_block1d16_v8_avg_ssse3(const unsigned char *src_ptr,
+ const unsigned int src_pitch,
+ unsigned char *output_ptr,
+ unsigned int out_pitch,
+ unsigned int output_height,
+ const short *filter);
+
+void vp9_filter_block1d16_h8_avg_ssse3(const unsigned char *src_ptr,
+ const unsigned int src_pitch,
+ unsigned char *output_ptr,
+ unsigned int out_pitch,
+ unsigned int output_height,
+ const short *filter);
+
+void vp9_filter_block1d8_v8_avg_ssse3(const unsigned char *src_ptr,
+ const unsigned int src_pitch,
+ unsigned char *output_ptr,
+ unsigned int out_pitch,
+ unsigned int output_height,
+ const short *filter);
+
+void vp9_filter_block1d8_h8_avg_ssse3(const unsigned char *src_ptr,
+ const unsigned int src_pitch,
+ unsigned char *output_ptr,
+ unsigned int out_pitch,
+ unsigned int output_height,
+ const short *filter);
+
+void vp9_filter_block1d4_v8_avg_ssse3(const unsigned char *src_ptr,
+ const unsigned int src_pitch,
+ unsigned char *output_ptr,
+ unsigned int out_pitch,
+ unsigned int output_height,
+ const short *filter);
+
+void vp9_filter_block1d4_h8_avg_ssse3(const unsigned char *src_ptr,
+ const unsigned int src_pitch,
+ unsigned char *output_ptr,
+ unsigned int out_pitch,
+ unsigned int output_height,
+ const short *filter);
+
void vp9_convolve8_horiz_ssse3(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
const int16_t *filter_x, int x_step_q4,
}
}
+// Horizontal 8-tap convolution using the SSSE3 "avg" kernels, which average
+// their filtered output with the bytes already stored in dst.
+//
+// The fast path is taken only when x_step_q4 == 16 and the centre tap
+// filter_x[3] is not 128: the assembly kernels narrow the taps to signed
+// bytes (packsswb), and 128 does not fit.  The block is then consumed in
+// column strips of 16, 8 and 4 pixels, each strip covering all h rows.
+// Whatever width is left afterwards -- the entire block when the fast path
+// is not taken -- is passed to vp9_convolve8_avg_horiz_c.
+//
+// NOTE: the 16-wide kernel reads and writes dst with movdqa, so dst rows
+// have to be 16-byte aligned whenever a 16-pixel strip is processed.
+void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, int src_stride,
+                                   uint8_t *dst, int dst_stride,
+                                   const int16_t *filter_x, int x_step_q4,
+                                   const int16_t *filter_y, int y_step_q4,
+                                   int w, int h) {
+  const int simd_ok = (x_step_q4 == 16) && (filter_x[3] != 128);
+
+  if (simd_ok) {
+    for (; w >= 16; w -= 16, src += 16, dst += 16) {
+      vp9_filter_block1d16_h8_avg_ssse3(src, src_stride, dst, dst_stride,
+                                        h, filter_x);
+    }
+    for (; w >= 8; w -= 8, src += 8, dst += 8) {
+      vp9_filter_block1d8_h8_avg_ssse3(src, src_stride, dst, dst_stride,
+                                       h, filter_x);
+    }
+    for (; w >= 4; w -= 4, src += 4, dst += 4) {
+      vp9_filter_block1d4_h8_avg_ssse3(src, src_stride, dst, dst_stride,
+                                       h, filter_x);
+    }
+  }
+
+  // Remaining columns (or everything, if the kernels could not be used).
+  if (w != 0) {
+    vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
+                              filter_x, x_step_q4, filter_y, y_step_q4,
+                              w, h);
+  }
+}
+
+// Vertical 8-tap convolution using the SSSE3 "avg" kernels, which average
+// their filtered output with the bytes already stored in dst.
+//
+// The fast path is taken only when y_step_q4 == 16 and the centre tap
+// filter_y[3] is not 128 (the kernels narrow the taps to signed bytes, and
+// 128 does not fit).  Each kernel call is given a source pointer three rows
+// above the current position and processes one column strip of 16, 8 or 4
+// pixels for all h rows.  Whatever width is left afterwards -- the entire
+// block when the fast path is not taken -- is passed to
+// vp9_convolve8_avg_vert_c with the un-offset src pointer.
+void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, int src_stride,
+                                  uint8_t *dst, int dst_stride,
+                                  const int16_t *filter_x, int x_step_q4,
+                                  const int16_t *filter_y, int y_step_q4,
+                                  int w, int h) {
+  const int simd_ok = (y_step_q4 == 16) && (filter_y[3] != 128);
+
+  if (simd_ok) {
+    for (; w >= 16; w -= 16, src += 16, dst += 16) {
+      vp9_filter_block1d16_v8_avg_ssse3(src - src_stride * 3, src_stride,
+                                        dst, dst_stride, h, filter_y);
+    }
+    for (; w >= 8; w -= 8, src += 8, dst += 8) {
+      vp9_filter_block1d8_v8_avg_ssse3(src - src_stride * 3, src_stride,
+                                       dst, dst_stride, h, filter_y);
+    }
+    for (; w >= 4; w -= 4, src += 4, dst += 4) {
+      vp9_filter_block1d4_v8_avg_ssse3(src - src_stride * 3, src_stride,
+                                       dst, dst_stride, h, filter_y);
+    }
+  }
+
+  // Remaining columns (or everything, if the kernels could not be used).
+  if (w != 0) {
+    vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
+                             filter_x, x_step_q4, filter_y, y_step_q4,
+                             w, h);
+  }
+}
+
void vp9_convolve8_ssse3(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
const int16_t *filter_x, int x_step_q4,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
}
+
+// 2-D 8-tap convolution using the SSSE3 kernels; the final result is
+// averaged with the bytes already stored in dst.
+//
+// Fast path (both steps == 16, neither centre tap == 128, w is exactly
+// 16, 8 or 4, and h <= 16):
+//   1. the non-averaging horizontal kernel filters h + 7 rows, starting
+//      three rows above src, into the scratch buffer fdata2 (row pitch 16);
+//   2. the averaging vertical kernel filters fdata2 down to h rows and
+//      averages them into dst.
+// Every other case is passed to vp9_convolve8_avg_c.
+//
+// fdata2 holds 16 * 23 bytes, i.e. 16 + 7 rows of 16 bytes, so the fast
+// path must never see h > 16.
+void vp9_convolve8_avg_ssse3(const uint8_t *src, int src_stride,
+                             uint8_t *dst, int dst_stride,
+                             const int16_t *filter_x, int x_step_q4,
+                             const int16_t *filter_y, int y_step_q4,
+                             int w, int h) {
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 16*23);
+
+  // check w/h due to fixed size fdata2 array
+  assert(w <= 16);
+  assert(h <= 16);
+
+  // h <= 16 is tested again here because assert() compiles away under
+  // NDEBUG; without it a taller block would make the first pass write
+  // past the end of fdata2.  Oversized blocks use the C fallback instead.
+  if (h <= 16 &&
+      x_step_q4 == 16 && y_step_q4 == 16 &&
+      filter_x[3] != 128 && filter_y[3] != 128) {
+    if (w == 16) {
+      vp9_filter_block1d16_h8_ssse3(src - 3 * src_stride, src_stride,
+                                    fdata2, 16,
+                                    h + 7, filter_x);
+      vp9_filter_block1d16_v8_avg_ssse3(fdata2, 16,
+                                        dst, dst_stride,
+                                        h, filter_y);
+      return;
+    }
+    if (w == 8) {
+      vp9_filter_block1d8_h8_ssse3(src - 3 * src_stride, src_stride,
+                                   fdata2, 16,
+                                   h + 7, filter_x);
+      vp9_filter_block1d8_v8_avg_ssse3(fdata2, 16,
+                                       dst, dst_stride,
+                                       h, filter_y);
+      return;
+    }
+    if (w == 4) {
+      vp9_filter_block1d4_h8_ssse3(src - 3 * src_stride, src_stride,
+                                   fdata2, 16,
+                                   h + 7, filter_x);
+      vp9_filter_block1d4_v8_avg_ssse3(fdata2, 16,
+                                       dst, dst_stride,
+                                       h, filter_y);
+      return;
+    }
+  }
+  vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
+                      filter_x, x_step_q4, filter_y, y_step_q4,
+                      w, h);
+}
#endif
;
;*************************************************************************************/
-;void vp9_filter_block1d8_v8_ssse3
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pitch,
-; unsigned char *output_ptr,
-; unsigned int out_pitch,
-; unsigned int output_height,
-; short *filter
-;)
-global sym(vp9_filter_block1d4_v8_ssse3) PRIVATE
-sym(vp9_filter_block1d4_v8_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- push rbx
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16*5
- %define k0k1 [rsp + 16*0]
- %define k2k3 [rsp + 16*1]
- %define k4k5 [rsp + 16*2]
- %define k6k7 [rsp + 16*3]
- %define krd [rsp + 16*4]
+%macro VERTx4 1
mov rdx, arg(5) ;filter ptr
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;output_ptr
lea rbx, [rdx + rdx*4]
add rbx, rdx ;pitch * 6
-.vp9_filter_block1d4_v8_ssse3_loop:
+.loop:
movd xmm0, [rsi] ;A
movd xmm1, [rsi + rdx] ;B
movd xmm2, [rsi + rdx * 2] ;C
add rsi, rdx
add rax, rdx
-
+%if %1
+ movd xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
movd [rdi], xmm0
%if ABI_IS_32BIT
add rdi, r8
%endif
dec rcx
- jnz .vp9_filter_block1d4_v8_ssse3_loop
-
- add rsp, 16*5
- pop rsp
- pop rbx
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp9_filter_block1d8_v8_ssse3
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pitch,
-; unsigned char *output_ptr,
-; unsigned int out_pitch,
-; unsigned int output_height,
-; short *filter
-;)
-global sym(vp9_filter_block1d8_v8_ssse3) PRIVATE
-sym(vp9_filter_block1d8_v8_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- push rbx
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16*5
- %define k0k1 [rsp + 16*0]
- %define k2k3 [rsp + 16*1]
- %define k4k5 [rsp + 16*2]
- %define k6k7 [rsp + 16*3]
- %define krd [rsp + 16*4]
+ jnz .loop
+%endm
+%macro VERTx8 1
mov rdx, arg(5) ;filter ptr
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;output_ptr
lea rbx, [rdx + rdx*4]
add rbx, rdx ;pitch * 6
-.vp9_filter_block1d8_v8_ssse3_loop:
+.loop:
movq xmm0, [rsi] ;A
movq xmm1, [rsi + rdx] ;B
movq xmm2, [rsi + rdx * 2] ;C
add rsi, rdx
add rax, rdx
-
+%if %1
+ movq xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
movq [rdi], xmm0
%if ABI_IS_32BIT
add rdi, r8
%endif
dec rcx
- jnz .vp9_filter_block1d8_v8_ssse3_loop
-
- add rsp, 16*5
- pop rsp
- pop rbx
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp9_filter_block1d16_v8_ssse3
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pitch,
-; unsigned char *output_ptr,
-; unsigned int out_pitch,
-; unsigned int output_height,
-; short *filter
-;)
-global sym(vp9_filter_block1d16_v8_ssse3) PRIVATE
-sym(vp9_filter_block1d16_v8_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- push rbx
- ; end prolog
+ jnz .loop
+%endm
- ALIGN_STACK 16, rax
- sub rsp, 16*5
- %define k0k1 [rsp + 16*0]
- %define k2k3 [rsp + 16*1]
- %define k4k5 [rsp + 16*2]
- %define k6k7 [rsp + 16*3]
- %define krd [rsp + 16*4]
+%macro VERTx16 1
mov rdx, arg(5) ;filter ptr
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;output_ptr
lea rbx, [rdx + rdx*4]
add rbx, rdx ;pitch * 6
-.vp9_filter_block1d16_v8_ssse3_loop:
+.loop:
movq xmm0, [rsi] ;A
movq xmm1, [rsi + rdx] ;B
movq xmm2, [rsi + rdx * 2] ;C
psraw xmm0, 7
packuswb xmm0, xmm0
-
+%if %1
+ movq xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
movq [rdi], xmm0
movq xmm0, [rsi + 8] ;A
add rsi, rdx
add rax, rdx
+%if %1
+ movq xmm1, [rdi+8]
+ pavgb xmm0, xmm1
+%endif
movq [rdi+8], xmm0
add rdi, r8
%endif
dec rcx
- jnz .vp9_filter_block1d16_v8_ssse3_loop
+ jnz .loop
+%endm
+
+;void vp9_filter_block1d4_v8_ssse3
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(vp9_filter_block1d4_v8_ssse3) PRIVATE
+sym(vp9_filter_block1d4_v8_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16*5
+ %define k0k1 [rsp + 16*0]
+ %define k2k3 [rsp + 16*1]
+ %define k4k5 [rsp + 16*2]
+ %define k6k7 [rsp + 16*3]
+ %define krd [rsp + 16*4]
+
+ VERTx4 0
add rsp, 16*5
pop rsp
pop rbp
ret
-;void vp9_filter_block1d4_h8_ssse3
+;void vp9_filter_block1d8_v8_ssse3
;(
-; unsigned char *src_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned char *output_ptr,
-; unsigned int output_pitch,
-; unsigned int output_height,
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
; short *filter
;)
-global sym(vp9_filter_block1d4_h8_ssse3) PRIVATE
-sym(vp9_filter_block1d4_h8_ssse3):
+global sym(vp9_filter_block1d8_v8_ssse3) PRIVATE
+sym(vp9_filter_block1d8_v8_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
SAVE_XMM 7
- GET_GOT rbx
push rsi
push rdi
+ push rbx
; end prolog
ALIGN_STACK 16, rax
%define k6k7 [rsp + 16*3]
%define krd [rsp + 16*4]
- mov rdx, arg(5) ;filter ptr
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
- mov rcx, 0x0400040
-
- movdqa xmm4, [rdx] ;load filters
- movd xmm5, rcx
- packsswb xmm4, xmm4
- pshuflw xmm0, xmm4, 0b ;k0_k1
- pshuflw xmm1, xmm4, 01010101b ;k2_k3
- pshuflw xmm2, xmm4, 10101010b ;k4_k5
- pshuflw xmm3, xmm4, 11111111b ;k6_k7
-
- punpcklqdq xmm0, xmm0
- punpcklqdq xmm1, xmm1
- punpcklqdq xmm2, xmm2
- punpcklqdq xmm3, xmm3
-
- movdqa k0k1, xmm0
- movdqa k2k3, xmm1
- pshufd xmm5, xmm5, 0
- movdqa k4k5, xmm2
- movdqa k6k7, xmm3
- movdqa krd, xmm5
-
- movsxd rax, dword ptr arg(1) ;src_pixels_per_line
- movsxd rdx, dword ptr arg(3) ;output_pitch
- movsxd rcx, dword ptr arg(4) ;output_height
-
-.filter_block1d4_h8_rowloop_ssse3:
- movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4
-
- movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12
- punpcklqdq xmm0, xmm3
-
- movdqa xmm1, xmm0
- pshufb xmm0, [GLOBAL(shuf_t0t1)]
- pmaddubsw xmm0, k0k1
-
- movdqa xmm2, xmm1
- pshufb xmm1, [GLOBAL(shuf_t2t3)]
- pmaddubsw xmm1, k2k3
-
- movdqa xmm4, xmm2
- pshufb xmm2, [GLOBAL(shuf_t4t5)]
- pmaddubsw xmm2, k4k5
-
- pshufb xmm4, [GLOBAL(shuf_t6t7)]
- pmaddubsw xmm4, k6k7
-
- paddsw xmm0, xmm1
- paddsw xmm0, xmm4
- paddsw xmm0, xmm2
- paddsw xmm0, krd
- psraw xmm0, 7
- packuswb xmm0, xmm0
-
- lea rsi, [rsi + rax]
- movd [rdi], xmm0
-
- lea rdi, [rdi + rdx]
- dec rcx
- jnz .filter_block1d4_h8_rowloop_ssse3
+ VERTx8 0
add rsp, 16*5
pop rsp
-
+ pop rbx
; begin epilog
pop rdi
pop rsi
- RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
-;void vp9_filter_block1d8_h8_ssse3
+;void vp9_filter_block1d16_v8_ssse3
;(
-; unsigned char *src_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned char *output_ptr,
-; unsigned int output_pitch,
-; unsigned int output_height,
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
; short *filter
;)
-global sym(vp9_filter_block1d8_h8_ssse3) PRIVATE
-sym(vp9_filter_block1d8_h8_ssse3):
+global sym(vp9_filter_block1d16_v8_ssse3) PRIVATE
+sym(vp9_filter_block1d16_v8_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
SAVE_XMM 7
- GET_GOT rbx
push rsi
push rdi
+ push rbx
; end prolog
ALIGN_STACK 16, rax
%define k6k7 [rsp + 16*3]
%define krd [rsp + 16*4]
- mov rdx, arg(5) ;filter ptr
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
- mov rcx, 0x0400040
-
- movdqa xmm4, [rdx] ;load filters
- movd xmm5, rcx
- packsswb xmm4, xmm4
- pshuflw xmm0, xmm4, 0b ;k0_k1
- pshuflw xmm1, xmm4, 01010101b ;k2_k3
- pshuflw xmm2, xmm4, 10101010b ;k4_k5
- pshuflw xmm3, xmm4, 11111111b ;k6_k7
+ VERTx16 0
- punpcklqdq xmm0, xmm0
- punpcklqdq xmm1, xmm1
- punpcklqdq xmm2, xmm2
- punpcklqdq xmm3, xmm3
+ add rsp, 16*5
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+
+;void vp9_filter_block1d4_v8_avg_ssse3
+;(
+;    unsigned char *src_ptr,
+;    unsigned int   src_pitch,
+;    unsigned char *output_ptr,
+;    unsigned int   out_pitch,
+;    unsigned int   output_height,
+;    short *filter
+;)
+; Averaging variant of the 4-wide vertical filter: the shared VERTx4 body is
+; expanded with its parameter set to 1, which makes it load the 4 bytes
+; already at the output, pavgb them with the filtered result, and store.
+; NOTE(review): SHADOW_ARGS_TO_STACK / SAVE_XMM / ALIGN_STACK are defined
+; outside this file; their exact effect should be confirmed there.
+global sym(vp9_filter_block1d4_v8_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d4_v8_avg_ssse3):
+ push rbp
+ mov rbp, rsp
+ ; six arguments, read through arg(0)..arg(5) inside the macro body
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; rbx is used as a scratch register by VERTx4
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ ; five 16-byte scratch slots, named by the %defines below
+ sub rsp, 16*5
+ %define k0k1 [rsp + 16*0]
+ %define k2k3 [rsp + 16*1]
+ %define k4k5 [rsp + 16*2]
+ %define k6k7 [rsp + 16*3]
+ %define krd [rsp + 16*4]
+
+ ; 1 = average the filtered output with the existing destination bytes
+ VERTx4 1
+
+ ; release the scratch slots
+ add rsp, 16*5
+ ; presumably restores the stack pointer saved by ALIGN_STACK -- confirm
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp9_filter_block1d8_v8_avg_ssse3
+;(
+;    unsigned char *src_ptr,
+;    unsigned int   src_pitch,
+;    unsigned char *output_ptr,
+;    unsigned int   out_pitch,
+;    unsigned int   output_height,
+;    short *filter
+;)
+; Averaging variant of the 8-wide vertical filter: the shared VERTx8 body is
+; expanded with its parameter set to 1, which makes it load the 8 bytes
+; already at the output, pavgb them with the filtered result, and store.
+; NOTE(review): SHADOW_ARGS_TO_STACK / SAVE_XMM / ALIGN_STACK are defined
+; outside this file; their exact effect should be confirmed there.
+global sym(vp9_filter_block1d8_v8_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d8_v8_avg_ssse3):
+ push rbp
+ mov rbp, rsp
+ ; six arguments, read through arg(0)..arg(5) inside the macro body
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; rbx is used as a scratch register by VERTx8
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ ; five 16-byte scratch slots, named by the %defines below
+ sub rsp, 16*5
+ %define k0k1 [rsp + 16*0]
+ %define k2k3 [rsp + 16*1]
+ %define k4k5 [rsp + 16*2]
+ %define k6k7 [rsp + 16*3]
+ %define krd [rsp + 16*4]
+
+ ; 1 = average the filtered output with the existing destination bytes
+ VERTx8 1
+
+ ; release the scratch slots
+ add rsp, 16*5
+ ; presumably restores the stack pointer saved by ALIGN_STACK -- confirm
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp9_filter_block1d16_v8_avg_ssse3
+;(
+;    unsigned char *src_ptr,
+;    unsigned int   src_pitch,
+;    unsigned char *output_ptr,
+;    unsigned int   out_pitch,
+;    unsigned int   output_height,
+;    short *filter
+;)
+; Averaging variant of the 16-wide vertical filter: the shared VERTx16 body
+; is expanded with its parameter set to 1.  The macro handles each row as
+; two 8-byte halves; for each half it loads the bytes already at the output,
+; pavgb's them with the filtered result, and stores.
+; NOTE(review): SHADOW_ARGS_TO_STACK / SAVE_XMM / ALIGN_STACK are defined
+; outside this file; their exact effect should be confirmed there.
+global sym(vp9_filter_block1d16_v8_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d16_v8_avg_ssse3):
+ push rbp
+ mov rbp, rsp
+ ; six arguments, read through arg(0)..arg(5) inside the macro body
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; rbx is used as a scratch register by VERTx16
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ ; five 16-byte scratch slots, named by the %defines below
+ sub rsp, 16*5
+ %define k0k1 [rsp + 16*0]
+ %define k2k3 [rsp + 16*1]
+ %define k4k5 [rsp + 16*2]
+ %define k6k7 [rsp + 16*3]
+ %define krd [rsp + 16*4]
+
+ ; 1 = average the filtered output with the existing destination bytes
+ VERTx16 1
+
+ ; release the scratch slots
+ add rsp, 16*5
+ ; presumably restores the stack pointer saved by ALIGN_STACK -- confirm
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+%macro HORIZx4 1
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x0400040
+
+ movdqa xmm4, [rdx] ;load filters
+ movd xmm5, rcx
+ packsswb xmm4, xmm4
+ pshuflw xmm0, xmm4, 0b ;k0_k1
+ pshuflw xmm1, xmm4, 01010101b ;k2_k3
+ pshuflw xmm2, xmm4, 10101010b ;k4_k5
+ pshuflw xmm3, xmm4, 11111111b ;k6_k7
+
+ punpcklqdq xmm0, xmm0
+ punpcklqdq xmm1, xmm1
+ punpcklqdq xmm2, xmm2
+ punpcklqdq xmm3, xmm3
movdqa k0k1, xmm0
movdqa k2k3, xmm1
movsxd rdx, dword ptr arg(3) ;output_pitch
movsxd rcx, dword ptr arg(4) ;output_height
-.filter_block1d8_h8_rowloop_ssse3:
+.loop:
movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4
movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12
paddsw xmm0, krd
psraw xmm0, 7
packuswb xmm0, xmm0
-
+%if %1
+ movd xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
lea rsi, [rsi + rax]
- movq [rdi], xmm0
+ movd [rdi], xmm0
lea rdi, [rdi + rdx]
dec rcx
- jnz .filter_block1d8_h8_rowloop_ssse3
+ jnz .loop
+%endm
- add rsp, 16*5
- pop rsp
+; HORIZx8 avg
+; Body of the 8-pixel-wide horizontal 8-tap filter, shared by the plain and
+; the averaging entry points.  The expanding function must already have
+; reserved the k0k1/k2k3/k4k5/k6k7/krd stack slots.
+;   %1 = 0 : store the filtered row to the output
+;   %1 = 1 : pavgb the filtered row with the 8 bytes already at the output,
+;            then store
+; .loop is a local label, so each expansion binds it to the function label
+; that precedes it.
+%macro HORIZx8 1
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ ; 0x0040 in both 16-bit halves: the rounding term (64) added before >> 7
+ mov rcx, 0x0400040
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
+ movdqa xmm4, [rdx] ;load filters
+ movd xmm5, rcx
+ ; narrow the eight 16-bit taps to signed bytes (saturating)
+ packsswb xmm4, xmm4
+ ; replicate one tap pair (one 16-bit word) across the low four words
+ pshuflw xmm0, xmm4, 0b ;k0_k1
+ pshuflw xmm1, xmm4, 01010101b ;k2_k3
+ pshuflw xmm2, xmm4, 10101010b ;k4_k5
+ pshuflw xmm3, xmm4, 11111111b ;k6_k7
-;void vp9_filter_block1d16_h8_ssse3
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned char *output_ptr,
-; unsigned int output_pitch,
-; unsigned int output_height,
-; short *filter
-;)
-global sym(vp9_filter_block1d16_h8_ssse3) PRIVATE
-sym(vp9_filter_block1d16_h8_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
+ ; copy the low quadword to the high one: tap pair now fills all 16 bytes
+ punpcklqdq xmm0, xmm0
+ punpcklqdq xmm1, xmm1
+ punpcklqdq xmm2, xmm2
+ punpcklqdq xmm3, xmm3
- ALIGN_STACK 16, rax
- sub rsp, 16*5
- %define k0k1 [rsp + 16*0]
- %define k2k3 [rsp + 16*1]
- %define k4k5 [rsp + 16*2]
- %define k6k7 [rsp + 16*3]
- %define krd [rsp + 16*4]
+ ; park the broadcast tap pairs and the rounding constant in the stack slots
+ movdqa k0k1, xmm0
+ movdqa k2k3, xmm1
+ ; broadcast the rounding dword to all four dwords (eight words of 64)
+ pshufd xmm5, xmm5, 0
+ movdqa k4k5, xmm2
+ movdqa k6k7, xmm3
+ movdqa krd, xmm5
+
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line
+ movsxd rdx, dword ptr arg(3) ;output_pitch
+ movsxd rcx, dword ptr arg(4) ;output_height
+; one iteration per output row; rcx counts the rows still to do
+.loop:
+ movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4
+
+ movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12
+ ; xmm0 = the 16 source bytes at offsets -3 .. 12
+ punpcklqdq xmm0, xmm3
+
+ ; For each tap pair: rearrange the source bytes with a shuffle table from
+ ; the read-only data section (presumably pairing every output pixel with
+ ; its two source bytes for that tap pair -- confirm against the tables),
+ ; then pmaddubsw multiplies the unsigned source bytes by the signed taps
+ ; and adds adjacent products into 16-bit words.
+ movdqa xmm1, xmm0
+ pshufb xmm0, [GLOBAL(shuf_t0t1)]
+ pmaddubsw xmm0, k0k1
+
+ movdqa xmm2, xmm1
+ pshufb xmm1, [GLOBAL(shuf_t2t3)]
+ pmaddubsw xmm1, k2k3
+
+ movdqa xmm4, xmm2
+ pshufb xmm2, [GLOBAL(shuf_t4t5)]
+ pmaddubsw xmm2, k4k5
+
+ pshufb xmm4, [GLOBAL(shuf_t6t7)]
+ pmaddubsw xmm4, k6k7
+
+ ; sum the four partial products with signed saturation, round, shift by
+ ; 7 and pack to unsigned bytes with saturation
+ paddsw xmm0, xmm1
+ paddsw xmm0, xmm4
+ paddsw xmm0, xmm2
+ paddsw xmm0, krd
+ psraw xmm0, 7
+ packuswb xmm0, xmm0
+; averaging variant only: blend with the 8 bytes already at the output
+%if %1
+ movq xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+
+ ; advance to the next source row, store 8 output bytes
+ lea rsi, [rsi + rax]
+ movq [rdi], xmm0
+
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+%endm
+
+%macro HORIZx16 1
mov rdx, arg(5) ;filter ptr
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;output_ptr
movsxd rdx, dword ptr arg(3) ;output_pitch
movsxd rcx, dword ptr arg(4) ;output_height
-.filter_block1d16_h8_rowloop_ssse3:
+.loop:
movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4
movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12
psraw xmm3, 7
packuswb xmm3, xmm3
punpcklqdq xmm0, xmm3
+%if %1
+ movdqa xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
lea rsi, [rsi + rax]
movdqa [rdi], xmm0
lea rdi, [rdi + rdx]
dec rcx
- jnz .filter_block1d16_h8_rowloop_ssse3
+ jnz .loop
+%endm
+
+;void vp9_filter_block1d4_h8_ssse3
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+; Plain (non-averaging) 4-wide horizontal filter: expands the shared HORIZx4
+; body with its parameter set to 0, so the filtered bytes are stored without
+; being blended with the existing output.
+; NOTE(review): SHADOW_ARGS_TO_STACK / SAVE_XMM / GET_GOT / ALIGN_STACK are
+; defined outside this file; their exact effect should be confirmed there.
+global sym(vp9_filter_block1d4_h8_ssse3) PRIVATE
+sym(vp9_filter_block1d4_h8_ssse3):
+ push rbp
+ mov rbp, rsp
+ ; six arguments, read through arg(0)..arg(5) inside the macro body
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ ; paired with RESTORE_GOT below; presumably sets up the addressing used
+ ; by the GLOBAL(...) shuffle-table references in the HORIZ macros
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ ; five 16-byte scratch slots; the macro fills them with movdqa
+ sub rsp, 16*5
+ %define k0k1 [rsp + 16*0]
+ %define k2k3 [rsp + 16*1]
+ %define k4k5 [rsp + 16*2]
+ %define k6k7 [rsp + 16*3]
+ %define krd [rsp + 16*4]
+
+ ; 0 = store only, no averaging with the destination
+ HORIZx4 0
+
+ ; release the scratch slots
+ add rsp, 16*5
+ ; presumably restores the stack pointer saved by ALIGN_STACK -- confirm
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp9_filter_block1d8_h8_ssse3
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+; Plain (non-averaging) 8-wide horizontal filter: expands the shared HORIZx8
+; body with its parameter set to 0, so the filtered bytes are stored without
+; being blended with the existing output.
+; NOTE(review): SHADOW_ARGS_TO_STACK / SAVE_XMM / GET_GOT / ALIGN_STACK are
+; defined outside this file; their exact effect should be confirmed there.
+global sym(vp9_filter_block1d8_h8_ssse3) PRIVATE
+sym(vp9_filter_block1d8_h8_ssse3):
+ push rbp
+ mov rbp, rsp
+ ; six arguments, read through arg(0)..arg(5) inside the macro body
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ ; paired with RESTORE_GOT below; presumably sets up the addressing used
+ ; by the GLOBAL(...) shuffle-table references in HORIZx8
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ ; five 16-byte scratch slots; HORIZx8 fills them with movdqa
+ sub rsp, 16*5
+ %define k0k1 [rsp + 16*0]
+ %define k2k3 [rsp + 16*1]
+ %define k4k5 [rsp + 16*2]
+ %define k6k7 [rsp + 16*3]
+ %define krd [rsp + 16*4]
+
+ ; 0 = store only, no averaging with the destination
+ HORIZx8 0
+
+ ; release the scratch slots
+ add rsp, 16*5
+ ; presumably restores the stack pointer saved by ALIGN_STACK -- confirm
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp9_filter_block1d16_h8_ssse3
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+; Plain (non-averaging) 16-wide horizontal filter: expands the shared
+; HORIZx16 body with its parameter set to 0, so the filtered bytes are
+; stored without being blended with the existing output.
+; HORIZx16 stores each row with movdqa, so every output row must be
+; 16-byte aligned.
+; NOTE(review): SHADOW_ARGS_TO_STACK / SAVE_XMM / GET_GOT / ALIGN_STACK are
+; defined outside this file; their exact effect should be confirmed there.
+global sym(vp9_filter_block1d16_h8_ssse3) PRIVATE
+sym(vp9_filter_block1d16_h8_ssse3):
+ push rbp
+ mov rbp, rsp
+ ; six arguments, read through arg(0)..arg(5) inside the macro body
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ ; paired with RESTORE_GOT below; presumably sets up the addressing used
+ ; by the GLOBAL(...) shuffle-table references in the HORIZ macros
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ ; five 16-byte scratch slots, named by the %defines below
+ sub rsp, 16*5
+ %define k0k1 [rsp + 16*0]
+ %define k2k3 [rsp + 16*1]
+ %define k4k5 [rsp + 16*2]
+ %define k6k7 [rsp + 16*3]
+ %define krd [rsp + 16*4]
+
+ ; 0 = store only, no averaging with the destination
+ HORIZx16 0
+
+ ; release the scratch slots
+ add rsp, 16*5
+ ; presumably restores the stack pointer saved by ALIGN_STACK -- confirm
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp9_filter_block1d4_h8_avg_ssse3
+;(
+;    unsigned char *src_ptr,
+;    unsigned int   src_pixels_per_line,
+;    unsigned char *output_ptr,
+;    unsigned int   output_pitch,
+;    unsigned int   output_height,
+;    short *filter
+;)
+; Averaging variant of the 4-wide horizontal filter: expands the shared
+; HORIZx4 body with its parameter set to 1, which makes it load the 4 bytes
+; already at the output, pavgb them with the filtered result, and store.
+; NOTE(review): SHADOW_ARGS_TO_STACK / SAVE_XMM / GET_GOT / ALIGN_STACK are
+; defined outside this file; their exact effect should be confirmed there.
+global sym(vp9_filter_block1d4_h8_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d4_h8_avg_ssse3):
+ push rbp
+ mov rbp, rsp
+ ; six arguments, read through arg(0)..arg(5) inside the macro body
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ ; paired with RESTORE_GOT below; presumably sets up the addressing used
+ ; by the GLOBAL(...) shuffle-table references in the HORIZ macros
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ ; five 16-byte scratch slots; the macro fills them with movdqa
+ sub rsp, 16*5
+ %define k0k1 [rsp + 16*0]
+ %define k2k3 [rsp + 16*1]
+ %define k4k5 [rsp + 16*2]
+ %define k6k7 [rsp + 16*3]
+ %define krd [rsp + 16*4]
+
+ ; 1 = average the filtered output with the existing destination bytes
+ HORIZx4 1
+
+ ; release the scratch slots
+ add rsp, 16*5
+ ; presumably restores the stack pointer saved by ALIGN_STACK -- confirm
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vp9_filter_block1d8_h8_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d8_h8_avg_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16*5
+ %define k0k1 [rsp + 16*0]
+ %define k2k3 [rsp + 16*1]
+ %define k4k5 [rsp + 16*2]
+ %define k6k7 [rsp + 16*3]
+ %define krd [rsp + 16*4]
+
+ HORIZx8 1
add rsp, 16*5
pop rsp
pop rbp
ret
+;void vp9_filter_block1d16_h8_avg_ssse3
+;(
+;    unsigned char *src_ptr,
+;    unsigned int   src_pixels_per_line,
+;    unsigned char *output_ptr,
+;    unsigned int   output_pitch,
+;    unsigned int   output_height,
+;    short *filter
+;)
+; Averaging variant of the 16-wide horizontal filter: expands the shared
+; HORIZx16 body with its parameter set to 1, which makes it load the 16
+; bytes already at the output, pavgb them with the filtered result, and
+; store.  Both the load and the store use movdqa, so every output row must
+; be 16-byte aligned.
+; NOTE(review): SHADOW_ARGS_TO_STACK / SAVE_XMM / GET_GOT / ALIGN_STACK are
+; defined outside this file; their exact effect should be confirmed there.
+global sym(vp9_filter_block1d16_h8_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d16_h8_avg_ssse3):
+ push rbp
+ mov rbp, rsp
+ ; six arguments, read through arg(0)..arg(5) inside the macro body
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ ; paired with RESTORE_GOT below; presumably sets up the addressing used
+ ; by the GLOBAL(...) shuffle-table references in the HORIZ macros
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ ; five 16-byte scratch slots, named by the %defines below
+ sub rsp, 16*5
+ %define k0k1 [rsp + 16*0]
+ %define k2k3 [rsp + 16*1]
+ %define k4k5 [rsp + 16*2]
+ %define k6k7 [rsp + 16*3]
+ %define krd [rsp + 16*4]
+
+ ; 1 = average the filtered output with the existing destination bytes
+ HORIZx16 1
+ ; release the scratch slots
+ add rsp, 16*5
+ ; presumably restores the stack pointer saved by ALIGN_STACK -- confirm
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
SECTION_RODATA
align 16
shuf_t0t1: