From: Ronald S. Bultje Date: Wed, 10 Jul 2013 18:17:19 +0000 (-0700) Subject: Replace copy_memNxM functions with a generic copy/avg function. X-Git-Tag: v1.3.0~897 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=decead7336246aa9f17469ae784c1ba0dc24b210;p=platform%2Fupstream%2Flibvpx.git Replace copy_memNxM functions with a generic copy/avg function. Change-Id: I3ce849452ed4f08527de9565a9914d5ee36170aa --- diff --git a/test/convolve_test.cc b/test/convolve_test.cc index fd2bd36..6e5002f 100644 --- a/test/convolve_test.cc +++ b/test/convolve_test.cc @@ -22,8 +22,8 @@ extern "C" { } namespace { -typedef void (*convolve_fn_t)(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, const int16_t *filter_y, int filter_y_stride, int w, int h); diff --git a/vp9/common/vp9_convolve.c b/vp9/common/vp9_convolve.c index 914afa7..6f1e418 100644 --- a/vp9/common/vp9_convolve.c +++ b/vp9/common/vp9_convolve.c @@ -38,8 +38,8 @@ */ #define ALIGN_FILTERS_256 1 -static void convolve_horiz_c(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +static void convolve_horiz_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x0, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int taps) { @@ -80,8 +80,8 @@ static void convolve_horiz_c(const uint8_t *src, int src_stride, } } -static void convolve_avg_horiz_c(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +static void convolve_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x0, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int taps) { @@ -122,8 +122,8 @@ static void convolve_avg_horiz_c(const uint8_t *src, int src_stride, } } -static void convolve_vert_c(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +static void convolve_vert_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y0, int y_step_q4, int w, int h, int taps) { @@ -164,8 +164,8 @@ static void convolve_vert_c(const uint8_t *src, int src_stride, } } -static void convolve_avg_vert_c(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +static void convolve_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y0, int y_step_q4, int w, int h, int taps) { @@ -207,8 +207,8 @@ static void convolve_avg_vert_c(const uint8_t *src, int src_stride, } } -static void convolve_c(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +static void convolve_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int taps) { @@ -237,8 +237,8 @@ static void convolve_c(const uint8_t *src, int src_stride, w, h, taps); } -static void convolve_avg_c(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +static void convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int taps) { @@ -267,8 +267,8 @@ static void convolve_avg_c(const uint8_t *src, int src_stride, w, h, taps); } -void vp9_convolve8_horiz_c(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { @@ -277,8 +277,8 @@ void vp9_convolve8_horiz_c(const uint8_t *src, int src_stride, w, h, 8); } -void vp9_convolve8_avg_horiz_c(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { @@ -287,8 +287,8 @@ void vp9_convolve8_avg_horiz_c(const uint8_t *src, int src_stride, w, h, 8); } -void vp9_convolve8_vert_c(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { @@ -297,8 +297,8 @@ void vp9_convolve8_vert_c(const uint8_t *src, int src_stride, w, h, 8); } -void vp9_convolve8_avg_vert_c(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { @@ -307,8 +307,8 @@ void vp9_convolve8_avg_vert_c(const uint8_t *src, int src_stride, w, h, 8); } -void vp9_convolve8_c(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { @@ -317,8 +317,8 @@ void vp9_convolve8_c(const uint8_t *src, int src_stride, w, h, 8); } -void vp9_convolve8_avg_c(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { @@ -339,33 +339,25 @@ void vp9_convolve8_avg_c(const uint8_t *src, int src_stride, w, h); } -void vp9_convolve_copy(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, - int w, int h) { - if (w == 16 && h == 16) { - vp9_copy_mem16x16(src, src_stride, dst, dst_stride); - } else if (w == 8 && h == 8) { - vp9_copy_mem8x8(src, src_stride, dst, dst_stride); - } else if (w == 8 && h == 4) { - vp9_copy_mem8x4(src, src_stride, dst, dst_stride); - } else { - int r; - - for (r = h; r > 0; --r) { - memcpy(dst, src, w); - src += src_stride; - dst += dst_stride; - } +void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int filter_x_stride, + const int16_t *filter_y, int filter_y_stride, + int w, int h) { + int r; + + for (r = h; r > 0; --r) { + memcpy(dst, src, w); + src += src_stride; + dst += dst_stride; } } -void vp9_convolve_avg(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, - int w, int h) { +void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int filter_x_stride, + const int16_t *filter_y, int filter_y_stride, + int w, int h) { int x, y; for (y = 0; y < h; ++y) { diff --git a/vp9/common/vp9_convolve.h b/vp9/common/vp9_convolve.h index 0596080..3de8111 100644 --- a/vp9/common/vp9_convolve.h +++ b/vp9/common/vp9_convolve.h @@ -13,26 +13,12 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" -typedef void (*convolve_fn_t)(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -// Not a convolution, a block copy conforming to the convolution prototype -void vp9_convolve_copy(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h); - -// Not a convolution, a block average conforming to the convolution prototype -void vp9_convolve_avg(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h); - struct subpix_fn_table { const int16_t (*filter_x)[8]; const int16_t (*filter_y)[8]; diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c index 265a19a..c29fd14 100644 --- a/vp9/common/vp9_reconinter.c +++ b/vp9/common/vp9_reconinter.c @@ -194,93 +194,6 @@ void vp9_setup_interp_filters(MACROBLOCKD *xd, assert(((intptr_t)xd->subpix.filter_x & 0xff) == 0); } -void vp9_copy_mem16x16_c(const uint8_t *src, - int src_stride, - uint8_t *dst, - int dst_stride) { - int r; - - for (r = 0; r < 16; r++) { -#if !(CONFIG_FAST_UNALIGNED) - dst[0] = src[0]; - dst[1] = src[1]; - dst[2] = src[2]; - dst[3] = src[3]; - dst[4] = src[4]; - dst[5] = src[5]; - dst[6] = src[6]; - dst[7] = src[7]; - dst[8] = src[8]; - dst[9] = src[9]; - dst[10] = src[10]; - dst[11] = src[11]; - dst[12] = src[12]; - dst[13] = src[13]; - dst[14] = src[14]; - dst[15] = src[15]; - -#else - ((uint32_t *)dst)[0] = ((const uint32_t *)src)[0]; - ((uint32_t *)dst)[1] = ((const uint32_t *)src)[1]; - ((uint32_t *)dst)[2] = ((const uint32_t *)src)[2]; - ((uint32_t *)dst)[3] = ((const uint32_t *)src)[3]; - -#endif - src += src_stride; - dst += dst_stride; - } -} - -void vp9_copy_mem8x8_c(const uint8_t *src, - int src_stride, - uint8_t *dst, - int dst_stride) { - int r; - - for (r = 0; r < 8; r++) { -#if !(CONFIG_FAST_UNALIGNED) - dst[0] = src[0]; - dst[1] = src[1]; - dst[2] = src[2]; - dst[3] = src[3]; - dst[4] = src[4]; - dst[5] = src[5]; - dst[6] = src[6]; - dst[7] = src[7]; -#else - ((uint32_t *)dst)[0] = ((const uint32_t *)src)[0]; - ((uint32_t *)dst)[1] = ((const uint32_t *)src)[1]; -#endif - src += src_stride; - dst += dst_stride; - } -} - -void vp9_copy_mem8x4_c(const uint8_t *src, - int src_stride, - uint8_t *dst, - int dst_stride) { - int r; - - for (r = 0; r < 4; r++) { -#if !(CONFIG_FAST_UNALIGNED) - dst[0] = src[0]; - dst[1] = src[1]; - dst[2] = src[2]; - dst[3] = src[3]; - dst[4] = src[4]; - dst[5] = src[5]; - dst[6] = src[6]; - dst[7] = src[7]; -#else - ((uint32_t *)dst)[0] = ((const uint32_t *)src)[0]; - ((uint32_t *)dst)[1] = ((const uint32_t *)src)[1]; -#endif - src += src_stride; - dst += dst_stride; - } -} - void vp9_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int_mv *src_mv, diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index d861a7a..459336e 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -43,17 +43,6 @@ specialize vp9_idct_add_32x32 # # RECON # -prototype void vp9_copy_mem16x16 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch" -specialize vp9_copy_mem16x16 mmx sse2 dspr2 -vp9_copy_mem16x16_dspr2=vp9_copy_mem16x16_dspr2 - -prototype void vp9_copy_mem8x8 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch" -specialize vp9_copy_mem8x8 mmx dspr2 -vp9_copy_mem8x8_dspr2=vp9_copy_mem8x8_dspr2 - -prototype void vp9_copy_mem8x4 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch" -specialize vp9_copy_mem8x4 mmx - prototype void vp9_d27_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" specialize vp9_d27_predictor_4x4 @@ -275,22 +264,28 @@ specialize vp9_blend_b # # Sub Pixel Filters # -prototype void vp9_convolve8 "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" +prototype void vp9_convolve_copy "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" +specialize vp9_convolve_copy sse2 + +prototype void vp9_convolve_avg "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" +specialize vp9_convolve_avg sse2 + +prototype void vp9_convolve8 "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" specialize vp9_convolve8 ssse3 -prototype void vp9_convolve8_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" +prototype void vp9_convolve8_horiz "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" specialize vp9_convolve8_horiz ssse3 -prototype void vp9_convolve8_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" +prototype void vp9_convolve8_vert "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" specialize vp9_convolve8_vert ssse3 -prototype void vp9_convolve8_avg "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" +prototype void vp9_convolve8_avg "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" specialize vp9_convolve8_avg ssse3 -prototype void vp9_convolve8_avg_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" +prototype void vp9_convolve8_avg_horiz "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" specialize vp9_convolve8_avg_horiz ssse3 -prototype void vp9_convolve8_avg_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" +prototype void vp9_convolve8_avg_vert "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" specialize vp9_convolve8_avg_vert ssse3 # diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c index 2b66834..98fc4dc 100644 --- a/vp9/common/x86/vp9_asm_stubs.c +++ b/vp9/common/x86/vp9_asm_stubs.c @@ -121,8 +121,8 @@ void vp9_filter_block1d4_h8_avg_ssse3(const unsigned char *src_ptr, unsigned int output_height, const short *filter); -void vp9_convolve8_horiz_ssse3(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { @@ -159,8 +159,8 @@ void vp9_convolve8_horiz_ssse3(const uint8_t *src, int src_stride, } } -void vp9_convolve8_vert_ssse3(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { @@ -197,8 +197,8 @@ void vp9_convolve8_vert_ssse3(const uint8_t *src, int src_stride, } } -void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { @@ -235,8 +235,8 @@ void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, int src_stride, } } -void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { @@ -273,8 +273,8 @@ void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, int src_stride, } } -void vp9_convolve8_ssse3(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { @@ -294,8 +294,8 @@ void vp9_convolve8_ssse3(const uint8_t *src, int src_stride, } } -void vp9_convolve8_avg_ssse3(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { diff --git a/vp9/common/x86/vp9_copy_sse2.asm b/vp9/common/x86/vp9_copy_sse2.asm new file mode 100644 index 0000000..dd522c6 --- /dev/null +++ b/vp9/common/x86/vp9_copy_sse2.asm @@ -0,0 +1,152 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +%macro convolve_fn 1 +INIT_XMM sse2 +cglobal convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \ + fx, fxs, fy, fys, w, h + mov r4d, dword wm + cmp r4d, 4 + je .w4 + cmp r4d, 8 + je .w8 + cmp r4d, 16 + je .w16 + cmp r4d, 32 + je .w32 + + mov r4d, dword hm +.loop64: + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+32] + movu m3, [srcq+48] + add srcq, src_strideq +%ifidn %1, avg + pavgb m0, [dstq] + pavgb m1, [dstq+16] + pavgb m2, [dstq+32] + pavgb m3, [dstq+48] +%endif + mova [dstq ], m0 + mova [dstq+16], m1 + mova [dstq+32], m2 + mova [dstq+48], m3 + add dstq, dst_strideq + dec r4d + jnz .loop64 + RET + +.w32: + mov r4d, dword hm +.loop32: + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+src_strideq] + movu m3, [srcq+src_strideq+16] + lea srcq, [srcq+src_strideq*2] +%ifidn %1, avg + pavgb m0, [dstq] + pavgb m1, [dstq +16] + pavgb m2, [dstq+dst_strideq] + pavgb m3, [dstq+dst_strideq+16] +%endif + mova [dstq ], m0 + mova [dstq +16], m1 + mova [dstq+dst_strideq ], m2 + mova [dstq+dst_strideq+16], m3 + lea dstq, [dstq+dst_strideq*2] + sub r4d, 2 + jnz .loop32 + RET + +.w16: + mov r4d, dword hm + lea r5q, [src_strideq*3] + lea r6q, [dst_strideq*3] +.loop16: + movu m0, [srcq] + movu m1, [srcq+src_strideq] + movu m2, [srcq+src_strideq*2] + movu m3, [srcq+r5q] + lea srcq, [srcq+src_strideq*4] +%ifidn %1, avg + pavgb m0, [dstq] + pavgb m1, [dstq+dst_strideq] + pavgb m2, [dstq+dst_strideq*2] + pavgb m3, [dstq+r6q] +%endif + mova [dstq ], m0 + mova [dstq+dst_strideq ], m1 + mova [dstq+dst_strideq*2], m2 + mova [dstq+r6q ], m3 + lea dstq, [dstq+dst_strideq*4] + sub r4d, 4 + jnz .loop16 + RET + +INIT_MMX sse +.w8: + mov r4d, dword hm + lea r5q, [src_strideq*3] + lea r6q, [dst_strideq*3] +.loop8: + movu m0, [srcq] + movu m1, [srcq+src_strideq] + movu m2, [srcq+src_strideq*2] + movu m3, [srcq+r5q] + lea srcq, [srcq+src_strideq*4] +%ifidn %1, avg + pavgb m0, [dstq] + pavgb m1, [dstq+dst_strideq] + pavgb m2, [dstq+dst_strideq*2] + pavgb m3, [dstq+r6q] +%endif + mova [dstq ], m0 + mova [dstq+dst_strideq ], m1 + mova [dstq+dst_strideq*2], m2 + mova [dstq+r6q ], m3 + lea dstq, [dstq+dst_strideq*4] + sub r4d, 4 + jnz .loop8 + RET + +.w4: + mov r4d, dword hm + lea r5q, [src_strideq*3] + lea r6q, [dst_strideq*3] +.loop4: + movh m0, [srcq] + movh m1, [srcq+src_strideq] + movh m2, [srcq+src_strideq*2] + movh m3, [srcq+r5q] + lea srcq, [srcq+src_strideq*4] +%ifidn %1, avg + pavgb m0, [dstq] + pavgb m1, [dstq+dst_strideq] + pavgb m2, [dstq+dst_strideq*2] + pavgb m3, [dstq+r6q] +%endif + movh [dstq ], m0 + movh [dstq+dst_strideq ], m1 + movh [dstq+dst_strideq*2], m2 + movh [dstq+r6q ], m3 + lea dstq, [dstq+dst_strideq*4] + sub r4d, 4 + jnz .loop4 + RET +%endmacro + +convolve_fn copy +convolve_fn avg diff --git a/vp9/common/x86/vp9_recon_mmx.asm b/vp9/common/x86/vp9_recon_mmx.asm deleted file mode 100644 index 6fbbe48..0000000 --- a/vp9/common/x86/vp9_recon_mmx.asm +++ /dev/null @@ -1,272 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" -;void copy_mem8x8_mmx( -; unsigned char *src, -; int src_stride, -; unsigned char *dst, -; int dst_stride -; ) -global sym(vp9_copy_mem8x8_mmx) PRIVATE -sym(vp9_copy_mem8x8_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src; - movq mm0, [rsi] - - movsxd rax, dword ptr arg(1) ;src_stride; - mov rdi, arg(2) ;dst; - - movq mm1, [rsi+rax] - movq mm2, [rsi+rax*2] - - movsxd rcx, dword ptr arg(3) ;dst_stride - lea rsi, [rsi+rax*2] - - movq [rdi], mm0 - add rsi, rax - - movq [rdi+rcx], mm1 - movq [rdi+rcx*2], mm2 - - - lea rdi, [rdi+rcx*2] - movq mm3, [rsi] - - add rdi, rcx - movq mm4, [rsi+rax] - - movq mm5, [rsi+rax*2] - movq [rdi], mm3 - - lea rsi, [rsi+rax*2] - movq [rdi+rcx], mm4 - - movq [rdi+rcx*2], mm5 - lea rdi, [rdi+rcx*2] - - movq mm0, [rsi+rax] - movq mm1, [rsi+rax*2] - - movq [rdi+rcx], mm0 - movq [rdi+rcx*2],mm1 - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - -;void copy_mem8x4_mmx( -; unsigned char *src, -; int src_stride, -; unsigned char *dst, -; int dst_stride -; ) -global sym(vp9_copy_mem8x4_mmx) PRIVATE -sym(vp9_copy_mem8x4_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src; - movq mm0, [rsi] - - movsxd rax, dword ptr arg(1) ;src_stride; - mov rdi, arg(2) ;dst; - - movq mm1, [rsi+rax] - movq mm2, [rsi+rax*2] - - movsxd rcx, dword ptr arg(3) ;dst_stride - lea rsi, [rsi+rax*2] - - movq [rdi], mm0 - movq [rdi+rcx], mm1 - - movq [rdi+rcx*2], mm2 - lea rdi, [rdi+rcx*2] - - movq mm3, [rsi+rax] - movq [rdi+rcx], mm3 - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - -;void copy_mem16x16_mmx( -; unsigned char *src, -; int src_stride, -; unsigned char *dst, -; int dst_stride -; ) -global sym(vp9_copy_mem16x16_mmx) PRIVATE -sym(vp9_copy_mem16x16_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src; - movsxd rax, dword ptr arg(1) ;src_stride; - - mov rdi, arg(2) ;dst; - movsxd rcx, dword ptr arg(3) ;dst_stride - - movq mm0, [rsi] - movq mm3, [rsi+8]; - - movq mm1, [rsi+rax] - movq mm4, [rsi+rax+8] - - movq mm2, [rsi+rax*2] - movq mm5, [rsi+rax*2+8] - - lea rsi, [rsi+rax*2] - add rsi, rax - - movq [rdi], mm0 - movq [rdi+8], mm3 - - movq [rdi+rcx], mm1 - movq [rdi+rcx+8], mm4 - - movq [rdi+rcx*2], mm2 - movq [rdi+rcx*2+8], mm5 - - lea rdi, [rdi+rcx*2] - add rdi, rcx - - movq mm0, [rsi] - movq mm3, [rsi+8]; - - movq mm1, [rsi+rax] - movq mm4, [rsi+rax+8] - - movq mm2, [rsi+rax*2] - movq mm5, [rsi+rax*2+8] - - lea rsi, [rsi+rax*2] - add rsi, rax - - movq [rdi], mm0 - movq [rdi+8], mm3 - - movq [rdi+rcx], mm1 - movq [rdi+rcx+8], mm4 - - movq [rdi+rcx*2], mm2 - movq [rdi+rcx*2+8], mm5 - - lea rdi, [rdi+rcx*2] - add rdi, rcx - - movq mm0, [rsi] - movq mm3, [rsi+8]; - - movq mm1, [rsi+rax] - movq mm4, [rsi+rax+8] - - movq mm2, [rsi+rax*2] - movq mm5, [rsi+rax*2+8] - - lea rsi, [rsi+rax*2] - add rsi, rax - - movq [rdi], mm0 - movq [rdi+8], mm3 - - movq [rdi+rcx], mm1 - movq [rdi+rcx+8], mm4 - - movq [rdi+rcx*2], mm2 - movq [rdi+rcx*2+8], mm5 - - lea rdi, [rdi+rcx*2] - add rdi, rcx - - movq mm0, [rsi] - movq mm3, [rsi+8]; - - movq mm1, [rsi+rax] - movq mm4, [rsi+rax+8] - - movq mm2, [rsi+rax*2] - movq mm5, [rsi+rax*2+8] - - lea rsi, [rsi+rax*2] - add rsi, rax - - movq [rdi], mm0 - movq [rdi+8], mm3 - - movq [rdi+rcx], mm1 - movq [rdi+rcx+8], mm4 - - movq [rdi+rcx*2], mm2 - movq [rdi+rcx*2+8], mm5 - - lea rdi, [rdi+rcx*2] - add rdi, rcx - - movq mm0, [rsi] - movq mm3, [rsi+8]; - - movq mm1, [rsi+rax] - movq mm4, [rsi+rax+8] - - movq mm2, [rsi+rax*2] - movq mm5, [rsi+rax*2+8] - - lea rsi, [rsi+rax*2] - add rsi, rax - - movq [rdi], mm0 - movq [rdi+8], mm3 - - movq [rdi+rcx], mm1 - movq [rdi+rcx+8], mm4 - - movq [rdi+rcx*2], mm2 - movq [rdi+rcx*2+8], mm5 - - lea rdi, [rdi+rcx*2] - add rdi, rcx - - movq mm0, [rsi] - movq mm3, [rsi+8]; - - movq [rdi], mm0 - movq [rdi+8], mm3 - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret diff --git a/vp9/common/x86/vp9_recon_sse2.asm b/vp9/common/x86/vp9_recon_sse2.asm deleted file mode 100644 index f7cc611..0000000 --- a/vp9/common/x86/vp9_recon_sse2.asm +++ /dev/null @@ -1,115 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" -;void copy_mem16x16_sse2( -; unsigned char *src, -; int src_stride, -; unsigned char *dst, -; int dst_stride -; ) -global sym(vp9_copy_mem16x16_sse2) PRIVATE -sym(vp9_copy_mem16x16_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src; - movdqu xmm0, [rsi] - - movsxd rax, dword ptr arg(1) ;src_stride; - mov rdi, arg(2) ;dst; - - movdqu xmm1, [rsi+rax] - movdqu xmm2, [rsi+rax*2] - - movsxd rcx, dword ptr arg(3) ;dst_stride - lea rsi, [rsi+rax*2] - - movdqa [rdi], xmm0 - add rsi, rax - - movdqa [rdi+rcx], xmm1 - movdqa [rdi+rcx*2],xmm2 - - lea rdi, [rdi+rcx*2] - movdqu xmm3, [rsi] - - add rdi, rcx - movdqu xmm4, [rsi+rax] - - movdqu xmm5, [rsi+rax*2] - lea rsi, [rsi+rax*2] - - movdqa [rdi], xmm3 - add rsi, rax - - movdqa [rdi+rcx], xmm4 - movdqa [rdi+rcx*2],xmm5 - - lea rdi, [rdi+rcx*2] - movdqu xmm0, [rsi] - - add rdi, rcx - movdqu xmm1, [rsi+rax] - - movdqu xmm2, [rsi+rax*2] - lea rsi, [rsi+rax*2] - - movdqa [rdi], xmm0 - add rsi, rax - - movdqa [rdi+rcx], xmm1 - - movdqa [rdi+rcx*2], xmm2 - movdqu xmm3, [rsi] - - movdqu xmm4, [rsi+rax] - lea rdi, [rdi+rcx*2] - - add rdi, rcx - movdqu xmm5, [rsi+rax*2] - - lea rsi, [rsi+rax*2] - movdqa [rdi], xmm3 - - add rsi, rax - movdqa [rdi+rcx], xmm4 - - movdqa [rdi+rcx*2],xmm5 - movdqu xmm0, [rsi] - - lea rdi, [rdi+rcx*2] - movdqu xmm1, [rsi+rax] - - add rdi, rcx - movdqu xmm2, [rsi+rax*2] - - lea rsi, [rsi+rax*2] - movdqa [rdi], xmm0 - - movdqa [rdi+rcx], xmm1 - movdqa [rdi+rcx*2],xmm2 - - movdqu xmm3, [rsi+rax] - lea rdi, [rdi+rcx*2] - - movdqa [rdi+rcx], xmm3 - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk index 1079e20..ee744d5 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@ -75,10 +75,9 @@ VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_asm_stubs.c VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_intrin_sse2.c VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/vp9_postproc.h VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/vp9_postproc.c -VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_recon_mmx.asm VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm +VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_copy_sse2.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_loopfilter_sse2.asm -VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_recon_sse2.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_intrapred_sse2.asm VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_intrapred_ssse3.asm VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm