From: Jingning Han Date: Mon, 6 Jul 2015 23:52:24 +0000 (-0700) Subject: Unify subtract function used in VP8/9 X-Git-Tag: v1.5.0~471^2~4 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=0ede9f52b796b6d8e02046b24f68a3db8b9f5920;p=platform%2Fupstream%2Flibvpx.git Unify subtract function used in VP8/9 This commit replaces the vp8_ prefixed subtract function with the common vpx_subtract_block function. It removes redundant SIMD optimization codes and unit tests. Change-Id: I42e086c32c93c6125e452dcaa6ed04337fe028d9 --- diff --git a/test/subtract_test.cc b/test/subtract_test.cc deleted file mode 100644 index ff42725..0000000 --- a/test/subtract_test.cc +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright (c) 2012 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include "third_party/googletest/src/include/gtest/gtest.h" -#include "test/acm_random.h" -#include "test/clear_system_state.h" -#include "test/register_state_check.h" -#include "./vpx_config.h" -#include "./vp8_rtcd.h" -#include "vp8/common/blockd.h" -#include "vp8/encoder/block.h" -#include "vpx_mem/vpx_mem.h" - -typedef void (*SubtractBlockFunc)(BLOCK *be, BLOCKD *bd, int pitch); - -namespace { - -class SubtractBlockTest : public ::testing::TestWithParam<SubtractBlockFunc> { - public: - virtual void TearDown() { - libvpx_test::ClearSystemState(); - } -}; - -using libvpx_test::ACMRandom; - -TEST_P(SubtractBlockTest, SimpleSubtract) { - ACMRandom rnd(ACMRandom::DeterministicSeed()); - BLOCK be; - BLOCKD bd; - // in libvpx, this stride is always 16 - const int kDiffPredStride = 16; - const int kSrcStride[] = {32, 16, 8, 4, 0}; - const int kBlockWidth = 4; - const int kBlockHeight = 4; - - // Allocate... align to 16 for mmx/sse tests - uint8_t *source = reinterpret_cast<uint8_t*>( - vpx_memalign(16, kBlockHeight * kSrcStride[0] * sizeof(*source))); - be.src_diff = reinterpret_cast<int16_t*>( - vpx_memalign(16, kBlockHeight * kDiffPredStride * sizeof(*be.src_diff))); - bd.predictor = reinterpret_cast<unsigned char*>( - vpx_memalign(16, kBlockHeight * kDiffPredStride * sizeof(*bd.predictor))); - - for (int i = 0; kSrcStride[i] > 0; ++i) { - // start at block0 - be.src = 0; - be.base_src = &source; - be.src_stride = kSrcStride[i]; - - // set difference - int16_t *src_diff = be.src_diff; - for (int r = 0; r < kBlockHeight; ++r) { - for (int c = 0; c < kBlockWidth; ++c) { - src_diff[c] = static_cast<int16_t>(0xa5a5u); - } - src_diff += kDiffPredStride; - } - - // set destination - uint8_t *base_src = *be.base_src; - for (int r = 0; r < kBlockHeight; ++r) { - for (int c = 0; c < kBlockWidth; ++c) { - base_src[c] = rnd.Rand8(); - } - base_src += be.src_stride; - } - - // set predictor - uint8_t *predictor = bd.predictor; - for (int r = 0; r < kBlockHeight; ++r) { - for (int c = 0; c < kBlockWidth; ++c) { - predictor[c] = 
rnd.Rand8(); - } - predictor += kDiffPredStride; - } - - ASM_REGISTER_STATE_CHECK(GetParam()(&be, &bd, kDiffPredStride)); - - base_src = *be.base_src; - src_diff = be.src_diff; - predictor = bd.predictor; - for (int r = 0; r < kBlockHeight; ++r) { - for (int c = 0; c < kBlockWidth; ++c) { - EXPECT_EQ(base_src[c], (src_diff[c] + predictor[c])) << "r = " << r - << ", c = " << c; - } - src_diff += kDiffPredStride; - predictor += kDiffPredStride; - base_src += be.src_stride; - } - } - vpx_free(be.src_diff); - vpx_free(source); - vpx_free(bd.predictor); -} - -INSTANTIATE_TEST_CASE_P(C, SubtractBlockTest, - ::testing::Values(vp8_subtract_b_c)); - -#if HAVE_NEON -INSTANTIATE_TEST_CASE_P(NEON, SubtractBlockTest, - ::testing::Values(vp8_subtract_b_neon)); -#endif - -#if HAVE_MMX -INSTANTIATE_TEST_CASE_P(MMX, SubtractBlockTest, - ::testing::Values(vp8_subtract_b_mmx)); -#endif - -#if HAVE_SSE2 -INSTANTIATE_TEST_CASE_P(SSE2, SubtractBlockTest, - ::testing::Values(vp8_subtract_b_sse2)); -#endif - -} // namespace diff --git a/test/test.mk b/test/test.mk index 8415117..a8a365e 100644 --- a/test/test.mk +++ b/test/test.mk @@ -104,7 +104,6 @@ endif LIBVPX_TEST_SRCS-$(CONFIG_POSTPROC) += pp_filter_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += vp8_decrypt_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += set_roi.cc -LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += subtract_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += variance_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_fdct4x4_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += quantize_test.cc diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl index fed2088..960c131 100644 --- a/vp8/common/rtcd_defs.pl +++ b/vp8/common/rtcd_defs.pl @@ -343,15 +343,6 @@ add_proto qw/int vp8_mbuverror/, "struct macroblock *mb"; specialize qw/vp8_mbuverror mmx sse2/; $vp8_mbuverror_sse2=vp8_mbuverror_xmm; -add_proto qw/void vp8_subtract_b/, "struct block *be, struct blockd *bd, int pitch"; -specialize qw/vp8_subtract_b mmx sse2 
neon/; - -add_proto qw/void vp8_subtract_mby/, "short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride"; -specialize qw/vp8_subtract_mby mmx sse2 neon/; - -add_proto qw/void vp8_subtract_mbuv/, "short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride"; -specialize qw/vp8_subtract_mbuv mmx sse2 neon/; - # # Motion search # diff --git a/vp8/encoder/arm/neon/subtract_neon.c b/vp8/encoder/arm/neon/subtract_neon.c deleted file mode 100644 index d3ab7b1..0000000 --- a/vp8/encoder/arm/neon/subtract_neon.c +++ /dev/null @@ -1,154 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include <arm_neon.h> -#include "vp8/encoder/block.h" - -void vp8_subtract_b_neon( - BLOCK *be, - BLOCKD *bd, - int pitch) { - unsigned char *src_ptr, *predictor; - int src_stride; - int16_t *src_diff; - uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; - uint16x8_t q10u16, q11u16, q12u16, q13u16; - - src_ptr = *be->base_src + be->src; - src_stride = be->src_stride; - predictor = bd->predictor; - - d0u8 = vld1_u8(src_ptr); - src_ptr += src_stride; - d2u8 = vld1_u8(src_ptr); - src_ptr += src_stride; - d4u8 = vld1_u8(src_ptr); - src_ptr += src_stride; - d6u8 = vld1_u8(src_ptr); - - d1u8 = vld1_u8(predictor); - predictor += pitch; - d3u8 = vld1_u8(predictor); - predictor += pitch; - d5u8 = vld1_u8(predictor); - predictor += pitch; - d7u8 = vld1_u8(predictor); - - q10u16 = vsubl_u8(d0u8, d1u8); - q11u16 = vsubl_u8(d2u8, d3u8); - q12u16 = vsubl_u8(d4u8, d5u8); - q13u16 = vsubl_u8(d6u8, d7u8); - - src_diff = be->src_diff; - vst1_u16((uint16_t *)src_diff, vget_low_u16(q10u16)); - src_diff += pitch; - vst1_u16((uint16_t *)src_diff, vget_low_u16(q11u16)); - src_diff += pitch; - vst1_u16((uint16_t *)src_diff, vget_low_u16(q12u16)); - src_diff += pitch; - vst1_u16((uint16_t *)src_diff, vget_low_u16(q13u16)); - return; -} - -void vp8_subtract_mby_neon( - int16_t *diff, - unsigned char *src, - int src_stride, - unsigned char *pred, - int pred_stride) { - int i; - uint8x16_t q0u8, q1u8, q2u8, q3u8; - uint16x8_t q8u16, q9u16, q10u16, q11u16; - - for (i = 0; i < 8; i++) { // subtract_mby_loop - q0u8 = vld1q_u8(src); - src += src_stride; - q2u8 = vld1q_u8(src); - src += src_stride; - q1u8 = vld1q_u8(pred); - pred += pred_stride; - q3u8 = vld1q_u8(pred); - pred += pred_stride; - - q8u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q1u8)); - q9u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q1u8)); - q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q3u8)); - q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q3u8)); - - vst1q_u16((uint16_t *)diff, q8u16); - diff += 8; - 
vst1q_u16((uint16_t *)diff, q9u16); - diff += 8; - vst1q_u16((uint16_t *)diff, q10u16); - diff += 8; - vst1q_u16((uint16_t *)diff, q11u16); - diff += 8; - } - return; -} - -void vp8_subtract_mbuv_neon( - int16_t *diff, - unsigned char *usrc, - unsigned char *vsrc, - int src_stride, - unsigned char *upred, - unsigned char *vpred, - int pred_stride) { - int i, j; - unsigned char *src_ptr, *pred_ptr; - uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; - uint16x8_t q8u16, q9u16, q10u16, q11u16; - - diff += 256; - for (i = 0; i < 2; i++) { - if (i == 0) { - src_ptr = usrc; - pred_ptr = upred; - } else if (i == 1) { - src_ptr = vsrc; - pred_ptr = vpred; - } - - for (j = 0; j < 2; j++) { - d0u8 = vld1_u8(src_ptr); - src_ptr += src_stride; - d1u8 = vld1_u8(pred_ptr); - pred_ptr += pred_stride; - d2u8 = vld1_u8(src_ptr); - src_ptr += src_stride; - d3u8 = vld1_u8(pred_ptr); - pred_ptr += pred_stride; - d4u8 = vld1_u8(src_ptr); - src_ptr += src_stride; - d5u8 = vld1_u8(pred_ptr); - pred_ptr += pred_stride; - d6u8 = vld1_u8(src_ptr); - src_ptr += src_stride; - d7u8 = vld1_u8(pred_ptr); - pred_ptr += pred_stride; - - q8u16 = vsubl_u8(d0u8, d1u8); - q9u16 = vsubl_u8(d2u8, d3u8); - q10u16 = vsubl_u8(d4u8, d5u8); - q11u16 = vsubl_u8(d6u8, d7u8); - - vst1q_u16((uint16_t *)diff, q8u16); - diff += 8; - vst1q_u16((uint16_t *)diff, q9u16); - diff += 8; - vst1q_u16((uint16_t *)diff, q10u16); - diff += 8; - vst1q_u16((uint16_t *)diff, q11u16); - diff += 8; - } - } - return; -} diff --git a/vp8/encoder/encodemb.c b/vp8/encoder/encodemb.c index 820b137..cf180c1 100644 --- a/vp8/encoder/encodemb.c +++ b/vp8/encoder/encodemb.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "./vpx_dsp_rtcd.h" #include "vpx_config.h" #include "vp8_rtcd.h" @@ -19,80 +20,29 @@ #include "vpx_mem/vpx_mem.h" #include "rdopt.h" -// TODO(jingning,johannkoenig): use vpx_subtract_block to replace -// codec specified vp9_subtract_ functions. 
-void vp8_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch) -{ - unsigned char *src_ptr = (*(be->base_src) + be->src); - short *diff_ptr = be->src_diff; - unsigned char *pred_ptr = bd->predictor; - int src_stride = be->src_stride; - - int r, c; +void vp8_subtract_b(BLOCK *be, BLOCKD *bd, int pitch) { + unsigned char *src_ptr = (*(be->base_src) + be->src); + short *diff_ptr = be->src_diff; + unsigned char *pred_ptr = bd->predictor; + int src_stride = be->src_stride; - for (r = 0; r < 4; r++) - { - for (c = 0; c < 4; c++) - { - diff_ptr[c] = src_ptr[c] - pred_ptr[c]; - } - - diff_ptr += pitch; - pred_ptr += pitch; - src_ptr += src_stride; - } + vpx_subtract_block(4, 4, diff_ptr, pitch, src_ptr, src_stride, + pred_ptr, pitch); } -void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, +void vp8_subtract_mbuv(short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, - unsigned char *vpred, int pred_stride) -{ - short *udiff = diff + 256; - short *vdiff = diff + 320; - - int r, c; + unsigned char *vpred, int pred_stride) { + short *udiff = diff + 256; + short *vdiff = diff + 320; - for (r = 0; r < 8; r++) - { - for (c = 0; c < 8; c++) - { - udiff[c] = usrc[c] - upred[c]; - } - - udiff += 8; - upred += pred_stride; - usrc += src_stride; - } - - for (r = 0; r < 8; r++) - { - for (c = 0; c < 8; c++) - { - vdiff[c] = vsrc[c] - vpred[c]; - } - - vdiff += 8; - vpred += pred_stride; - vsrc += src_stride; - } + vpx_subtract_block(8, 8, udiff, 8, usrc, src_stride, upred, pred_stride); + vpx_subtract_block(8, 8, vdiff, 8, vsrc, src_stride, vpred, pred_stride); } -void vp8_subtract_mby_c(short *diff, unsigned char *src, int src_stride, - unsigned char *pred, int pred_stride) -{ - int r, c; - - for (r = 0; r < 16; r++) - { - for (c = 0; c < 16; c++) - { - diff[c] = src[c] - pred[c]; - } - - diff += 16; - pred += pred_stride; - src += src_stride; - } +void vp8_subtract_mby(short *diff, unsigned char *src, int src_stride, + 
unsigned char *pred, int pred_stride) { + vpx_subtract_block(16, 16, diff, 16, src, src_stride, pred, pred_stride); } static void vp8_subtract_mb(MACROBLOCK *x) diff --git a/vp8/encoder/encodemb.h b/vp8/encoder/encodemb.h index 0b3ec87..10b3d86 100644 --- a/vp8/encoder/encodemb.h +++ b/vp8/encoder/encodemb.h @@ -19,6 +19,13 @@ extern "C" { #endif void vp8_encode_inter16x16(MACROBLOCK *x); +void vp8_subtract_b(BLOCK *be, BLOCKD *bd, int pitch); +void vp8_subtract_mbuv(short *diff, unsigned char *usrc, unsigned char *vsrc, + int src_stride, unsigned char *upred, + unsigned char *vpred, int pred_stride); +void vp8_subtract_mby(short *diff, unsigned char *src, int src_stride, + unsigned char *pred, int pred_stride); + void vp8_build_dcblock(MACROBLOCK *b); void vp8_transform_mb(MACROBLOCK *mb); void vp8_transform_mbuv(MACROBLOCK *x); diff --git a/vp8/encoder/x86/subtract_mmx.asm b/vp8/encoder/x86/subtract_mmx.asm deleted file mode 100644 index 794dd22..0000000 --- a/vp8/encoder/x86/subtract_mmx.asm +++ /dev/null @@ -1,223 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - -%include "vpx_ports/x86_abi_support.asm" - -;void vp8_subtract_b_mmx_impl(unsigned char *z, int src_stride, -; short *diff, unsigned char *Predictor, -; int pitch); -global sym(vp8_subtract_b_mmx_impl) PRIVATE -sym(vp8_subtract_b_mmx_impl): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - - - mov rdi, arg(2) ;diff - mov rax, arg(3) ;Predictor - mov rsi, arg(0) ;z - movsxd rdx, dword ptr arg(1);src_stride; - movsxd rcx, dword ptr arg(4);pitch - pxor mm7, mm7 - - movd mm0, [rsi] - movd mm1, [rax] - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - psubw mm0, mm1 - movq [rdi], mm0 - - - movd mm0, [rsi+rdx] - movd mm1, [rax+rcx] - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - psubw mm0, mm1 - movq [rdi+rcx*2],mm0 - - - movd mm0, [rsi+rdx*2] - movd mm1, [rax+rcx*2] - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - psubw mm0, mm1 - movq [rdi+rcx*4], mm0 - - lea rsi, [rsi+rdx*2] - lea rcx, [rcx+rcx*2] - - - - movd mm0, [rsi+rdx] - movd mm1, [rax+rcx] - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - psubw mm0, mm1 - movq [rdi+rcx*2], mm0 - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -;void vp8_subtract_mby_mmx(short *diff, unsigned char *src, int src_stride, -;unsigned char *pred, int pred_stride) -global sym(vp8_subtract_mby_mmx) PRIVATE -sym(vp8_subtract_mby_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - - mov rdi, arg(0) ;diff - mov rsi, arg(1) ;src - movsxd rdx, dword ptr arg(2);src_stride - mov rax, arg(3) ;pred - push rbx - movsxd rbx, dword ptr arg(4);pred_stride - - pxor mm0, mm0 - mov rcx, 16 - - -.submby_loop: - movq mm1, [rsi] - movq mm3, [rax] - - movq mm2, mm1 - movq mm4, mm3 - - punpcklbw mm1, mm0 - punpcklbw mm3, mm0 - - punpckhbw mm2, mm0 - punpckhbw mm4, mm0 - - psubw mm1, mm3 - psubw mm2, mm4 - - movq [rdi], mm1 - movq [rdi+8], mm2 - - movq mm1, [rsi+8] - movq mm3, [rax+8] - - movq mm2, mm1 - movq mm4, mm3 - - punpcklbw mm1, mm0 - punpcklbw mm3, mm0 - - 
punpckhbw mm2, mm0 - punpckhbw mm4, mm0 - - psubw mm1, mm3 - psubw mm2, mm4 - - movq [rdi+16], mm1 - movq [rdi+24], mm2 - add rdi, 32 - lea rax, [rax+rbx] - lea rsi, [rsi+rdx] - dec rcx - jnz .submby_loop - - pop rbx - pop rdi - pop rsi - ; begin epilog - UNSHADOW_ARGS - pop rbp - ret - - -;vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, -; int src_stride, unsigned char *upred, -; unsigned char *vpred, int pred_stride) - -global sym(vp8_subtract_mbuv_mmx) PRIVATE -sym(vp8_subtract_mbuv_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - push rsi - push rdi - ; end prolog - - mov rdi, arg(0) ;diff - mov rsi, arg(1) ;usrc - movsxd rdx, dword ptr arg(3);src_stride; - mov rax, arg(4) ;upred - add rdi, 256*2 ;diff = diff + 256 (shorts) - mov rcx, 8 - push rbx - movsxd rbx, dword ptr arg(6);pred_stride - - pxor mm7, mm7 - -.submbu_loop: - movq mm0, [rsi] - movq mm1, [rax] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi], mm0 - movq [rdi+8], mm3 - add rdi, 16 - add rsi, rdx - add rax, rbx - - dec rcx - jnz .submbu_loop - - mov rsi, arg(2) ;vsrc - mov rax, arg(5) ;vpred - mov rcx, 8 - -.submbv_loop: - movq mm0, [rsi] - movq mm1, [rax] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi], mm0 - movq [rdi+8], mm3 - add rdi, 16 - add rsi, rdx - add rax, rbx - - dec rcx - jnz .submbv_loop - - pop rbx - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret diff --git a/vp8/encoder/x86/subtract_sse2.asm b/vp8/encoder/x86/subtract_sse2.asm deleted file mode 100644 index a5d17f5..0000000 --- a/vp8/encoder/x86/subtract_sse2.asm +++ /dev/null @@ -1,245 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. 
-; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -;void vp8_subtract_b_sse2_impl(unsigned char *z, int src_stride, -; short *diff, unsigned char *Predictor, -; int pitch); -global sym(vp8_subtract_b_sse2_impl) PRIVATE -sym(vp8_subtract_b_sse2_impl): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rdi, arg(2) ;diff - mov rax, arg(3) ;Predictor - mov rsi, arg(0) ;z - movsxd rdx, dword ptr arg(1);src_stride; - movsxd rcx, dword ptr arg(4);pitch - pxor mm7, mm7 - - movd mm0, [rsi] - movd mm1, [rax] - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - psubw mm0, mm1 - movq MMWORD PTR [rdi], mm0 - - movd mm0, [rsi+rdx] - movd mm1, [rax+rcx] - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - psubw mm0, mm1 - movq MMWORD PTR [rdi+rcx*2], mm0 - - movd mm0, [rsi+rdx*2] - movd mm1, [rax+rcx*2] - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - psubw mm0, mm1 - movq MMWORD PTR [rdi+rcx*4], mm0 - - lea rsi, [rsi+rdx*2] - lea rcx, [rcx+rcx*2] - - movd mm0, [rsi+rdx] - movd mm1, [rax+rcx] - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - psubw mm0, mm1 - movq MMWORD PTR [rdi+rcx*2], mm0 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;void vp8_subtract_mby_sse2(short *diff, unsigned char *src, int src_stride, -;unsigned char *pred, int pred_stride) -global sym(vp8_subtract_mby_sse2) PRIVATE -sym(vp8_subtract_mby_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rdi, arg(0) ;diff - mov rsi, arg(1) ;src - movsxd rdx, dword ptr arg(2);src_stride - mov rax, arg(3) ;pred - movdqa xmm4, [GLOBAL(t80)] - push rbx - mov rcx, 
8 ; do two lines at one time - movsxd rbx, dword ptr arg(4);pred_stride - -.submby_loop: - movdqa xmm0, [rsi] ; src - movdqa xmm1, [rax] ; pred - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 - - pxor xmm1, xmm4 ;convert to signed values - pxor xmm2, xmm4 - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm1 ; put sign back to subtraction - - movdqa xmm3, [rsi + rdx] - movdqa xmm5, [rax + rbx] - - lea rsi, [rsi+rdx*2] - lea rax, [rax+rbx*2] - - movdqa [rdi], xmm0 - movdqa [rdi +16], xmm2 - - movdqa xmm1, xmm3 - psubb xmm3, xmm5 - - pxor xmm5, xmm4 ;convert to signed values - pxor xmm1, xmm4 - pcmpgtb xmm5, xmm1 ; obtain sign information - - movdqa xmm1, xmm3 - punpcklbw xmm3, xmm5 ; put sign back to subtraction - punpckhbw xmm1, xmm5 ; put sign back to subtraction - - movdqa [rdi +32], xmm3 - movdqa [rdi +48], xmm1 - - add rdi, 64 - dec rcx - jnz .submby_loop - - pop rbx - pop rdi - pop rsi - ; begin epilog - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - -;vp8_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc, -; int src_stride, unsigned char *upred, -; unsigned char *vpred, int pred_stride) -global sym(vp8_subtract_mbuv_sse2) PRIVATE -sym(vp8_subtract_mbuv_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - movdqa xmm4, [GLOBAL(t80)] - mov rdi, arg(0) ;diff - mov rsi, arg(1) ;usrc - movsxd rdx, dword ptr arg(3);src_stride; - mov rax, arg(4) ;upred - add rdi, 256*2 ;diff = diff + 256 (shorts) - mov rcx, 4 - push rbx - movsxd rbx, dword ptr arg(6);pred_stride - - ;u -.submbu_loop: - movq xmm0, [rsi] ; src - movq xmm2, [rsi+rdx] ; src -- next line - movq xmm1, [rax] ; pred - movq xmm3, [rax+rbx] ; pred -- next line - lea rsi, [rsi + rdx*2] - lea rax, [rax + rbx*2] - - punpcklqdq xmm0, xmm2 - punpcklqdq xmm1, xmm3 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, 
xmm4 ;convert to signed values - pxor xmm2, xmm4 - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa [rdi], xmm0 ; store difference - movdqa [rdi +16], xmm2 ; store difference - add rdi, 32 - sub rcx, 1 - jnz .submbu_loop - - mov rsi, arg(2) ;vsrc - mov rax, arg(5) ;vpred - mov rcx, 4 - - ;v -.submbv_loop: - movq xmm0, [rsi] ; src - movq xmm2, [rsi+rdx] ; src -- next line - movq xmm1, [rax] ; pred - movq xmm3, [rax+rbx] ; pred -- next line - lea rsi, [rsi + rdx*2] - lea rax, [rax + rbx*2] - - punpcklqdq xmm0, xmm2 - punpcklqdq xmm1, xmm3 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, xmm4 ;convert to signed values - pxor xmm2, xmm4 - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa [rdi], xmm0 ; store difference - movdqa [rdi +16], xmm2 ; store difference - add rdi, 32 - sub rcx, 1 - jnz .submbv_loop - - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - -SECTION_RODATA -align 16 -t80: - times 16 db 0x80 diff --git a/vp8/encoder/x86/vp8_enc_stubs_mmx.c b/vp8/encoder/x86/vp8_enc_stubs_mmx.c index cf3d8ca..7bf5155 100644 --- a/vp8/encoder/x86/vp8_enc_stubs_mmx.c +++ b/vp8/encoder/x86/vp8_enc_stubs_mmx.c @@ -65,14 +65,3 @@ int vp8_mbuverror_mmx(MACROBLOCK *mb) return vp8_mbuverror_mmx_impl(s_ptr, d_ptr); } -void vp8_subtract_b_mmx_impl(unsigned char *z, int src_stride, - short *diff, unsigned char *predictor, - int pitch); -void vp8_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch) -{ - unsigned char *z = *(be->base_src) + be->src; - unsigned int src_stride = be->src_stride; - short *diff = &be->src_diff[0]; - unsigned char *predictor = &bd->predictor[0]; - 
vp8_subtract_b_mmx_impl(z, src_stride, diff, predictor, pitch); -} diff --git a/vp8/encoder/x86/vp8_enc_stubs_sse2.c b/vp8/encoder/x86/vp8_enc_stubs_sse2.c index 3dfbee3..be9aaf3 100644 --- a/vp8/encoder/x86/vp8_enc_stubs_sse2.c +++ b/vp8/encoder/x86/vp8_enc_stubs_sse2.c @@ -30,14 +30,3 @@ int vp8_mbuverror_xmm(MACROBLOCK *mb) return vp8_mbuverror_xmm_impl(s_ptr, d_ptr); } -void vp8_subtract_b_sse2_impl(unsigned char *z, int src_stride, - short *diff, unsigned char *predictor, - int pitch); -void vp8_subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch) -{ - unsigned char *z = *(be->base_src) + be->src; - unsigned int src_stride = be->src_stride; - short *diff = &be->src_diff[0]; - unsigned char *predictor = &bd->predictor[0]; - vp8_subtract_b_sse2_impl(z, src_stride, diff, predictor, pitch); -} diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk index 5e4ef05..99d40ec 100644 --- a/vp8/vp8cx.mk +++ b/vp8/vp8cx.mk @@ -82,7 +82,6 @@ VP8_CX_SRCS_REMOVE-yes += encoder/temporal_filter.c endif VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/dct_mmx.asm -VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/subtract_mmx.asm VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp8_enc_stubs_mmx.c VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm @@ -94,7 +93,6 @@ ifeq ($(CONFIG_TEMPORAL_DENOISING),yes) VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/denoising_sse2.c endif -VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp8_enc_stubs_sse2.c VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm diff --git a/vp8/vp8cx_arm.mk b/vp8/vp8cx_arm.mk index 0b0f6a7..838b53d 100644 --- a/vp8/vp8cx_arm.mk +++ b/vp8/vp8cx_arm.mk @@ -25,5 +25,4 @@ VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/walsh_v6$(ASM) VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/denoising_neon.c VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/fastquantizeb_neon.c 
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/shortfdct_neon.c -VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/subtract_neon.c VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_shortwalsh4x4_neon.c