From c9b25dcae4f492b558f2025d96664f4d4b2b42ea Mon Sep 17 00:00:00 2001 From: hkuang Date: Tue, 9 Jul 2013 12:06:21 -0700 Subject: [PATCH] Add neon optimize vp9_dc_only_idct_add. Change-Id: Iae84ab945cc9662a0ddd839aa2b9ca59f2ae5423 --- vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm | 69 +++++++++++++++++++++++ vp9/common/vp9_rtcd_defs.sh | 2 +- vp9/vp9_common.mk | 1 + 3 files changed, 71 insertions(+), 1 deletion(-) create mode 100644 vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm diff --git a/vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm b/vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm new file mode 100644 index 0000000..60a0d98 --- /dev/null +++ b/vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm @@ -0,0 +1,69 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+;
+
+
+    EXPORT  |vp9_dc_only_idct_add_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void vp9_dc_only_idct_add_neon(int input_dc, uint8_t *pred_ptr,
+;                               uint8_t *dst_ptr, int pitch, int stride)
+; Adds the rounded DC term to the 4x4 pred_ptr block, clips to 8 bits, stores to dst_ptr.
+; r0  int input_dc        (overwritten with the rounded DC term a1)
+; r1  uint8_t *pred_ptr   (advanced by pitch after each of the first three rows)
+; r2  uint8_t *dst_ptr    (advanced by stride after each of the first three rows)
+; r3  int pitch           (byte step between pred_ptr rows)
+; sp  int stride          (fifth argument, loaded from [sp]; byte step between dst_ptr rows)
+
+|vp9_dc_only_idct_add_neon| PROC
+
+    ; generate cospi_16_64 = 11585 (0x2d00 + 0x41 = 0x2d41)
+    mov             r12, #0x2d00
+    add             r12, #0x41
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    mul             r0, r0, r12               ; input_dc * cospi_16_64
+    add             r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr             r0, r0, #14               ; >> DCT_CONST_BITS
+
+    ; dct_const_round_shift(out * cospi_16_64)
+    mul             r0, r0, r12               ; out * cospi_16_64
+    add             r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr             r0, r0, #14               ; >> DCT_CONST_BITS
+
+    ; ROUND_POWER_OF_TWO(out, 4)
+    add             r0, r0, #8                ; + (1 <<((4) - 1))
+    asr             r0, r0, #4                ; >> 4
+
+    vdup.16         q0, r0                    ; duplicate a1 into all eight 16-bit lanes of q0
+    ldr             r12, [sp]                 ; load stride (r12 is no longer needed for cospi_16_64)
+
+    vld1.32         {d2[0]}, [r1], r3         ; d2[0] = 4 bytes of pred row 0, then r1 += pitch
+    vld1.32         {d2[1]}, [r1], r3         ; d2[1] = 4 bytes of pred row 1, then r1 += pitch
+    vld1.32         {d4[0]}, [r1], r3         ; d4[0] = 4 bytes of pred row 2, then r1 += pitch
+    vld1.32         {d4[1]}, [r1]             ; d4[1] = 4 bytes of pred row 3, r1 left unchanged
+
+    vaddw.u8        q1, q0, d2                ; a1 + pred_ptr[c], rows 0-1 (u8 widened to 16 bits)
+    vaddw.u8        q2, q0, d4                ; a1 + pred_ptr[c], rows 2-3 (u8 widened to 16 bits)
+
+    vqmovun.s16     d2, q1                    ; clip_pixel: saturate signed 16-bit sums to u8, rows 0-1
+    vqmovun.s16     d4, q2                    ; clip_pixel: saturate signed 16-bit sums to u8, rows 2-3
+
+    vst1.32         {d2[0]}, [r2], r12        ; store 4 bytes to dst row 0, then r2 += stride
+    vst1.32         {d2[1]}, [r2], r12        ; store 4 bytes to dst row 1, then r2 += stride
+    vst1.32         {d4[0]}, [r2], r12        ; store 4 bytes to dst row 2, then r2 += stride
+    vst1.32         {d4[1]}, [r2]             ; store 4 bytes to dst row 3, r2 left unchanged
+
+    bx              lr                        ; return to caller
+    ENDP             ; |vp9_dc_only_idct_add_neon|
+
+    END
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 56a2284..8d0eb51 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -338,7 +338,7 @@ specialize vp9_idct4_1d sse2
 # dct and add
 
 prototype void vp9_dc_only_idct_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride"
-specialize vp9_dc_only_idct_add sse2
+specialize vp9_dc_only_idct_add sse2 neon
 
 prototype void vp9_short_iwalsh4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride"
 specialize vp9_short_iwalsh4x4_1_add
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index ee744d5..75548c6 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -89,5 +89,6 @@ endif
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c
 
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_dc_only_idct_add_neon$(ASM)
 
 $(eval $(call rtcd_h_template,vp9_rtcd,vp9/common/vp9_rtcd_defs.sh))
-- 
2.7.4