From 1ef86980b9e39ac2775e681f839cdeba77404357 Mon Sep 17 00:00:00 2001 From: Tero Rintaluoma Date: Thu, 10 Feb 2011 16:41:22 +0200 Subject: [PATCH] ARMv6 optimized sad16x16 Adds a new ARMv6 optimized function vp8_sad16x16_armv6 to encoder. Change-Id: Ibbd7edb8b25cb7a5b522d391b1e9a690fe150e57 --- vp8/encoder/arm/arm_csystemdependent.c | 4 +- vp8/encoder/arm/armv6/vp8_sad16x16_armv6.asm | 84 ++++++++++++++++++++++++++++ vp8/encoder/arm/variance_arm.h | 4 ++ vp8/vp8cx_arm.mk | 1 + 4 files changed, 91 insertions(+), 2 deletions(-) create mode 100644 vp8/encoder/arm/armv6/vp8_sad16x16_armv6.asm diff --git a/vp8/encoder/arm/arm_csystemdependent.c b/vp8/encoder/arm/arm_csystemdependent.c index 6c17a79..cec35d5 100644 --- a/vp8/encoder/arm/arm_csystemdependent.c +++ b/vp8/encoder/arm/arm_csystemdependent.c @@ -29,8 +29,8 @@ void vp8_arch_arm_encoder_init(VP8_COMP *cpi) #if HAVE_ARMV6 if (has_media) { - /*cpi->rtcd.variance.sad16x16 = vp8_sad16x16_c; - cpi->rtcd.variance.sad16x8 = vp8_sad16x8_c; + cpi->rtcd.variance.sad16x16 = vp8_sad16x16_armv6; + /*cpi->rtcd.variance.sad16x8 = vp8_sad16x8_c; cpi->rtcd.variance.sad8x16 = vp8_sad8x16_c; cpi->rtcd.variance.sad8x8 = vp8_sad8x8_c; cpi->rtcd.variance.sad4x4 = vp8_sad4x4_c;*/ diff --git a/vp8/encoder/arm/armv6/vp8_sad16x16_armv6.asm b/vp8/encoder/arm/armv6/vp8_sad16x16_armv6.asm new file mode 100644 index 0000000..c759f7c --- /dev/null +++ b/vp8/encoder/arm/armv6/vp8_sad16x16_armv6.asm @@ -0,0 +1,84 @@ +; +; Copyright (c) 2011 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + + EXPORT |vp8_sad16x16_armv6| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 const unsigned char *src_ptr +; r1 int src_stride +; r2 const unsigned char *ref_ptr +; r3 int ref_stride +; stack max_sad (not used) +|vp8_sad16x16_armv6| PROC + stmfd sp!, {r4-r12, lr} + mov r4, #0 ; sad = 0; + mov r5, #8 ; loop count + +loop + ; 1st row + ldr r6, [r0, #0x0] ; load 4 src pixels (1A) + ldr r8, [r2, #0x0] ; load 4 ref pixels (1A) + ldr r7, [r0, #0x4] ; load 4 src pixels (1A) + ldr r9, [r2, #0x4] ; load 4 ref pixels (1A) + ldr r10, [r0, #0x8] ; load 4 src pixels (1B) + ldr r11, [r0, #0xC] ; load 4 src pixels (1B) + + usada8 r4, r8, r6, r4 ; calculate sad for 4 pixels + usad8 r8, r7, r9 ; calculate sad for 4 pixels + + ldr r12, [r2, #0x8] ; load 4 ref pixels (1B) + ldr lr, [r2, #0xC] ; load 4 ref pixels (1B) + + add r0, r0, r1 ; set src pointer to next row + add r2, r2, r3 ; set ref pointer to next row + + usada8 r4, r10, r12, r4 ; calculate sad for 4 pixels + usada8 r8, r11, lr, r8 ; calculate sad for 4 pixels + + ldr r6, [r0, #0x0] ; load 4 src pixels (2A) + ldr r7, [r0, #0x4] ; load 4 src pixels (2A) + add r4, r4, r8 ; add partial sad values + + ; 2nd row + ldr r8, [r2, #0x0] ; load 4 ref pixels (2A) + ldr r9, [r2, #0x4] ; load 4 ref pixels (2A) + ldr r10, [r0, #0x8] ; load 4 src pixels (2B) + ldr r11, [r0, #0xC] ; load 4 src pixels (2B) + + usada8 r4, r6, r8, r4 ; calculate sad for 4 pixels + usad8 r8, r7, r9 ; calculate sad for 4 pixels + + ldr r12, [r2, #0x8] ; load 4 ref pixels (2B) + ldr lr, [r2, #0xC] ; load 4 ref pixels (2B) + + add r0, r0, r1 ; set src pointer to next row + add r2, r2, r3 ; set ref pointer to next row + + usada8 r4, r10, r12, r4 ; calculate sad for 4 pixels + usada8 r8, r11, lr, r8 ; calculate sad for 4 pixels + + subs r5, r5, #1 ; decrement loop counter + add r4, r4, r8 ; add partial sad values + + bne loop + + mov r0, r4 ; return sad + ldmfd sp!, {r4-r12, pc} + + ENDP + + END + diff --git 
a/vp8/encoder/arm/variance_arm.h b/vp8/encoder/arm/variance_arm.h index 06d7287..c807e29 100644 --- a/vp8/encoder/arm/variance_arm.h +++ b/vp8/encoder/arm/variance_arm.h @@ -14,11 +14,15 @@ #if HAVE_ARMV6 +extern prototype_sad(vp8_sad16x16_armv6); extern prototype_variance(vp8_variance16x16_armv6); extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_armv6); #if !CONFIG_RUNTIME_CPU_DETECT +#undef vp8_variance_sad16x16 +#define vp8_variance_sad16x16 vp8_sad16x16_armv6 + #undef vp8_variance_subpixvar16x16 #define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_armv6 diff --git a/vp8/vp8cx_arm.mk b/vp8/vp8cx_arm.mk index abc5dc8..7980a0f 100644 --- a/vp8/vp8cx_arm.mk +++ b/vp8/vp8cx_arm.mk @@ -34,6 +34,7 @@ VP8_CX_SRCS-$(HAVE_ARMV5TE) += encoder/arm/armv5te/vp8_packtokens_partitions_ar #File list for armv6 # encoder +VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_sad16x16_armv6$(ASM) VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_variance16x16_armv6$(ASM) VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/walsh_v6$(ASM) -- 2.7.4