From 56f5a9a060d4c89a71616a90207327e6c544f543 Mon Sep 17 00:00:00 2001 From: Johann Date: Fri, 23 Jul 2010 13:42:30 -0400 Subject: [PATCH] update arm idct functions Jeff Muizelaar posted some changes to the idct/reconstruction c code. This is the equivalent update for the arm assembly. This shows a good boost on v6, and a minor boost on neon. Here are some numbers for highway in qcif, 2641 frames: HEAD neon: ~161 fps new neon: ~162 fps HEAD v6: ~102 fps new v6: ~106 fps The following functions have been updated for armv6 and neon: vp8_dc_only_idct_add vp8_dequant_idct_add vp8_dequant_dc_idct_add Conflicts: vp8/decoder/arm/armv6/dequantdcidct_v6.asm vp8/decoder/arm/armv6/dequantidct_v6.asm Resolved by removing these files. When I rewrote the functions, I also moved the files to dequant_dc_idct_v6.asm/dequant_idct_v6.asm Change-Id: Ie3300df824d52474eca1a5134cf22d8b7809a5d4 --- vp8/common/arm/armv6/dc_only_idct_add_v6.asm | 67 +++++++ vp8/common/arm/armv6/idct_v6.asm | 32 --- vp8/common/arm/armv6/iwalsh_v6.asm | 16 +- vp8/common/arm/idct_arm.h | 16 +- vp8/common/arm/neon/dc_only_idct_add_neon.asm | 49 +++++ vp8/decoder/arm/armv6/dequant_dc_idct_v6.asm | 218 +++++++++++++++++++++ vp8/decoder/arm/armv6/dequant_idct_v6.asm | 196 ++++++++++++++++++ vp8/decoder/arm/armv6/dequantdcidct_v6.asm | 203 ------------------- vp8/decoder/arm/armv6/dequantidct_v6.asm | 184 ----------------- vp8/decoder/arm/dequantize_arm.h | 18 ++ ...antdcidct_neon.asm => dequant_dc_idct_neon.asm} | 72 +++---- ...{dequantidct_neon.asm => dequant_idct_neon.asm} | 67 +++---- vp8/decoder/decodframe.c | 6 +- vp8/decoder/dequantize.c | 23 ++- vp8/decoder/dequantize.h | 12 +- vp8/decoder/generic/dsystemdependent.c | 2 +- vp8/decoder/x86/dequantize_x86.h | 8 +- vp8/decoder/x86/x86_dsystemdependent.c | 2 +- vp8/vp8_common.mk | 3 + vp8/vp8dx_arm.mk | 8 +- 20 files changed, 675 insertions(+), 527 deletions(-) create mode 100644 vp8/common/arm/armv6/dc_only_idct_add_v6.asm create mode 100644 
vp8/common/arm/neon/dc_only_idct_add_neon.asm create mode 100644 vp8/decoder/arm/armv6/dequant_dc_idct_v6.asm create mode 100644 vp8/decoder/arm/armv6/dequant_idct_v6.asm delete mode 100644 vp8/decoder/arm/armv6/dequantdcidct_v6.asm delete mode 100644 vp8/decoder/arm/armv6/dequantidct_v6.asm rename vp8/decoder/arm/neon/{dequantdcidct_neon.asm => dequant_dc_idct_neon.asm} (65%) rename vp8/decoder/arm/neon/{dequantidct_neon.asm => dequant_idct_neon.asm} (66%) diff --git a/vp8/common/arm/armv6/dc_only_idct_add_v6.asm b/vp8/common/arm/armv6/dc_only_idct_add_v6.asm new file mode 100644 index 0000000..1922728 --- /dev/null +++ b/vp8/common/arm/armv6/dc_only_idct_add_v6.asm @@ -0,0 +1,67 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+; + + EXPORT |vp8_dc_only_idct_add_v6| + + AREA |.text|, CODE, READONLY +; Adds a1 = (input_dc+4)>>3 to every byte of the 4x4 block read from pred_ptr, saturates each sum to 0..255 and stores the block at dst_ptr. +;void vp8_dc_only_idct_add_v6(short input_dc, unsigned char *pred_ptr, +; unsigned char *dst_ptr, int pitch, int stride) +; r0 input_dc +; r1 pred_ptr (advanced by pitch per row) +; r2 dst_ptr (advanced by stride per row) +; r3 pitch +; sp stride (first stack argument) + +|vp8_dc_only_idct_add_v6| PROC + stmdb sp!, {r4 - r7, lr} ; pushes 5 registers, 20 bytes + + add r0, r0, #4 ; input_dc += 4 + ldr r12, c0x0000FFFF ; low-halfword mask + ldr r4, [r1], r3 ; pred row 0, pred_ptr += pitch + ldr r6, [r1], r3 ; pred row 1, pred_ptr += pitch + and r0, r12, r0, asr #3 ; a1 = (input_dc >> 3) & 0xFFFF + ldr lr, [sp, #20] ; stride, located above the 20 bytes pushed + orr r0, r0, r0, lsl #16 ; a1 | a1 + + uxtab16 r5, r0, r4 ; a1+2 | a1+0 + uxtab16 r4, r0, r4, ror #8 ; a1+3 | a1+1 + uxtab16 r7, r0, r6 ; same for row 1, bytes 2 and 0 + uxtab16 r6, r0, r6, ror #8 ; same for row 1, bytes 3 and 1 + usat16 r5, #8, r5 ; saturate both halfwords to 0..255 + usat16 r4, #8, r4 + usat16 r7, #8, r7 + usat16 r6, #8, r6 + orr r5, r5, r4, lsl #8 ; repack row 0 into four bytes + orr r7, r7, r6, lsl #8 ; repack row 1 into four bytes + ldr r4, [r1], r3 ; pred row 2, pred_ptr += pitch + ldr r6, [r1] ; pred row 3 + str r5, [r2], lr ; dst row 0, dst_ptr += stride + str r7, [r2], lr ; dst row 1, dst_ptr += stride + + uxtab16 r5, r0, r4 ; row 2, bytes 2 and 0 + uxtab16 r4, r0, r4, ror #8 ; row 2, bytes 3 and 1 + uxtab16 r7, r0, r6 ; row 3, bytes 2 and 0 + uxtab16 r6, r0, r6, ror #8 ; row 3, bytes 3 and 1 + usat16 r5, #8, r5 + usat16 r4, #8, r4 + usat16 r7, #8, r7 + usat16 r6, #8, r6 + orr r5, r5, r4, lsl #8 ; repack row 2 into four bytes + orr r7, r7, r6, lsl #8 ; repack row 3 into four bytes + str r5, [r2], lr ; dst row 2, dst_ptr += stride + str r7, [r2] ; dst row 3 + + ldmia sp!, {r4 - r7, pc} ; restore registers and return + + ENDP ; |vp8_dc_only_idct_add_v6| + +; Constant Pool +c0x0000FFFF DCD 0x0000FFFF + END diff --git a/vp8/common/arm/armv6/idct_v6.asm b/vp8/common/arm/armv6/idct_v6.asm index d9913c7..d96908c 100644 --- a/vp8/common/arm/armv6/idct_v6.asm +++ b/vp8/common/arm/armv6/idct_v6.asm @@ -15,8 +15,6 @@ EXPORT |vp8_short_idct4x4llm_v6_scott| EXPORT |vp8_short_idct4x4llm_v6_dual| - EXPORT |vp8_dc_only_idct_armv6| - AREA |.text|, CODE, READONLY ;******************************************************************************** @@ -344,34 +342,4 @@ loop2_dual ldmia sp!, {r4 - r11, pc} ; replace vars, return restore ENDP - -; sjl added 10/17/08 -;void dc_only_idct_armv6(short input_dc, short *output, int pitch) -|vp8_dc_only_idct_armv6| PROC - stmdb sp!, {r4 - r6, lr} - - add r0, r0, #0x4 - add r4, r1, r2 ; output + shortpitch - mov r0, r0, ASR 
#0x3 ;aka a1 - add r5, r1, r2, LSL #1 ; output + shortpitch * 2 - pkhbt r0, r0, r0, lsl #16 ; a1 | a1 - add r6, r5, r2 ; output + shortpitch * 3 - - str r0, [r1, #0] - str r0, [r1, #4] - - str r0, [r4, #0] - str r0, [r4, #4] - - str r0, [r5, #0] - str r0, [r5, #4] - - str r0, [r6, #0] - str r0, [r6, #4] - - - ldmia sp!, {r4 - r6, pc} - - ENDP ; |vp8_dc_only_idct_armv6| - END diff --git a/vp8/common/arm/armv6/iwalsh_v6.asm b/vp8/common/arm/armv6/iwalsh_v6.asm index f4002b2..cab6bc9 100644 --- a/vp8/common/arm/armv6/iwalsh_v6.asm +++ b/vp8/common/arm/armv6/iwalsh_v6.asm @@ -8,8 +8,8 @@ ; be found in the AUTHORS file in the root of the source tree. ; - EXPORT |vp8_short_inv_walsh4x4_armv6| - EXPORT |vp8_short_inv_walsh4x4_1_armv6| + EXPORT |vp8_short_inv_walsh4x4_v6| + EXPORT |vp8_short_inv_walsh4x4_1_v6| ARM REQUIRE8 @@ -17,8 +17,8 @@ AREA |.text|, CODE, READONLY ; name this block of code -;short vp8_short_inv_walsh4x4_armv6(short *input, short *output) -|vp8_short_inv_walsh4x4_armv6| PROC +;short vp8_short_inv_walsh4x4_v6(short *input, short *output) +|vp8_short_inv_walsh4x4_v6| PROC stmdb sp!, {r4 - r11, lr} @@ -123,11 +123,11 @@ str r5, [r1] ldmia sp!, {r4 - r11, pc} - ENDP ; |vp8_short_inv_walsh4x4_armv6| + ENDP ; |vp8_short_inv_walsh4x4_v6| -;short vp8_short_inv_walsh4x4_1_armv6(short *input, short *output) -|vp8_short_inv_walsh4x4_1_armv6| PROC +;short vp8_short_inv_walsh4x4_1_v6(short *input, short *output) +|vp8_short_inv_walsh4x4_1_v6| PROC ldrsh r2, [r0] ; [0] add r2, r2, #3 ; [0] + 3 @@ -145,7 +145,7 @@ str r2, [r1] bx lr - ENDP ; |vp8_short_inv_walsh4x4_1_armv6| + ENDP ; |vp8_short_inv_walsh4x4_1_v6| ; Constant Pool c0x00030003 DCD 0x00030003 diff --git a/vp8/common/arm/idct_arm.h b/vp8/common/arm/idct_arm.h index 97af32e..6d917c4 100644 --- a/vp8/common/arm/idct_arm.h +++ b/vp8/common/arm/idct_arm.h @@ -15,8 +15,9 @@ #if HAVE_ARMV6 extern prototype_idct(vp8_short_idct4x4llm_1_v6); extern prototype_idct(vp8_short_idct4x4llm_v6_dual); -extern 
prototype_second_order(vp8_short_inv_walsh4x4_1_armv6); -extern prototype_second_order(vp8_short_inv_walsh4x4_armv6); +extern prototype_idct_scalar_add(vp8_dc_only_idct_add_v6); +extern prototype_second_order(vp8_short_inv_walsh4x4_1_v6); +extern prototype_second_order(vp8_short_inv_walsh4x4_v6); #undef vp8_idct_idct1 #define vp8_idct_idct1 vp8_short_idct4x4llm_1_v6 @@ -24,16 +25,20 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_armv6); #undef vp8_idct_idct16 #define vp8_idct_idct16 vp8_short_idct4x4llm_v6_dual +#undef vp8_idct_idct1_scalar_add +#define vp8_idct_idct1_scalar_add vp8_dc_only_idct_add_v6 + #undef vp8_idct_iwalsh1 -#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_armv6 +#define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_v6 #undef vp8_idct_iwalsh16 -#define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_armv6 +#define vp8_idct_iwalsh16 vp8_short_inv_walsh4x4_v6 #endif #if HAVE_ARMV7 extern prototype_idct(vp8_short_idct4x4llm_1_neon); extern prototype_idct(vp8_short_idct4x4llm_neon); +extern prototype_idct_scalar_add(vp8_dc_only_idct_add_neon); extern prototype_second_order(vp8_short_inv_walsh4x4_1_neon); extern prototype_second_order(vp8_short_inv_walsh4x4_neon); @@ -43,6 +48,9 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_neon); #undef vp8_idct_idct16 #define vp8_idct_idct16 vp8_short_idct4x4llm_neon +#undef vp8_idct_idct1_scalar_add +#define vp8_idct_idct1_scalar_add vp8_dc_only_idct_add_neon + #undef vp8_idct_iwalsh1 #define vp8_idct_iwalsh1 vp8_short_inv_walsh4x4_1_neon diff --git a/vp8/common/arm/neon/dc_only_idct_add_neon.asm b/vp8/common/arm/neon/dc_only_idct_add_neon.asm new file mode 100644 index 0000000..e6f141f --- /dev/null +++ b/vp8/common/arm/neon/dc_only_idct_add_neon.asm @@ -0,0 +1,49 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. 
All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + +; NEON version: adds a1 = (input_dc+4)>>3 to every byte of the 4x4 block read from pred_ptr, saturates to unsigned 8 bits and stores the block at dst_ptr. + EXPORT |vp8_dc_only_idct_add_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +;void vp8_dc_only_idct_add_neon(short input_dc, unsigned char *pred_ptr, +; unsigned char *dst_ptr, int pitch, int stride) +; r0 input_dc +; r1 pred_ptr (advanced by pitch per row) +; r2 dst_ptr (advanced by stride per row) +; r3 pitch +; sp stride (first stack argument; nothing is pushed, so it is read at [sp]) +|vp8_dc_only_idct_add_neon| PROC + add r0, r0, #4 ; input_dc += 4 + asr r0, r0, #3 ; a1 = input_dc >> 3 + ldr r12, [sp] ; stride + vdup.16 q0, r0 ; a1 in all eight 16-bit lanes + + vld1.32 {d2[0]}, [r1], r3 ; pred row 0, pred_ptr += pitch + vld1.32 {d2[1]}, [r1], r3 ; pred row 1, pred_ptr += pitch + vld1.32 {d4[0]}, [r1], r3 ; pred row 2, pred_ptr += pitch + vld1.32 {d4[1]}, [r1] ; pred row 3 + + vaddw.u8 q1, q0, d2 ; rows 0-1: widen pred bytes to 16 bits and add a1 + vaddw.u8 q2, q0, d4 ; rows 2-3: widen pred bytes to 16 bits and add a1 + + vqmovun.s16 d2, q1 ; saturating narrow to unsigned 8 bits + vqmovun.s16 d4, q2 ; saturating narrow to unsigned 8 bits + + vst1.32 {d2[0]}, [r2], r12 ; dst row 0, dst_ptr += stride + vst1.32 {d2[1]}, [r2], r12 ; dst row 1, dst_ptr += stride + vst1.32 {d4[0]}, [r2], r12 ; dst row 2, dst_ptr += stride + vst1.32 {d4[1]}, [r2] ; dst row 3 + + bx lr + + ENDP ; |vp8_dc_only_idct_add_neon| + END diff --git a/vp8/decoder/arm/armv6/dequant_dc_idct_v6.asm b/vp8/decoder/arm/armv6/dequant_dc_idct_v6.asm new file mode 100644 index 0000000..886873c --- /dev/null +++ b/vp8/decoder/arm/armv6/dequant_dc_idct_v6.asm @@ -0,0 +1,218 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
+; + + + EXPORT |vp8_dequant_dc_idct_add_v6| + + AREA |.text|, CODE, READONLY + +;void vp8_dequant_dc_idct_add_v6(short *input, short *dq, unsigned char *pred, +; unsigned char *dest, int pitch, int stride, int Dc) +; r0 = input +; r1 = dq +; r2 = pred +; r3 = dest +; sp + 36 = pitch ; +4 = 40 +; sp + 40 = stride ; +4 = 44 +; sp + 44 = Dc ; +4 = 48 + + +|vp8_dequant_dc_idct_add_v6| PROC + stmdb sp!, {r4-r11, lr} + + ldr r6, [sp, #44] + + ldr r4, [r0] ;input + ldr r5, [r1], #4 ;dq + + sub sp, sp, #4 + str r3, [sp] + + smultt r7, r4, r5 + + ldr r4, [r0, #4] ;input + ldr r5, [r1], #4 ;dq + + strh r6, [r0], #2 + strh r7, [r0], #2 + + smulbb r6, r4, r5 + smultt r7, r4, r5 + + ldr r4, [r0, #4] ;input + ldr r5, [r1], #4 ;dq + + strh r6, [r0], #2 + strh r7, [r0], #2 + + mov r12, #3 + +vp8_dequant_dc_add_loop + smulbb r6, r4, r5 + smultt r7, r4, r5 + + ldr r4, [r0, #4] ;input + ldr r5, [r1], #4 ;dq + + strh r6, [r0], #2 + strh r7, [r0], #2 + + smulbb r6, r4, r5 + smultt r7, r4, r5 + + subs r12, r12, #1 + + ldrne r4, [r0, #4] + ldrne r5, [r1], #4 + + strh r6, [r0], #2 + strh r7, [r0], #2 + + bne vp8_dequant_dc_add_loop + + sub r0, r0, #32 + mov r1, r0 + +; short_idct4x4llm_v6_dual + ldr r3, cospi8sqrt2minus1 + ldr r4, sinpi8sqrt2 + ldr r6, [r0, #8] + mov r5, #2 +vp8_dequant_dc_idct_loop1_v6 + ldr r12, [r0, #24] + ldr r14, [r0, #16] + smulwt r9, r3, r6 + smulwb r7, r3, r6 + smulwt r10, r4, r6 + smulwb r8, r4, r6 + pkhbt r7, r7, r9, lsl #16 + smulwt r11, r3, r12 + pkhbt r8, r8, r10, lsl #16 + uadd16 r6, r6, r7 + smulwt r7, r4, r12 + smulwb r9, r3, r12 + smulwb r10, r4, r12 + subs r5, r5, #1 + pkhbt r9, r9, r11, lsl #16 + ldr r11, [r0], #4 + pkhbt r10, r10, r7, lsl #16 + uadd16 r7, r12, r9 + usub16 r7, r8, r7 + uadd16 r6, r6, r10 + uadd16 r10, r11, r14 + usub16 r8, r11, r14 + uadd16 r9, r10, r6 + usub16 r10, r10, r6 + uadd16 r6, r8, r7 + usub16 r7, r8, r7 + str r6, [r1, #8] + ldrne r6, [r0, #8] + str r7, [r1, #16] + str r10, [r1, #24] + str r9, [r1], #4 + bne 
vp8_dequant_dc_idct_loop1_v6 + + mov r5, #2 + sub r0, r1, #8 +vp8_dequant_dc_idct_loop2_v6 + ldr r6, [r0], #4 + ldr r7, [r0], #4 + ldr r8, [r0], #4 + ldr r9, [r0], #4 + smulwt r1, r3, r6 + smulwt r12, r4, r6 + smulwt lr, r3, r8 + smulwt r10, r4, r8 + pkhbt r11, r8, r6, lsl #16 + pkhbt r1, lr, r1, lsl #16 + pkhbt r12, r10, r12, lsl #16 + pkhtb r6, r6, r8, asr #16 + uadd16 r6, r1, r6 + pkhbt lr, r9, r7, lsl #16 + uadd16 r10, r11, lr + usub16 lr, r11, lr + pkhtb r8, r7, r9, asr #16 + subs r5, r5, #1 + smulwt r1, r3, r8 + smulwb r7, r3, r8 + smulwt r11, r4, r8 + smulwb r9, r4, r8 + pkhbt r1, r7, r1, lsl #16 + uadd16 r8, r1, r8 + pkhbt r11, r9, r11, lsl #16 + usub16 r1, r12, r8 + uadd16 r8, r11, r6 + ldr r9, c0x00040004 + ldr r12, [sp, #40] + uadd16 r6, r10, r8 + usub16 r7, r10, r8 + uadd16 r7, r7, r9 + uadd16 r6, r6, r9 + uadd16 r10, r14, r1 + usub16 r1, r14, r1 + uadd16 r10, r10, r9 + uadd16 r1, r1, r9 + ldr r11, [r2], r12 + mov r8, r7, asr #3 + pkhtb r9, r8, r10, asr #19 + mov r8, r1, asr #3 + pkhtb r8, r8, r6, asr #19 + uxtb16 lr, r11, ror #8 + qadd16 r9, r9, lr + uxtb16 lr, r11 + qadd16 r8, r8, lr + usat16 r9, #8, r9 + usat16 r8, #8, r8 + orr r9, r8, r9, lsl #8 + ldr r11, [r2], r12 + ldr lr, [sp] + ldr r12, [sp, #44] + mov r7, r7, lsl #16 + mov r1, r1, lsl #16 + mov r10, r10, lsl #16 + mov r6, r6, lsl #16 + mov r7, r7, asr #3 + pkhtb r7, r7, r10, asr #19 + mov r1, r1, asr #3 + pkhtb r1, r1, r6, asr #19 + uxtb16 r8, r11, ror #8 + qadd16 r7, r7, r8 + uxtb16 r8, r11 + qadd16 r1, r1, r8 + usat16 r7, #8, r7 + usat16 r1, #8, r1 + orr r1, r1, r7, lsl #8 + str r9, [lr], r12 + str r1, [lr], r12 + str lr, [sp] + bne vp8_dequant_dc_idct_loop2_v6 + +; vpx_memset + sub r0, r0, #32 + add sp, sp, #4 + + mov r12, #0 + str r12, [r0] + str r12, [r0, #4] + str r12, [r0, #8] + str r12, [r0, #12] + str r12, [r0, #16] + str r12, [r0, #20] + str r12, [r0, #24] + str r12, [r0, #28] + + ldmia sp!, {r4 - r11, pc} + ENDP ; |vp8_dequant_dc_idct_add_v6| + +; Constant Pool +cospi8sqrt2minus1 
DCD 0x00004E7B +sinpi8sqrt2 DCD 0x00008A8C +c0x00040004 DCD 0x00040004 + + END diff --git a/vp8/decoder/arm/armv6/dequant_idct_v6.asm b/vp8/decoder/arm/armv6/dequant_idct_v6.asm new file mode 100644 index 0000000..c13b512 --- /dev/null +++ b/vp8/decoder/arm/armv6/dequant_idct_v6.asm @@ -0,0 +1,196 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + EXPORT |vp8_dequant_idct_add_v6| + + AREA |.text|, CODE, READONLY +;void vp8_dequant_idct_add_v6(short *input, short *dq, unsigned char *pred, +; unsigned char *dest, int pitch, int stride) +; r0 = input +; r1 = dq +; r2 = pred +; r3 = dest +; sp + 36 = pitch ; +4 = 40 +; sp + 40 = stride ; +4 = 44 + + +|vp8_dequant_idct_add_v6| PROC + stmdb sp!, {r4-r11, lr} + + ldr r4, [r0] ;input + ldr r5, [r1], #4 ;dq + + sub sp, sp, #4 + str r3, [sp] + + mov r12, #4 + +vp8_dequant_add_loop + smulbb r6, r4, r5 + smultt r7, r4, r5 + + ldr r4, [r0, #4] ;input + ldr r5, [r1], #4 ;dq + + strh r6, [r0], #2 + strh r7, [r0], #2 + + smulbb r6, r4, r5 + smultt r7, r4, r5 + + subs r12, r12, #1 + + ldrne r4, [r0, #4] + ldrne r5, [r1], #4 + + strh r6, [r0], #2 + strh r7, [r0], #2 + + bne vp8_dequant_add_loop + + sub r0, r0, #32 + mov r1, r0 + +; short_idct4x4llm_v6_dual + ldr r3, cospi8sqrt2minus1 + ldr r4, sinpi8sqrt2 + ldr r6, [r0, #8] + mov r5, #2 +vp8_dequant_idct_loop1_v6 + ldr r12, [r0, #24] + ldr r14, [r0, #16] + smulwt r9, r3, r6 + smulwb r7, r3, r6 + smulwt r10, r4, r6 + smulwb r8, r4, r6 + pkhbt r7, r7, r9, lsl #16 + smulwt r11, r3, r12 + pkhbt r8, r8, r10, lsl #16 + uadd16 r6, r6, r7 + smulwt r7, r4, r12 + smulwb r9, r3, r12 + smulwb r10, r4, r12 + subs r5, r5, #1 + pkhbt r9, r9, r11, lsl #16 + ldr r11, [r0], #4 + pkhbt r10, r10, r7, lsl #16 + uadd16 r7, 
r12, r9 + usub16 r7, r8, r7 + uadd16 r6, r6, r10 + uadd16 r10, r11, r14 + usub16 r8, r11, r14 + uadd16 r9, r10, r6 + usub16 r10, r10, r6 + uadd16 r6, r8, r7 + usub16 r7, r8, r7 + str r6, [r1, #8] + ldrne r6, [r0, #8] + str r7, [r1, #16] + str r10, [r1, #24] + str r9, [r1], #4 + bne vp8_dequant_idct_loop1_v6 + + mov r5, #2 + sub r0, r1, #8 +vp8_dequant_idct_loop2_v6 + ldr r6, [r0], #4 + ldr r7, [r0], #4 + ldr r8, [r0], #4 + ldr r9, [r0], #4 + smulwt r1, r3, r6 + smulwt r12, r4, r6 + smulwt lr, r3, r8 + smulwt r10, r4, r8 + pkhbt r11, r8, r6, lsl #16 + pkhbt r1, lr, r1, lsl #16 + pkhbt r12, r10, r12, lsl #16 + pkhtb r6, r6, r8, asr #16 + uadd16 r6, r1, r6 + pkhbt lr, r9, r7, lsl #16 + uadd16 r10, r11, lr + usub16 lr, r11, lr + pkhtb r8, r7, r9, asr #16 + subs r5, r5, #1 + smulwt r1, r3, r8 + smulwb r7, r3, r8 + smulwt r11, r4, r8 + smulwb r9, r4, r8 + pkhbt r1, r7, r1, lsl #16 + uadd16 r8, r1, r8 + pkhbt r11, r9, r11, lsl #16 + usub16 r1, r12, r8 + uadd16 r8, r11, r6 + ldr r9, c0x00040004 + ldr r12, [sp, #40] + uadd16 r6, r10, r8 + usub16 r7, r10, r8 + uadd16 r7, r7, r9 + uadd16 r6, r6, r9 + uadd16 r10, r14, r1 + usub16 r1, r14, r1 + uadd16 r10, r10, r9 + uadd16 r1, r1, r9 + ldr r11, [r2], r12 + mov r8, r7, asr #3 + pkhtb r9, r8, r10, asr #19 + mov r8, r1, asr #3 + pkhtb r8, r8, r6, asr #19 + uxtb16 lr, r11, ror #8 + qadd16 r9, r9, lr + uxtb16 lr, r11 + qadd16 r8, r8, lr + usat16 r9, #8, r9 + usat16 r8, #8, r8 + orr r9, r8, r9, lsl #8 + ldr r11, [r2], r12 + ldr lr, [sp] + ldr r12, [sp, #44] + mov r7, r7, lsl #16 + mov r1, r1, lsl #16 + mov r10, r10, lsl #16 + mov r6, r6, lsl #16 + mov r7, r7, asr #3 + pkhtb r7, r7, r10, asr #19 + mov r1, r1, asr #3 + pkhtb r1, r1, r6, asr #19 + uxtb16 r8, r11, ror #8 + qadd16 r7, r7, r8 + uxtb16 r8, r11 + qadd16 r1, r1, r8 + usat16 r7, #8, r7 + usat16 r1, #8, r1 + orr r1, r1, r7, lsl #8 + str r9, [lr], r12 + str r1, [lr], r12 + str lr, [sp] + bne vp8_dequant_idct_loop2_v6 + +; vpx_memset + sub r0, r0, #32 + add sp, sp, #4 + + mov 
r12, #0 + str r12, [r0] + str r12, [r0, #4] + str r12, [r0, #8] + str r12, [r0, #12] + str r12, [r0, #16] + str r12, [r0, #20] + str r12, [r0, #24] + str r12, [r0, #28] + + ldmia sp!, {r4 - r11, pc} + ENDP ; |vp8_dequant_idct_add_v6| + +; Constant Pool +cospi8sqrt2minus1 DCD 0x00004E7B +sinpi8sqrt2 DCD 0x00008A8C +c0x00040004 DCD 0x00040004 + + END diff --git a/vp8/decoder/arm/armv6/dequantdcidct_v6.asm b/vp8/decoder/arm/armv6/dequantdcidct_v6.asm deleted file mode 100644 index 0252872..0000000 --- a/vp8/decoder/arm/armv6/dequantdcidct_v6.asm +++ /dev/null @@ -1,203 +0,0 @@ -; -; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_dequant_dc_idct_v6| - ; ARM - ; REQUIRE8 - ; PRESERVE8 - - AREA |.text|, CODE, READONLY ; name this block of code -;void vp8_dequant_dc_idct_v6(short *input, short *dq, short *output, int pitch,int Dc) -|vp8_dequant_dc_idct_v6| PROC - stmdb sp!, {r4-r11, lr} - - ldr r6, [sp, #36] ;load Dc - - ldr r4, [r0] ;input - ldr r5, [r1], #4 ;dq - - sub sp, sp, #4 - str r0, [sp] - - smultt r7, r4, r5 - - ldr r4, [r0, #4] ;input - ldr r5, [r1], #4 ;dq - - strh r6, [r0], #2 - strh r7, [r0], #2 - - smulbb r6, r4, r5 - smultt r7, r4, r5 - - ldr r4, [r0, #4] ;input - ldr r5, [r1], #4 ;dq - - strh r6, [r0], #2 - strh r7, [r0], #2 - - mov r12, #3 - -dequant_dc_idct_loop - smulbb r6, r4, r5 - smultt r7, r4, r5 - - ldr r4, [r0, #4] ;input - ldr r5, [r1], #4 ;dq - - strh r6, [r0], #2 - strh r7, [r0], #2 - - smulbb r6, r4, r5 - smultt r7, r4, r5 - - subs r12, r12, #1 - - ldrne r4, [r0, #4] - ldrne r5, [r1], #4 - - strh r6, [r0], #2 - strh r7, [r0], #2 - - bne dequant_dc_idct_loop - - sub r0, r0, #32 - mov 
r1, r2 - mov r2, r3 - -; short_idct4x4llm_v6_dual - - mov r3, #0x00004E00 ; cos - orr r3, r3, #0x0000007B ; cospi8sqrt2minus1 - mov r4, #0x00008A00 ; sin - orr r4, r4, #0x0000008C ; sinpi8sqrt2 - mov r5, #0x2 ; i=2 i -loop1_dual_11 - ldr r6, [r0, #(4*2)] ; i5 | i4 5|4 - ldr r12, [r0, #(12*2)] ; i13 | i12 13|12 - ldr r14, [r0, #(8*2)] ; i9 | i8 9|8 - - smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c - smulwb r7, r3, r6 ; (ip[4] * cospi8sqrt2minus1) >> 16 4c - smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s - smulwb r8, r4, r6 ; (ip[4] * sinpi8sqrt2) >> 16 4s - pkhbt r7, r7, r9, lsl #16 ; 5c | 4c - smulwt r11, r3, r12 ; (ip[13] * cospi8sqrt2minus1) >> 16 13c - pkhbt r8, r8, r10, lsl #16 ; 5s | 4s - uadd16 r6, r6, r7 ; 5c+5 | 4c+4 - smulwt r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16 13s - smulwb r9, r3, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16 12c - smulwb r10, r4, r12 ; (ip[12] * sinpi8sqrt2) >> 16 12s - subs r5, r5, #0x1 ; i-- -- - pkhbt r9, r9, r11, lsl #16 ; 13c | 12c - ldr r11, [r0], #0x4 ; i1 | i0 ++ 1|0 - pkhbt r10, r10, r7, lsl #16 ; 13s | 12s - uadd16 r7, r12, r9 ; 13c+13 | 12c+12 - usub16 r7, r8, r7 ; c c - uadd16 r6, r6, r10 ; d d - uadd16 r10, r11, r14 ; a a - usub16 r8, r11, r14 ; b b - uadd16 r9, r10, r6 ; a+d a+d - usub16 r10, r10, r6 ; a-d a-d - uadd16 r6, r8, r7 ; b+c b+c - usub16 r7, r8, r7 ; b-c b-c - str r6, [r1, r2] ; o5 | o4 - add r6, r2, r2 ; pitch * 2 p2 - str r7, [r1, r6] ; o9 | o8 - add r6, r6, r2 ; pitch * 3 p3 - str r10, [r1, r6] ; o13 | o12 - str r9, [r1], #0x4 ; o1 | o0 ++ - bne loop1_dual_11 ; - mov r5, #0x2 ; i=2 i - sub r0, r1, #8 ; reset input/output i/o -loop2_dual_22 - ldr r6, [r0, r2] ; i5 | i4 5|4 - ldr r1, [r0] ; i1 | i0 1|0 - ldr r12, [r0, #0x4] ; i3 | i2 3|2 - add r14, r2, #0x4 ; pitch + 2 p+2 - ldr r14, [r0, r14] ; i7 | i6 7|6 - smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c - smulwt r7, r3, r1 ; (ip[1] * cospi8sqrt2minus1) >> 16 1c - smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s - smulwt r8, r4, r1 
; (ip[1] * sinpi8sqrt2) >> 16 1s - pkhbt r11, r6, r1, lsl #16 ; i0 | i4 0|4 - pkhbt r7, r9, r7, lsl #16 ; 1c | 5c - pkhbt r8, r10, r8, lsl #16 ; 1s | 5s = temp1 © tc1 - pkhtb r1, r1, r6, asr #16 ; i1 | i5 1|5 - uadd16 r1, r7, r1 ; 1c+1 | 5c+5 = temp2 (d) td2 - pkhbt r9, r14, r12, lsl #16 ; i2 | i6 2|6 - uadd16 r10, r11, r9 ; a a - usub16 r9, r11, r9 ; b b - pkhtb r6, r12, r14, asr #16 ; i3 | i7 3|7 - subs r5, r5, #0x1 ; i-- -- - smulwt r7, r3, r6 ; (ip[3] * cospi8sqrt2minus1) >> 16 3c - smulwt r11, r4, r6 ; (ip[3] * sinpi8sqrt2) >> 16 3s - smulwb r12, r3, r6 ; (ip[7] * cospi8sqrt2minus1) >> 16 7c - smulwb r14, r4, r6 ; (ip[7] * sinpi8sqrt2) >> 16 7s - - pkhbt r7, r12, r7, lsl #16 ; 3c | 7c - pkhbt r11, r14, r11, lsl #16 ; 3s | 7s = temp1 (d) td1 - uadd16 r6, r7, r6 ; 3c+3 | 7c+7 = temp2 (c) tc2 - usub16 r12, r8, r6 ; c (o1 | o5) c - uadd16 r6, r11, r1 ; d (o3 | o7) d - uadd16 r7, r10, r6 ; a+d a+d - mov r8, #0x4 ; set up 4's 4 - orr r8, r8, #0x40000 ; 4|4 - usub16 r6, r10, r6 ; a-d a-d - uadd16 r6, r6, r8 ; a-d+4 3|7 - uadd16 r7, r7, r8 ; a+d+4 0|4 - uadd16 r10, r9, r12 ; b+c b+c - usub16 r1, r9, r12 ; b-c b-c - uadd16 r10, r10, r8 ; b+c+4 1|5 - uadd16 r1, r1, r8 ; b-c+4 2|6 - mov r8, r10, asr #19 ; o1 >> 3 - strh r8, [r0, #2] ; o1 - mov r8, r1, asr #19 ; o2 >> 3 - strh r8, [r0, #4] ; o2 - mov r8, r6, asr #19 ; o3 >> 3 - strh r8, [r0, #6] ; o3 - mov r8, r7, asr #19 ; o0 >> 3 - strh r8, [r0], r2 ; o0 +p - sxth r10, r10 ; - mov r8, r10, asr #3 ; o5 >> 3 - strh r8, [r0, #2] ; o5 - sxth r1, r1 ; - mov r8, r1, asr #3 ; o6 >> 3 - strh r8, [r0, #4] ; o6 - sxth r6, r6 ; - mov r8, r6, asr #3 ; o7 >> 3 - strh r8, [r0, #6] ; o7 - sxth r7, r7 ; - mov r8, r7, asr #3 ; o4 >> 3 - strh r8, [r0], r2 ; o4 +p -;;;;; subs r5, r5, #0x1 ; i-- -- - bne loop2_dual_22 ; - - -;vpx_memset - ldr r0, [sp] - add sp, sp, #4 - - mov r12, #0 - str r12, [r0] - str r12, [r0, #4] - str r12, [r0, #8] - str r12, [r0, #12] - str r12, [r0, #16] - str r12, [r0, #20] - str r12, [r0, #24] - str r12, [r0, 
#28] - - ldmia sp!, {r4 - r11, pc} ; replace vars, return restore - - ENDP ;|vp8_dequant_dc_idct_v68| - - END diff --git a/vp8/decoder/arm/armv6/dequantidct_v6.asm b/vp8/decoder/arm/armv6/dequantidct_v6.asm deleted file mode 100644 index 15e4c68..0000000 --- a/vp8/decoder/arm/armv6/dequantidct_v6.asm +++ /dev/null @@ -1,184 +0,0 @@ -; -; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_dequant_idct_v6| - ; ARM - ; REQUIRE8 - ; PRESERVE8 - - AREA |.text|, CODE, READONLY ; name this block of code -;void vp8_dequant_idct_v6(short *input, short *dq, short *output, int pitch) -|vp8_dequant_idct_v6| PROC - stmdb sp!, {r4-r11, lr} - - ldr r4, [r0] ;input - ldr r5, [r1], #4 ;dq - - sub sp, sp, #4 - str r0, [sp] - - mov r12, #4 - -dequant_idct_loop - smulbb r6, r4, r5 - smultt r7, r4, r5 - - ldr r4, [r0, #4] ;input - ldr r5, [r1], #4 ;dq - - strh r6, [r0], #2 - strh r7, [r0], #2 - - smulbb r6, r4, r5 - smultt r7, r4, r5 - - subs r12, r12, #1 - - ldrne r4, [r0, #4] - ldrne r5, [r1], #4 - - strh r6, [r0], #2 - strh r7, [r0], #2 - - bne dequant_idct_loop - - sub r0, r0, #32 - mov r1, r2 - mov r2, r3 - -; short_idct4x4llm_v6_dual - - mov r3, #0x00004E00 ; cos - orr r3, r3, #0x0000007B ; cospi8sqrt2minus1 - mov r4, #0x00008A00 ; sin - orr r4, r4, #0x0000008C ; sinpi8sqrt2 - mov r5, #0x2 ; i=2 i -loop1_dual_1 - ldr r6, [r0, #(4*2)] ; i5 | i4 5|4 - ldr r12, [r0, #(12*2)] ; i13 | i12 13|12 - ldr r14, [r0, #(8*2)] ; i9 | i8 9|8 - - smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c - smulwb r7, r3, r6 ; (ip[4] * cospi8sqrt2minus1) >> 16 4c - smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s - smulwb r8, r4, r6 ; 
(ip[4] * sinpi8sqrt2) >> 16 4s - pkhbt r7, r7, r9, lsl #16 ; 5c | 4c - smulwt r11, r3, r12 ; (ip[13] * cospi8sqrt2minus1) >> 16 13c - pkhbt r8, r8, r10, lsl #16 ; 5s | 4s - uadd16 r6, r6, r7 ; 5c+5 | 4c+4 - smulwt r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16 13s - smulwb r9, r3, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16 12c - smulwb r10, r4, r12 ; (ip[12] * sinpi8sqrt2) >> 16 12s - subs r5, r5, #0x1 ; i-- -- - pkhbt r9, r9, r11, lsl #16 ; 13c | 12c - ldr r11, [r0], #0x4 ; i1 | i0 ++ 1|0 - pkhbt r10, r10, r7, lsl #16 ; 13s | 12s - uadd16 r7, r12, r9 ; 13c+13 | 12c+12 - usub16 r7, r8, r7 ; c c - uadd16 r6, r6, r10 ; d d - uadd16 r10, r11, r14 ; a a - usub16 r8, r11, r14 ; b b - uadd16 r9, r10, r6 ; a+d a+d - usub16 r10, r10, r6 ; a-d a-d - uadd16 r6, r8, r7 ; b+c b+c - usub16 r7, r8, r7 ; b-c b-c - str r6, [r1, r2] ; o5 | o4 - add r6, r2, r2 ; pitch * 2 p2 - str r7, [r1, r6] ; o9 | o8 - add r6, r6, r2 ; pitch * 3 p3 - str r10, [r1, r6] ; o13 | o12 - str r9, [r1], #0x4 ; o1 | o0 ++ - bne loop1_dual_1 ; - mov r5, #0x2 ; i=2 i - sub r0, r1, #8 ; reset input/output i/o -loop2_dual_2 - ldr r6, [r0, r2] ; i5 | i4 5|4 - ldr r1, [r0] ; i1 | i0 1|0 - ldr r12, [r0, #0x4] ; i3 | i2 3|2 - add r14, r2, #0x4 ; pitch + 2 p+2 - ldr r14, [r0, r14] ; i7 | i6 7|6 - smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c - smulwt r7, r3, r1 ; (ip[1] * cospi8sqrt2minus1) >> 16 1c - smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s - smulwt r8, r4, r1 ; (ip[1] * sinpi8sqrt2) >> 16 1s - pkhbt r11, r6, r1, lsl #16 ; i0 | i4 0|4 - pkhbt r7, r9, r7, lsl #16 ; 1c | 5c - pkhbt r8, r10, r8, lsl #16 ; 1s | 5s = temp1 © tc1 - pkhtb r1, r1, r6, asr #16 ; i1 | i5 1|5 - uadd16 r1, r7, r1 ; 1c+1 | 5c+5 = temp2 (d) td2 - pkhbt r9, r14, r12, lsl #16 ; i2 | i6 2|6 - uadd16 r10, r11, r9 ; a a - usub16 r9, r11, r9 ; b b - pkhtb r6, r12, r14, asr #16 ; i3 | i7 3|7 - subs r5, r5, #0x1 ; i-- -- - smulwt r7, r3, r6 ; (ip[3] * cospi8sqrt2minus1) >> 16 3c - smulwt r11, r4, r6 ; (ip[3] * sinpi8sqrt2) >> 16 3s - 
smulwb r12, r3, r6 ; (ip[7] * cospi8sqrt2minus1) >> 16 7c - smulwb r14, r4, r6 ; (ip[7] * sinpi8sqrt2) >> 16 7s - - pkhbt r7, r12, r7, lsl #16 ; 3c | 7c - pkhbt r11, r14, r11, lsl #16 ; 3s | 7s = temp1 (d) td1 - uadd16 r6, r7, r6 ; 3c+3 | 7c+7 = temp2 (c) tc2 - usub16 r12, r8, r6 ; c (o1 | o5) c - uadd16 r6, r11, r1 ; d (o3 | o7) d - uadd16 r7, r10, r6 ; a+d a+d - mov r8, #0x4 ; set up 4's 4 - orr r8, r8, #0x40000 ; 4|4 - usub16 r6, r10, r6 ; a-d a-d - uadd16 r6, r6, r8 ; a-d+4 3|7 - uadd16 r7, r7, r8 ; a+d+4 0|4 - uadd16 r10, r9, r12 ; b+c b+c - usub16 r1, r9, r12 ; b-c b-c - uadd16 r10, r10, r8 ; b+c+4 1|5 - uadd16 r1, r1, r8 ; b-c+4 2|6 - mov r8, r10, asr #19 ; o1 >> 3 - strh r8, [r0, #2] ; o1 - mov r8, r1, asr #19 ; o2 >> 3 - strh r8, [r0, #4] ; o2 - mov r8, r6, asr #19 ; o3 >> 3 - strh r8, [r0, #6] ; o3 - mov r8, r7, asr #19 ; o0 >> 3 - strh r8, [r0], r2 ; o0 +p - sxth r10, r10 ; - mov r8, r10, asr #3 ; o5 >> 3 - strh r8, [r0, #2] ; o5 - sxth r1, r1 ; - mov r8, r1, asr #3 ; o6 >> 3 - strh r8, [r0, #4] ; o6 - sxth r6, r6 ; - mov r8, r6, asr #3 ; o7 >> 3 - strh r8, [r0, #6] ; o7 - sxth r7, r7 ; - mov r8, r7, asr #3 ; o4 >> 3 - strh r8, [r0], r2 ; o4 +p -;;;;; subs r5, r5, #0x1 ; i-- -- - bne loop2_dual_2 ; - ; - -;vpx_memset - ldr r0, [sp] - add sp, sp, #4 - - mov r12, #0 - str r12, [r0] - str r12, [r0, #4] - str r12, [r0, #8] - str r12, [r0, #12] - str r12, [r0, #16] - str r12, [r0, #20] - str r12, [r0, #24] - str r12, [r0, #28] - - ldmia sp!, {r4 - r11, pc} ; replace vars, return restore - - ENDP ;|vp8_dequant_idct_v6| - - END diff --git a/vp8/decoder/arm/dequantize_arm.h b/vp8/decoder/arm/dequantize_arm.h index 78af019..3a044f8 100644 --- a/vp8/decoder/arm/dequantize_arm.h +++ b/vp8/decoder/arm/dequantize_arm.h @@ -14,14 +14,32 @@ #if HAVE_ARMV6 extern prototype_dequant_block(vp8_dequantize_b_v6); +extern prototype_dequant_idct_add(vp8_dequant_idct_add_v6); +extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_v6); #undef vp8_dequant_block #define 
vp8_dequant_block vp8_dequantize_b_v6 +#undef vp8_dequant_idct_add +#define vp8_dequant_idct_add vp8_dequant_idct_add_v6 + +#undef vp8_dequant_dc_idct_add +#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_v6 +#endif + #if HAVE_ARMV7 extern prototype_dequant_block(vp8_dequantize_b_neon); +extern prototype_dequant_idct_add(vp8_dequant_idct_add_neon); +extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_neon); #undef vp8_dequant_block #define vp8_dequant_block vp8_dequantize_b_neon +#undef vp8_dequant_idct_add +#define vp8_dequant_idct_add vp8_dequant_idct_add_neon + +#undef vp8_dequant_dc_idct_add +#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_neon +#endif + #endif diff --git a/vp8/decoder/arm/neon/dequantdcidct_neon.asm b/vp8/decoder/arm/neon/dequant_dc_idct_neon.asm similarity index 65% rename from vp8/decoder/arm/neon/dequantdcidct_neon.asm rename to vp8/decoder/arm/neon/dequant_dc_idct_neon.asm index ae126f2..ddb3240 100644 --- a/vp8/decoder/arm/neon/dequantdcidct_neon.asm +++ b/vp8/decoder/arm/neon/dequant_dc_idct_neon.asm @@ -9,31 +9,43 @@ ; - EXPORT |vp8_dequant_dc_idct_neon| + EXPORT |vp8_dequant_dc_idct_add_neon| ARM REQUIRE8 PRESERVE8 AREA ||.text||, CODE, READONLY, ALIGN=2 -;void vp8_dequant_dc_idct_c(short *input, short *dq, short *output, int pitch, int Dc); +;void vp8_dequant_dc_idct_add_neon(short *input, short *dq, unsigned char *pred, +; unsigned char *dest, int pitch, int stride, +; int Dc); ; r0 short *input, ; r1 short *dq, -; r2 short *output, -; r3 int pitch, -; (stack) int Dc -|vp8_dequant_dc_idct_neon| PROC +; r2 unsigned char *pred +; r3 unsigned char *dest +; sp int pitch +; sp+4 int stride +; sp+8 int Dc +|vp8_dequant_dc_idct_add_neon| PROC vld1.16 {q3, q4}, [r0] vld1.16 {q5, q6}, [r1] - ldr r1, [sp] ;load Dc from stack + ldr r1, [sp, #8] ;load Dc from stack - ldr r12, _dcidct_coeff_ + ldr r12, _CONSTANTS_ vmul.i16 q1, q3, q5 ;input for short_idct4x4llm_neon vmul.i16 q2, q4, q6 vmov.16 d2[0], r1 + ldr r1, [sp] ; 
pitch + vld1.32 {d14[0]}, [r2], r1 + vld1.32 {d14[1]}, [r2], r1 + vld1.32 {d15[0]}, [r2], r1 + vld1.32 {d15[1]}, [r2] + + ldr r1, [sp, #4] ; stride + ;|short_idct4x4llm_neon| PROC vld1.16 {d0}, [r12] vswp d3, d4 ;q2(vp[4] vp[12]) @@ -47,14 +59,9 @@ vshr.s16 q3, q3, #1 vshr.s16 q4, q4, #1 - vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negtive number) + vqadd.s16 q3, q3, q2 vqadd.s16 q4, q4, q2 - ;d6 - c1:temp1 - ;d7 - d1:temp2 - ;d8 - d1:temp1 - ;d9 - c1:temp2 - vqsub.s16 d10, d6, d9 ;c1 vqadd.s16 d11, d7, d8 ;d1 @@ -83,7 +90,7 @@ vshr.s16 q3, q3, #1 vshr.s16 q4, q4, #1 - vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negtive number) + vqadd.s16 q3, q3, q2 vqadd.s16 q4, q4, q2 vqsub.s16 d10, d6, d9 ;c1 @@ -101,34 +108,29 @@ vrshr.s16 d4, d4, #3 vrshr.s16 d5, d5, #3 - add r1, r2, r3 - add r12, r1, r3 - add r0, r12, r3 - vtrn.32 d2, d4 vtrn.32 d3, d5 vtrn.16 d2, d3 vtrn.16 d4, d5 - vst1.16 {d2}, [r2] - vst1.16 {d3}, [r1] - vst1.16 {d4}, [r12] - vst1.16 {d5}, [r0] + vaddw.u8 q1, q1, d14 + vaddw.u8 q2, q2, d15 - bx lr + vqmovun.s16 d0, q1 + vqmovun.s16 d1, q2 + + vst1.32 {d0[0]}, [r3], r1 + vst1.32 {d0[1]}, [r3], r1 + vst1.32 {d1[0]}, [r3], r1 + vst1.32 {d1[1]}, [r3] - ENDP + bx lr -;----------------- - AREA dcidct4x4_dat, DATA, READWRITE ;read/write by default -;Data section with name data_area is specified. DCD reserves space in memory for 48 data. -;One word each is reserved. Label filter_coeff can be used to access the data. -;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ... 
-_dcidct_coeff_ - DCD dcidct_coeff -dcidct_coeff - DCD 0x4e7b4e7b, 0x8a8c8a8c + ENDP ; |vp8_dequant_dc_idct_add_neon| -;20091, 20091, 35468, 35468 +; Constant Pool +_CONSTANTS_ DCD cospi8sqrt2minus1 +cospi8sqrt2minus1 DCD 0x4e7b4e7b +sinpi8sqrt2 DCD 0x8a8c8a8c END diff --git a/vp8/decoder/arm/neon/dequantidct_neon.asm b/vp8/decoder/arm/neon/dequant_idct_neon.asm similarity index 66% rename from vp8/decoder/arm/neon/dequantidct_neon.asm rename to vp8/decoder/arm/neon/dequant_idct_neon.asm index e0888ed..5c60dd6 100644 --- a/vp8/decoder/arm/neon/dequantidct_neon.asm +++ b/vp8/decoder/arm/neon/dequant_idct_neon.asm @@ -9,22 +9,33 @@ ; - EXPORT |vp8_dequant_idct_neon| + EXPORT |vp8_dequant_idct_add_neon| ARM REQUIRE8 PRESERVE8 AREA ||.text||, CODE, READONLY, ALIGN=2 -;void vp8_dequant_idct_c(short *input, short *dq, short *output, int pitch); +;void vp8_dequant_idct_add_neon(short *input, short *dq, unsigned char *pred, +; unsigned char *dest, int pitch, int stride) ; r0 short *input, ; r1 short *dq, -; r2 short *output, -; r3 int pitch, -|vp8_dequant_idct_neon| PROC +; r2 unsigned char *pred +; r3 unsigned char *dest +; sp int pitch +; sp+4 int stride + +|vp8_dequant_idct_add_neon| PROC vld1.16 {q3, q4}, [r0] vld1.16 {q5, q6}, [r1] + ldr r1, [sp] ; pitch + vld1.32 {d14[0]}, [r2], r1 + vld1.32 {d14[1]}, [r2], r1 + vld1.32 {d15[0]}, [r2], r1 + vld1.32 {d15[1]}, [r2] + + ldr r1, [sp, #4] ; stride - ldr r12, _didct_coeff_ + ldr r12, _CONSTANTS_ vmul.i16 q1, q3, q5 ;input for short_idct4x4llm_neon vmul.i16 q2, q4, q6 @@ -42,14 +53,9 @@ vshr.s16 q3, q3, #1 vshr.s16 q4, q4, #1 - vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negtive number) + vqadd.s16 q3, q3, q2 vqadd.s16 q4, q4, q2 - ;d6 - c1:temp1 - ;d7 - d1:temp2 - ;d8 - d1:temp1 - ;d9 - c1:temp2 - vqsub.s16 d10, d6, d9 ;c1 vqadd.s16 d11, d7, d8 ;d1 @@ -78,7 +84,7 @@ vshr.s16 q3, q3, #1 vshr.s16 q4, q4, #1 - vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negtive number) + vqadd.s16 q3, q3, q2 vqadd.s16
q4, q4, q2 vqsub.s16 d10, d6, d9 ;c1 @@ -96,34 +102,29 @@ vrshr.s16 d4, d4, #3 vrshr.s16 d5, d5, #3 - add r1, r2, r3 - add r12, r1, r3 - add r0, r12, r3 - vtrn.32 d2, d4 vtrn.32 d3, d5 vtrn.16 d2, d3 vtrn.16 d4, d5 - vst1.16 {d2}, [r2] - vst1.16 {d3}, [r1] - vst1.16 {d4}, [r12] - vst1.16 {d5}, [r0] + vaddw.u8 q1, q1, d14 + vaddw.u8 q2, q2, d15 - bx lr + vqmovun.s16 d0, q1 + vqmovun.s16 d1, q2 + + vst1.32 {d0[0]}, [r3], r1 + vst1.32 {d0[1]}, [r3], r1 + vst1.32 {d1[0]}, [r3], r1 + vst1.32 {d1[1]}, [r3] - ENDP + bx lr -;----------------- - AREA didct4x4_dat, DATA, READWRITE ;read/write by default -;Data section with name data_area is specified. DCD reserves space in memory for 48 data. -;One word each is reserved. Label filter_coeff can be used to access the data. -;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ... -_didct_coeff_ - DCD didct_coeff -didct_coeff - DCD 0x4e7b4e7b, 0x8a8c8a8c + ENDP ; |vp8_dequant_idct_add_neon| -;20091, 20091, 35468, 35468 +; Constant Pool +_CONSTANTS_ DCD cospi8sqrt2minus1 +cospi8sqrt2minus1 DCD 0x4e7b4e7b +sinpi8sqrt2 DCD 0x8a8c8a8c END diff --git a/vp8/decoder/decodframe.c b/vp8/decoder/decodframe.c index 5668cef..f4c6be9 100644 --- a/vp8/decoder/decodframe.c +++ b/vp8/decoder/decodframe.c @@ -272,8 +272,10 @@ void vp8_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd) if (b->eob > 1) { - DEQUANT_INVOKE(&pbi->dequant, idct_dc_add)(b->qcoeff, &b->dequant[0][0], b->predictor, *(b->base_dst) + b->dst, 16, b->dst_stride, - xd->block[24].diff[i]); + DEQUANT_INVOKE(&pbi->dequant, dc_idct_add) + (b->qcoeff, &b->dequant[0][0], b->predictor, + *(b->base_dst) + b->dst, 16, b->dst_stride, + xd->block[24].diff[i]); } else { diff --git a/vp8/decoder/dequantize.c b/vp8/decoder/dequantize.c index 4c924ff..df7cf5f 100644 --- a/vp8/decoder/dequantize.c +++ b/vp8/decoder/dequantize.c @@ -32,10 +32,10 @@ void vp8_dequantize_b_c(BLOCKD *d) } } -void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *pred, unsigned char *dest, int 
pitch, int stride) +void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *pred, + unsigned char *dest, int pitch, int stride) { - // output needs to be at least pitch * 4 for vp8_short_idct4x4llm_c to work properly - short output[16*4]; + short output[16]; short *diff_ptr = output; int r, c; int i; @@ -45,7 +45,8 @@ void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *pred, unsign input[i] = dq[i] * input[i]; } - vp8_short_idct4x4llm_c(input, output, pitch*2); + // the idct halves ( >> 1) the pitch + vp8_short_idct4x4llm_c(input, output, 4 << 1); vpx_memset(input, 0, 32); @@ -65,16 +66,17 @@ void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *pred, unsign } dest += stride; - diff_ptr += pitch; + diff_ptr += 4; pred += pitch; } } -void vp8_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, int Dc) +void vp8_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred, + unsigned char *dest, int pitch, int stride, + int Dc) { int i; - // output needs to be at least pitch * 4 for vp8_short_idct4x4llm_c to work properly - short output[16*4]; + short output[16]; short *diff_ptr = output; int r, c; @@ -85,7 +87,8 @@ void vp8_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred, uns input[i] = dq[i] * input[i]; } - vp8_short_idct4x4llm_c(input, output, pitch*2); + // the idct halves ( >> 1) the pitch + vp8_short_idct4x4llm_c(input, output, 4 << 1); vpx_memset(input, 0, 32); @@ -105,7 +108,7 @@ void vp8_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred, uns } dest += stride; - diff_ptr += pitch; + diff_ptr += 4; pred += pitch; } } diff --git a/vp8/decoder/dequantize.h b/vp8/decoder/dequantize.h index 50293c2..fbca391 100644 --- a/vp8/decoder/dequantize.h +++ b/vp8/decoder/dequantize.h @@ -21,7 +21,7 @@ unsigned char *pred, unsigned char *output, \ int pitch, int stride) -#define prototype_dequant_idct_dc_add(sym) \ +#define 
prototype_dequant_dc_idct_add(sym) \ void sym(short *input, short *dq, \ unsigned char *pred, unsigned char *output, \ int pitch, int stride, \ @@ -45,21 +45,21 @@ extern prototype_dequant_block(vp8_dequant_block); #endif extern prototype_dequant_idct_add(vp8_dequant_idct_add); -#ifndef vp8_dequant_idct_dc_add -#define vp8_dequant_idct_dc_add vp8_dequant_dc_idct_add_c +#ifndef vp8_dequant_dc_idct_add +#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_c #endif -extern prototype_dequant_idct_dc_add(vp8_dequant_idct_dc_add); +extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add); typedef prototype_dequant_block((*vp8_dequant_block_fn_t)); typedef prototype_dequant_idct_add((*vp8_dequant_idct_add_fn_t)); -typedef prototype_dequant_idct_dc_add((*vp8_dequant_idct_dc_add_fn_t)); +typedef prototype_dequant_dc_idct_add((*vp8_dequant_dc_idct_add_fn_t)); typedef struct { vp8_dequant_block_fn_t block; vp8_dequant_idct_add_fn_t idct_add; - vp8_dequant_idct_dc_add_fn_t idct_dc_add; + vp8_dequant_dc_idct_add_fn_t dc_idct_add; } vp8_dequant_rtcd_vtable_t; #if CONFIG_RUNTIME_CPU_DETECT diff --git a/vp8/decoder/generic/dsystemdependent.c b/vp8/decoder/generic/dsystemdependent.c index c72597f..ab085e2 100644 --- a/vp8/decoder/generic/dsystemdependent.c +++ b/vp8/decoder/generic/dsystemdependent.c @@ -22,7 +22,7 @@ void vp8_dmachine_specific_config(VP8D_COMP *pbi) pbi->mb.rtcd = &pbi->common.rtcd; pbi->dequant.block = vp8_dequantize_b_c; pbi->dequant.idct_add = vp8_dequant_idct_add_c; - pbi->dequant.idct_dc_add = vp8_dequant_dc_idct_add_c; + pbi->dequant.dc_idct_add = vp8_dequant_dc_idct_add_c; pbi->dboolhuff.start = vp8dx_start_decode_c; pbi->dboolhuff.fill = vp8dx_bool_decoder_fill_c; #if 0 //For use with RTCD, when implemented diff --git a/vp8/decoder/x86/dequantize_x86.h b/vp8/decoder/x86/dequantize_x86.h index 32e7779..f0830a7 100644 --- a/vp8/decoder/x86/dequantize_x86.h +++ b/vp8/decoder/x86/dequantize_x86.h @@ -22,7 +22,7 @@ #if HAVE_MMX extern 
prototype_dequant_block(vp8_dequantize_b_mmx); extern prototype_dequant_idct_add(vp8_dequant_idct_add_mmx); -extern prototype_dequant_idct_dc_add(vp8_dequant_dc_idct_add_mmx); +extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_mmx); #if !CONFIG_RUNTIME_CPU_DETECT @@ -30,10 +30,10 @@ extern prototype_dequant_idct_dc_add(vp8_dequant_dc_idct_add_mmx); #define vp8_dequant_block vp8_dequantize_b_mmx #undef vp8_dequant_idct_add -#define vp8_dequant_idct_add vp8_dequant_idct_add_mmx +#define vp8_dequant_idct_add vp8_dequant_idct_add_mmx -#undef vp8_dequant_idct_dc -#define vp8_dequant_idct_add_dc vp8_dequant_dc_idct_add_mmx +#undef vp8_dequant_dc_idct_add +#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_mmx #endif #endif diff --git a/vp8/decoder/x86/x86_dsystemdependent.c b/vp8/decoder/x86/x86_dsystemdependent.c index d7bed08..7891051 100644 --- a/vp8/decoder/x86/x86_dsystemdependent.c +++ b/vp8/decoder/x86/x86_dsystemdependent.c @@ -44,7 +44,7 @@ void vp8_arch_x86_decode_init(VP8D_COMP *pbi) { pbi->dequant.block = vp8_dequantize_b_mmx; pbi->dequant.idct_add = vp8_dequant_idct_add_mmx; - pbi->dequant.idct_dc_add = vp8_dequant_dc_idct_add_mmx; + pbi->dequant.dc_idct_add = vp8_dequant_dc_idct_add_mmx; } #endif diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk index d993927..5b8a301 100644 --- a/vp8/vp8_common.mk +++ b/vp8/vp8_common.mk @@ -125,6 +125,7 @@ VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/systemdependent.c VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/vpx_asm_offsets.c VP8_COMMON_SRCS_REMOVE-$(HAVE_ARMV6) += common/filter_c.c +VP8_COMMON_SRCS_REMOVE-$(HAVE_ARMV6) += common/idctllm.c VP8_COMMON_SRCS_REMOVE-$(HAVE_ARMV6) += common/recon.c VP8_COMMON_SRCS_REMOVE-$(HAVE_ARMV6) += common/reconintra4x4.c VP8_COMMON_SRCS_REMOVE-$(HAVE_ARMV6) += common/generic/systemdependent.c @@ -134,6 +135,7 @@ VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/bilinearfilter_v6$(ASM) VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/copymem8x4_v6$(ASM) VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/copymem8x8_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/copymem8x8_v6$(ASM) VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/copymem16x16_v6$(ASM) +VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/dc_only_idct_add_v6$(ASM) VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/iwalsh_v6$(ASM) VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/filter_v6$(ASM) VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/idct_v6$(ASM) @@ -150,6 +152,7 @@ VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/bilinearpredict16x16_neon$(ASM VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/copymem8x4_neon$(ASM) VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/copymem8x8_neon$(ASM) VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/copymem16x16_neon$(ASM) +VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/dc_only_idct_add_neon$(ASM) VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/iwalsh_neon$(ASM) VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/loopfiltersimplehorizontaledge_neon$(ASM) VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/loopfiltersimpleverticaledge_neon$(ASM) diff --git a/vp8/vp8dx_arm.mk b/vp8/vp8dx_arm.mk index e741680..e9674ca 100644 --- a/vp8/vp8dx_arm.mk +++ b/vp8/vp8dx_arm.mk @@ -23,12 +23,12 @@ VP8_DX_SRCS_REMOVE-$(HAVE_ARMV6) += decoder/generic/dsystemdependent.c #File list for armv6 # decoder -VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequantdcidct_v6$(ASM) -VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequantidct_v6$(ASM) +VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequant_dc_idct_v6$(ASM) +VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequant_idct_v6$(ASM) VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequantize_v6$(ASM) #File list for neon # decoder -VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequantdcidct_neon$(ASM) -VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequantidct_neon$(ASM) +VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequant_dc_idct_neon$(ASM) +VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequant_idct_neon$(ASM) VP8_DX_SRCS-$(HAVE_ARMV7) += 
decoder/arm/neon/dequantizeb_neon$(ASM) -- 2.7.4