From 4c3ad66b7f63c1e69318941448533a6a7f05c77e Mon Sep 17 00:00:00 2001 From: Tero Rintaluoma Date: Mon, 19 Sep 2011 10:24:02 +0300 Subject: [PATCH] Updated ARMv6 forward transforms to match C - Updated walsh transform to match C (based on Change Id24f3392) - Changed fast_fdct4x4 and 8x4 to short_fdct4x4 and 8x4 correspondingly Change-Id: I704e862f40e315b0a79997633c7bd9c347166a8e --- vp8/encoder/arm/arm_csystemdependent.c | 8 +- ...ct4x4_armv6.asm => vp8_short_fdct4x4_armv6.asm} | 4 +- vp8/encoder/arm/armv6/walsh_v6.asm | 301 +++++++++++++-------- vp8/encoder/arm/dct_arm.c | 8 +- vp8/encoder/arm/dct_arm.h | 14 +- vp8/vp8cx_arm.mk | 2 +- 6 files changed, 204 insertions(+), 133 deletions(-) rename vp8/encoder/arm/armv6/{vp8_fast_fdct4x4_armv6.asm => vp8_short_fdct4x4_armv6.asm} (99%) diff --git a/vp8/encoder/arm/arm_csystemdependent.c b/vp8/encoder/arm/arm_csystemdependent.c index 89f8136..a6572b3 100644 --- a/vp8/encoder/arm/arm_csystemdependent.c +++ b/vp8/encoder/arm/arm_csystemdependent.c @@ -58,10 +58,10 @@ void vp8_arch_arm_encoder_init(VP8_COMP *cpi) /*cpi->rtcd.variance.get4x4sse_cs = vp8_get4x4sse_cs_c;*/ - /*cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_c; - cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_c;*/ - cpi->rtcd.fdct.fast4x4 = vp8_fast_fdct4x4_armv6; - cpi->rtcd.fdct.fast8x4 = vp8_fast_fdct8x4_armv6; + cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_armv6; + cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_armv6; + cpi->rtcd.fdct.fast4x4 = vp8_short_fdct4x4_armv6; + cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_armv6; cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_armv6; /*cpi->rtcd.encodemb.berr = vp8_block_error_c; diff --git a/vp8/encoder/arm/armv6/vp8_fast_fdct4x4_armv6.asm b/vp8/encoder/arm/armv6/vp8_short_fdct4x4_armv6.asm similarity index 99% rename from vp8/encoder/arm/armv6/vp8_fast_fdct4x4_armv6.asm rename to vp8/encoder/arm/armv6/vp8_short_fdct4x4_armv6.asm index 65bd2b4..8034c1d 100644 --- a/vp8/encoder/arm/armv6/vp8_fast_fdct4x4_armv6.asm +++ 
b/vp8/encoder/arm/armv6/vp8_short_fdct4x4_armv6.asm @@ -8,7 +8,7 @@ ; be found in the AUTHORS file in the root of the source tree. ; - EXPORT |vp8_fast_fdct4x4_armv6| + EXPORT |vp8_short_fdct4x4_armv6| ARM REQUIRE8 @@ -16,7 +16,7 @@ AREA |.text|, CODE, READONLY ; void vp8_short_fdct4x4_c(short *input, short *output, int pitch) -|vp8_fast_fdct4x4_armv6| PROC +|vp8_short_fdct4x4_armv6| PROC stmfd sp!, {r4 - r12, lr} diff --git a/vp8/encoder/arm/armv6/walsh_v6.asm b/vp8/encoder/arm/armv6/walsh_v6.asm index 61ffdb3..5eaf3f2 100644 --- a/vp8/encoder/arm/armv6/walsh_v6.asm +++ b/vp8/encoder/arm/armv6/walsh_v6.asm @@ -17,129 +17,196 @@ AREA |.text|, CODE, READONLY ; name this block of code ;short vp8_short_walsh4x4_armv6(short *input, short *output, int pitch) +; r0 short *input, +; r1 short *output, +; r2 int pitch |vp8_short_walsh4x4_armv6| PROC stmdb sp!, {r4 - r11, lr} - mov r12, r2 ; ugh. not clean - ldr r2, [r0] ; [1 | 0] - ldr r3, [r0, #4] ; [3 | 2] - ldr r4, [r0, r12]! ; [5 | 4] - ldr r5, [r0, #4] ; [7 | 6] - ldr r6, [r0, r12]! ; [9 | 8] - ldr r7, [r0, #4] ; [11 | 10] - ldr r8, [r0, r12]! 
; [13 | 12] - ldr r9, [r0, #4] ; [15 | 14] - - qsubaddx r10, r2, r3 ; [c1|a1] [1-2 | 0+3] - qaddsubx r11, r2, r3 ; [b1|d1] [1+2 | 0-3] - qsubaddx r12, r4, r5 ; [c1|a1] [5-6 | 4+7] - qaddsubx lr, r4, r5 ; [b1|d1] [5+6 | 4-7] - - qaddsubx r2, r10, r11 ; [1 | 2] [c1+d1 | a1-b1] - qaddsubx r3, r11, r10 ; [0 | 3] [b1+a1 | d1-c1] - qaddsubx r4, r12, lr ; [5 | 6] [c1+d1 | a1-b1] - qaddsubx r5, lr, r12 ; [4 | 7] [b1+a1 | d1-c1] - - qsubaddx r10, r6, r7 ; [c1|a1] [9-10 | 8+11] - qaddsubx r11, r6, r7 ; [b1|d1] [9+10 | 8-11] - qsubaddx r12, r8, r9 ; [c1|a1] [13-14 | 12+15] - qaddsubx lr, r8, r9 ; [b1|d1] [13+14 | 12-15] - - qaddsubx r6, r10, r11 ; [9 |10] [c1+d1 | a1-b1] - qaddsubx r7, r11, r10 ; [8 |11] [b1+a1 | d1-c1] - qaddsubx r8, r12, lr ; [13|14] [c1+d1 | a1-b1] - qaddsubx r9, lr, r12 ; [12|15] [b1+a1 | d1-c1] - - ; first transform complete - - qadd16 r10, r3, r9 ; a1 [0+12 | 3+15] - qadd16 r11, r5, r7 ; b1 [4+8 | 7+11] - qsub16 r12, r5, r7 ; c1 [4-8 | 7-11] - qsub16 lr, r3, r9 ; d1 [0-12 | 3-15] - - qadd16 r3, r10, r11 ; a2 [a1+b1] [0 | 3] - qadd16 r5, r12, lr ; b2 [c1+d1] [4 | 7] - qsub16 r7, r10, r11 ; c2 [a1-b1] [8 |11] - qsub16 r9, lr, r12 ; d2 [d1-c1] [12|15] - - qadd16 r10, r2, r8 ; a1 [1+13 | 2+14] - qadd16 r11, r4, r6 ; b1 [5+9 | 6+10] - qsub16 r12, r4, r6 ; c1 [5-9 | 6-10] - qsub16 lr, r2, r8 ; d1 [1-13 | 2-14] - - qadd16 r2, r10, r11 ; a2 [a1+b1] [1 | 2] - qadd16 r4, r12, lr ; b2 [c1+d1] [5 | 6] - qsub16 r6, r10, r11 ; c2 [a1-b1] [9 |10] - qsub16 r8, lr, r12 ; d2 [d1-c1] [13|14] - - ; [a-d]2 += ([a-d]2 > 0) - - asrs r10, r3, #16 - addpl r10, r10, #1 ; [~0] - asrs r11, r2, #16 - addpl r11, r11, #1 ; [~1] - lsl r11, r11, #15 ; [1 | x] - pkhtb r10, r11, r10, asr #1; [1 | 0] - str r10, [r1], #4 - - lsls r11, r2, #16 - addpl r11, r11, #0x10000 ; [~2] - lsls r12, r3, #16 - addpl r12, r12, #0x10000 ; [~3] - asr r12, r12, #1 ; [3 | x] - pkhtb r11, r12, r11, asr #17; [3 | 2] - str r11, [r1], #4 - - asrs r2, r5, #16 - addpl r2, r2, #1 ; [~4] - asrs r3, r4, #16 - addpl 
r3, r3, #1 ; [~5] - lsl r3, r3, #15 ; [5 | x] - pkhtb r2, r3, r2, asr #1 ; [5 | 4] - str r2, [r1], #4 - - lsls r2, r4, #16 - addpl r2, r2, #0x10000 ; [~6] - lsls r3, r5, #16 - addpl r3, r3, #0x10000 ; [~7] - asr r3, r3, #1 ; [7 | x] - pkhtb r2, r3, r2, asr #17 ; [7 | 6] - str r2, [r1], #4 - - asrs r2, r7, #16 - addpl r2, r2, #1 ; [~8] - asrs r3, r6, #16 - addpl r3, r3, #1 ; [~9] - lsl r3, r3, #15 ; [9 | x] - pkhtb r2, r3, r2, asr #1 ; [9 | 8] - str r2, [r1], #4 - - lsls r2, r6, #16 - addpl r2, r2, #0x10000 ; [~10] - lsls r3, r7, #16 - addpl r3, r3, #0x10000 ; [~11] - asr r3, r3, #1 ; [11 | x] - pkhtb r2, r3, r2, asr #17 ; [11 | 10] - str r2, [r1], #4 - - asrs r2, r9, #16 - addpl r2, r2, #1 ; [~12] - asrs r3, r8, #16 - addpl r3, r3, #1 ; [~13] - lsl r3, r3, #15 ; [13 | x] - pkhtb r2, r3, r2, asr #1 ; [13 | 12] - str r2, [r1], #4 - - lsls r2, r8, #16 - addpl r2, r2, #0x10000 ; [~14] - lsls r3, r9, #16 - addpl r3, r3, #0x10000 ; [~15] - asr r3, r3, #1 ; [15 | x] - pkhtb r2, r3, r2, asr #17 ; [15 | 14] - str r2, [r1] + ldrd r4, r5, [r0], r2 + ldr lr, c00040004 + ldrd r6, r7, [r0], r2 + + ; 0-3 + qadd16 r3, r4, r5 ; [d1|a1] [1+3 | 0+2] + qsub16 r4, r4, r5 ; [c1|b1] [1-3 | 0-2] + + ldrd r8, r9, [r0], r2 + ; 4-7 + qadd16 r5, r6, r7 ; [d1|a1] [5+7 | 4+6] + qsub16 r6, r6, r7 ; [c1|b1] [5-7 | 4-6] + + ldrd r10, r11, [r0] + ; 8-11 + qadd16 r7, r8, r9 ; [d1|a1] [9+11 | 8+10] + qsub16 r8, r8, r9 ; [c1|b1] [9-11 | 8-10] + + ; 12-15 + qadd16 r9, r10, r11 ; [d1|a1] [13+15 | 12+14] + qsub16 r10, r10, r11 ; [c1|b1] [13-15 | 12-14] + + + lsls r2, r3, #16 + smuad r11, r3, lr ; A0 = a1<<2 + d1<<2 + addne r11, r11, #1 ; A0 += (a1!=0) + + lsls r2, r7, #16 + smuad r12, r7, lr ; C0 = a1<<2 + d1<<2 + addne r12, r12, #1 ; C0 += (a1!=0) + + add r0, r11, r12 ; a1_0 = A0 + C0 + sub r11, r11, r12 ; b1_0 = A0 - C0 + + lsls r2, r5, #16 + smuad r12, r5, lr ; B0 = a1<<2 + d1<<2 + addne r12, r12, #1 ; B0 += (a1!=0) + + lsls r2, r9, #16 + smuad r2, r9, lr ; D0 = a1<<2 + d1<<2 + addne r2, r2, #1 ; D0 
+= (a1!=0) + + add lr, r12, r2 ; d1_0 = B0 + D0 + sub r12, r12, r2 ; c1_0 = B0 - D0 + + ; op[0,4,8,12] + adds r2, r0, lr ; a2 = a1_0 + d1_0 + addmi r2, r2, #1 ; += a2 < 0 + add r2, r2, #3 ; += 3 + subs r0, r0, lr ; d2 = a1_0 - d1_0 + mov r2, r2, asr #3 ; >> 3 + strh r2, [r1] ; op[0] + + addmi r0, r0, #1 ; += d2 < 0 + add r0, r0, #3 ; += 3 + ldr lr, c00040004 + mov r0, r0, asr #3 ; >> 3 + strh r0, [r1, #24] ; op[12] + + adds r2, r11, r12 ; b2 = b1_0 + c1_0 + addmi r2, r2, #1 ; += b2 < 0 + add r2, r2, #3 ; += 3 + subs r0, r11, r12 ; c2 = b1_0 - c1_0 + mov r2, r2, asr #3 ; >> 3 + strh r2, [r1, #8] ; op[4] + + addmi r0, r0, #1 ; += c2 < 0 + add r0, r0, #3 ; += 3 + smusd r3, r3, lr ; A3 = a1<<2 - d1<<2 + smusd r7, r7, lr ; C3 = a1<<2 - d1<<2 + mov r0, r0, asr #3 ; >> 3 + strh r0, [r1, #16] ; op[8] + + + ; op[3,7,11,15] + add r0, r3, r7 ; a1_3 = A3 + C3 + sub r3, r3, r7 ; b1_3 = A3 - C3 + + smusd r5, r5, lr ; B3 = a1<<2 - d1<<2 + smusd r9, r9, lr ; D3 = a1<<2 - d1<<2 + add r7, r5, r9 ; d1_3 = B3 + D3 + sub r5, r5, r9 ; c1_3 = B3 - D3 + + adds r2, r0, r7 ; a2 = a1_3 + d1_3 + addmi r2, r2, #1 ; += a2 < 0 + add r2, r2, #3 ; += 3 + adds r9, r3, r5 ; b2 = b1_3 + c1_3 + mov r2, r2, asr #3 ; >> 3 + strh r2, [r1, #6] ; op[3] + + addmi r9, r9, #1 ; += b2 < 0 + add r9, r9, #3 ; += 3 + subs r2, r3, r5 ; c2 = b1_3 - c1_3 + mov r9, r9, asr #3 ; >> 3 + strh r9, [r1, #14] ; op[7] + + addmi r2, r2, #1 ; += c2 < 0 + add r2, r2, #3 ; += 3 + subs r9, r0, r7 ; d2 = a1_3 - d1_3 + mov r2, r2, asr #3 ; >> 3 + strh r2, [r1, #22] ; op[11] + + addmi r9, r9, #1 ; += d2 < 0 + add r9, r9, #3 ; += 3 + smuad r3, r4, lr ; A1 = b1<<2 + c1<<2 + smuad r5, r8, lr ; C1 = b1<<2 + c1<<2 + mov r9, r9, asr #3 ; >> 3 + strh r9, [r1, #30] ; op[15] + + ; op[1,5,9,13] + add r0, r3, r5 ; a1_1 = A1 + C1 + sub r3, r3, r5 ; b1_1 = A1 - C1 + + smuad r7, r6, lr ; B1 = b1<<2 + c1<<2 + smuad r9, r10, lr ; D1 = b1<<2 + c1<<2 + add r5, r7, r9 ; d1_1 = B1 + D1 + sub r7, r7, r9 ; c1_1 = B1 - D1 + + adds r2, r0, r5 ; a2 = a1_1 
+ d1_1 + addmi r2, r2, #1 ; += a2 < 0 + add r2, r2, #3 ; += 3 + adds r9, r3, r7 ; b2 = b1_1 + c1_1 + mov r2, r2, asr #3 ; >> 3 + strh r2, [r1, #2] ; op[1] + + addmi r9, r9, #1 ; += b2 < 0 + add r9, r9, #3 ; += 3 + subs r2, r3, r7 ; c2 = b1_1 - c1_1 + mov r9, r9, asr #3 ; >> 3 + strh r9, [r1, #10] ; op[5] + + addmi r2, r2, #1 ; += c2 < 0 + add r2, r2, #3 ; += 3 + subs r9, r0, r5 ; d2 = a1_1 - d1_1 + mov r2, r2, asr #3 ; >> 3 + strh r2, [r1, #18] ; op[9] + + addmi r9, r9, #1 ; += d2 < 0 + add r9, r9, #3 ; += 3 + smusd r4, r4, lr ; A2 = b1<<2 - c1<<2 + smusd r8, r8, lr ; C2 = b1<<2 - c1<<2 + mov r9, r9, asr #3 ; >> 3 + strh r9, [r1, #26] ; op[13] + + + ; op[2,6,10,14] + add r11, r4, r8 ; a1_2 = A2 + C2 + sub r12, r4, r8 ; b1_2 = A2 - C2 + + smusd r6, r6, lr ; B2 = b1<<2 - c1<<2 + smusd r10, r10, lr ; D2 = b1<<2 - c1<<2 + add r4, r6, r10 ; d1_2 = B2 + D2 + sub r8, r6, r10 ; c1_2 = B2 - D2 + + adds r2, r11, r4 ; a2 = a1_2 + d1_2 + addmi r2, r2, #1 ; += a2 < 0 + add r2, r2, #3 ; += 3 + adds r9, r12, r8 ; b2 = b1_2 + c1_2 + mov r2, r2, asr #3 ; >> 3 + strh r2, [r1, #4] ; op[2] + + addmi r9, r9, #1 ; += b2 < 0 + add r9, r9, #3 ; += 3 + subs r2, r12, r8 ; c2 = b1_2 - c1_2 + mov r9, r9, asr #3 ; >> 3 + strh r9, [r1, #12] ; op[6] + + addmi r2, r2, #1 ; += c2 < 0 + add r2, r2, #3 ; += 3 + subs r9, r11, r4 ; d2 = a1_2 - d1_2 + mov r2, r2, asr #3 ; >> 3 + strh r2, [r1, #20] ; op[10] + + addmi r9, r9, #1 ; += d2 < 0 + add r9, r9, #3 ; += 3 + mov r9, r9, asr #3 ; >> 3 + strh r9, [r1, #28] ; op[14] + ldmia sp!, {r4 - r11, pc} ENDP ; |vp8_short_walsh4x4_armv6| +c00040004 + DCD 0x00040004 + END diff --git a/vp8/encoder/arm/dct_arm.c b/vp8/encoder/arm/dct_arm.c index 60d649d..2692acb 100644 --- a/vp8/encoder/arm/dct_arm.c +++ b/vp8/encoder/arm/dct_arm.c @@ -13,12 +13,10 @@ #if HAVE_ARMV6 -void vp8_fast_fdct8x4_armv6(short *input, short *output, int pitch) +void vp8_short_fdct8x4_armv6(short *input, short *output, int pitch) { - vp8_fast_fdct4x4_armv6(input, output, pitch); - 
vp8_fast_fdct4x4_armv6(input + 4, output + 16, pitch); + vp8_short_fdct4x4_armv6(input, output, pitch); + vp8_short_fdct4x4_armv6(input + 4, output + 16, pitch); } #endif /* HAVE_ARMV6 */ - - diff --git a/vp8/encoder/arm/dct_arm.h b/vp8/encoder/arm/dct_arm.h index 769d5f4..f94022b 100644 --- a/vp8/encoder/arm/dct_arm.h +++ b/vp8/encoder/arm/dct_arm.h @@ -14,18 +14,24 @@ #if HAVE_ARMV6 extern prototype_fdct(vp8_short_walsh4x4_armv6); -extern prototype_fdct(vp8_fast_fdct4x4_armv6); -extern prototype_fdct(vp8_fast_fdct8x4_armv6); +extern prototype_fdct(vp8_short_fdct4x4_armv6); +extern prototype_fdct(vp8_short_fdct8x4_armv6); #if !CONFIG_RUNTIME_CPU_DETECT #undef vp8_fdct_walsh_short4x4 #define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_armv6 +#undef vp8_fdct_short4x4 +#define vp8_fdct_short4x4 vp8_short_fdct4x4_armv6 + +#undef vp8_fdct_short8x4 +#define vp8_fdct_short8x4 vp8_short_fdct8x4_armv6 + #undef vp8_fdct_fast4x4 -#define vp8_fdct_fast4x4 vp8_fast_fdct4x4_armv6 +#define vp8_fdct_fast4x4 vp8_short_fdct4x4_armv6 #undef vp8_fdct_fast8x4 -#define vp8_fdct_fast8x4 vp8_fast_fdct8x4_armv6 +#define vp8_fdct_fast8x4 vp8_short_fdct8x4_armv6 #endif #endif /* HAVE_ARMV6 */ diff --git a/vp8/vp8cx_arm.mk b/vp8/vp8cx_arm.mk index e8dbd5d..4a860f4 100644 --- a/vp8/vp8cx_arm.mk +++ b/vp8/vp8cx_arm.mk @@ -36,7 +36,7 @@ VP8_CX_SRCS-$(HAVE_ARMV5TE) += encoder/arm/armv5te/vp8_packtokens_partitions_ar #File list for armv6 # encoder VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_subtract_armv6$(ASM) -VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_fast_fdct4x4_armv6$(ASM) +VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_short_fdct4x4_armv6$(ASM) VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_fast_quantize_b_armv6$(ASM) VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_sad16x16_armv6$(ASM) VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_variance16x16_armv6$(ASM) -- 2.7.4