From 9602799cd9ecd0529e291f8d1af951bf2fde787b Mon Sep 17 00:00:00 2001 From: Johann Date: Thu, 12 Aug 2010 09:05:37 -0400 Subject: [PATCH] framework for assembly version of the detokenizer Adds a compile-time option, --enable-arm-asm-detok, which pulls in vp8/decoder/arm/detokenize.asm. It is currently about break-even speed-wise, but changes are pending to the fill code (branch and load 3 bytes versus conditionally always load one) and the error handling. Currently it doesn't handle zero runs or overrunning the buffer. This is really just so I don't have to rebase my changes all the time to run benchmarks - now I just need to replace one file! Change-Id: I56d0e2354dc0ca3811bffd0e88fe1f952fa6c797 --- configure | 3 + vp8/decoder/arm/detokenize.asm | 333 +++++++++++++++++++++++++++++++++++++++ vp8/decoder/arm/detokenize_arm.h | 22 +++ vp8/decoder/detokenize.c | 59 +++++++ vp8/decoder/detokenize.h | 10 +- vp8/decoder/onyxd_if.c | 7 +- vp8/vp8dx_arm.mk | 9 +- 7 files changed, 429 insertions(+), 14 deletions(-) create mode 100644 vp8/decoder/arm/detokenize.asm create mode 100644 vp8/decoder/arm/detokenize_arm.h diff --git a/configure b/configure index 5c908d4..ac3d162 100755 --- a/configure +++ b/configure @@ -38,6 +38,7 @@ Advanced options: ${toggle_realtime_only} enable this option while building for real-time encoding ${toggle_runtime_cpu_detect} runtime cpu detection ${toggle_shared} shared library support + ${toggle_arm_asm_detok} assembly version of the detokenizer (ARM platforms only) Codecs: Codecs can be selectively enabled or disabled individually, or by family: @@ -242,6 +243,7 @@ CONFIG_LIST=" spatial_resampling realtime_only shared + arm_asm_detok " CMDLINE_SELECT=" extra_warnings @@ -278,6 +280,7 @@ CMDLINE_SELECT=" spatial_resampling realtime_only shared + arm_asm_detok " process_cmdline() { diff --git a/vp8/decoder/arm/detokenize.asm b/vp8/decoder/arm/detokenize.asm new file mode 100644 index 0000000..bafacb9 --- /dev/null +++ b/vp8/decoder/arm/detokenize.asm @@ 
-0,0 +1,333 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_decode_mb_tokens_v6| + + AREA |.text|, CODE, READONLY ; name this block of code + + INCLUDE vpx_asm_offsets.asm + +l_qcoeff EQU 0 ; l_*: byte offsets from sp of the local variable slots +l_i EQU 4 +l_type EQU 8 +l_stop EQU 12 +l_c EQU 16 +l_l_ptr EQU 20 +l_a_ptr EQU 24 +l_bc EQU 28 +l_coef_ptr EQU 32 +l_stacksize EQU 64 + + +;; constant offsets -- these should be created at build time +c_onyxblock2left_offset EQU 25 +c_onyxblock2above_offset EQU 50 +c_entropy_nodes EQU 11 +c_dct_eob_token EQU 11 + +|vp8_decode_mb_tokens_v6| PROC ; void vp8_decode_mb_tokens_v6(DETOK *detoken, int type) + stmdb sp!, {r4 - r11, lr} ; save r4-r11 and the return address + sub sp, sp, #l_stacksize ; reserve the l_* local slots + mov r7, r1 ; r7 = type (second argument) + mov r9, r0 ; r9 = detoken (first argument) + + ldr r1, [r9, #detok_current_bc] ; r1 = bool decoder state pointer + ldr r0, [r9, #detok_qcoeff_start_ptr] + mov r11, #0 ; i = 0 + mov r3, #0x10 ; stop = 16 + + cmp r7, #1 ; type ?= 1 + addeq r11, r11, #24 ; i = 24 + addeq r3, r3, #8 ; stop = 24 + addeq r0, r0, #3, 24 ; qcoeff_ptr += 24*16 halfword coefficients (#3 ror 24 = 0x300 = 768 bytes) + + str r0, [sp, #l_qcoeff] + str r11, [sp, #l_i] + str r7, [sp, #l_type] + str r3, [sp, #l_stop] + str r1, [sp, #l_bc] + + add lr, r9, r7, lsl #2 ; detoken + type*4 + + ldr r8, [r1, #bool_decoder_user_buffer] ; r8 = bufptr + + ldr r10, [lr, #detok_coef_probs] ; coef_probs[type] + ldr r5, [r1, #bool_decoder_count] ; r5 = count + ldr r6, [r1, #bool_decoder_range] ; r6 = range + ldr r4, [r1, #bool_decoder_value] ; r4 = value + + str r10, [sp, #l_coef_ptr] + + ;align 4 +BLOCK_LOOP + ldr r3, [r9, #detok_ptr_onyxblock2context_leftabove] + ldr r2, [r9, #detok_A] + ldr r1, [r9, #detok_L] + ldrb r12, [r3, r11]! 
; onyxblock2context[i] + + cmp r7, #0 ; c = !type + moveq r7, #1 ; c = 1 when type == 0 + movne r7, #0 ; c = 0 otherwise + + ldr r0, [r2, r12, lsl #2] ; A[onyxblock2context[i]] + add r1, r1, r12, lsl #4 ; L + onyxblock2context[i] << 4 + ; A is ptr to ptr (**) + ; L is ptr to data (*[4]) + + ldrb r2, [r3, #c_onyxblock2above_offset] ; + above offset + ldrb r3, [r3, #c_onyxblock2left_offset] ; + left offset + mov lr, #c_entropy_nodes ; ENTROPY_NODES = 11 +;; ;++ + + ldr r2, [r0, r2, lsl #2]! ; r2 = *a; writeback leaves a (A + above offset) in r0 + ldr r3, [r1, r3, lsl #2]! ; r3 = *l; writeback leaves l (L + left offset) in r1 +; VP8_COMBINEENTROPYCONTEXTS(t, *a, *l) => t = ((*a) != 0) + ((*l) !=0) + cmp r2, #0 ; *a ?= 0 + movne r2, #1 ; r2 = (*a != 0); when *a == 0, r2 already holds 0 + cmp r3, #0 ; *l ?= 0 + addne r2, r2, #1 ; t = (*a != 0) + (*l != 0) + + str r1, [sp, #l_l_ptr] ; save &l + str r0, [sp, #l_a_ptr] ; save &a + smlabb r0, r2, lr, r10 ; Prob = coef_probs + (t * ENTROPY_NODES) + mov r1, #0 ; t = 0 + str r7, [sp, #l_c] ; save c + + ;align 4 +COEFF_LOOP + ldr r3, [r9, #detok_ptr_onyx_coef_bands_x] + ldr lr, [r9, #detok_onyx_coef_tree_ptr] + + ; onyx_coef_bands_x is UINT16 + add r3, r3, r7, lsl #1 ; coef_bands_x[c] + ldrh r3, [r3] ; UINT16 + + ;++ + add r0, r0, r3 ; Prob += coef_bands_x[c] + + ;align 4 +get_token_loop + ldrb r2, [r0, +r1, asr #1] ; Prob[t >> 1] + mov r3, r6, lsl #8 ; range << 8 + sub r3, r3, #256 ; (range << 8) - (1 << 8) + mov r10, #1 ; the "1 +" term of split + + smlawb r2, r3, r2, r10 ; split = 1 + (((range-1) * probability) >> 8) + + ldrb r12, [r8] ; load cx data byte in stall slot : r8 = bufptr + ;++ + + subs r3, r4, r2, lsl #24 ; value-(split<<24): used later to calculate shift for NORMALIZE + addhs r1, r1, #1 ; t += 1 + movhs r4, r3 ; value -= bigsplit (split << 24) + subhs r2, r6, r2 ; range -= split + ; movlo r6, r2 ; range = split + + ldrsb r1, [lr, r1] ; t = onyx_coef_tree_ptr[t] + +; NORMALIZE + clz r3, r2 ; vp8dx_bitreader_norm[range] + 24 + sub r3, r3, #24 ; vp8dx_bitreader_norm[range] + subs r5, r5, r3 ; count -= shift + mov r6, r2, lsl r3 ; range <<= shift + mov 
r4, r4, lsl r3 ; value <<= shift + +; if count <= 0, += BR_COUNT; value |= *bufptr++ << (BR_COUNT-count); BR_COUNT = 8, but need to upshift values by +16 + addle r5, r5, #8 ; count += 8 + rsble r3, r5, #24 ; 24 - count + addle r8, r8, #1 ; bufptr++ + orrle r4, r4, r12, lsl r3 ; value |= *bufptr << shift + 16 + + cmp r1, #0 ; t ?= 0 + bgt get_token_loop ; while (t > 0) + + cmn r1, #c_dct_eob_token ; if(t == -DCT_EOB_TOKEN) + beq END_OF_BLOCK ; break + + rsb lr, r1, #0 ; v = -t; + + cmp lr, #4 ; if(v > FOUR_TOKEN) + ble SKIP_EXTRABITS ; v <= FOUR_TOKEN: skip the extra-bits decode + + ldr r3, [r9, #detok_teb_base_ptr] + mov r11, #1 ; constant 1: the "1 +" term of split and the 1 in v += 1 << bits_count + add r7, r3, lr, lsl #4 ; teb_ptr = detok_teb_base_ptr + (v << 4) + + ldrsh lr, [r7, #tokenextrabits_min_val] ; v = teb_ptr->min_val + ldrsh r0, [r7, #tokenextrabits_length] ; bits_count = teb_ptr->Length + +extrabits_loop + add r3, r0, r7 ; &teb_ptr->Probs[bits_count] + + ldrb r2, [r3, #4] ; probability. why +4? + mov r3, r6, lsl #8 ; range << 8 + sub r3, r3, #256 ; (range << 8) - (1 << 8) + + smlawb r2, r3, r2, r11 ; split = 1 + (((range-1) * probability) >> 8) + + ldrb r12, [r8] ; *bufptr + ;++ + + subs r10, r4, r2, lsl #24 ; value - (split<<24) + movhs r4, r10 ; value = value - (split << 24) + subhs r2, r6, r2 ; range = range - split + addhs lr, lr, r11, lsl r0 ; v += ((UINT16)1<> 1 + + subs r3, r4, r2, lsl #24 ; value - (split<<24) + movhs r4, r3 ; value -= (split << 24) + subhs r2, r6, r2 ; range -= split + mvnhs r3, lr ; ~v + addhs lr, r3, #1 ; v = (v ^ -1) + 1 = -v + +; NORMALIZE + clz r3, r2 ; leading 0s in split + sub r3, r3, #24 ; shift + subs r5, r5, r3 ; count -= shift + mov r6, r2, lsl r3 ; range <<= shift + mov r4, r4, lsl r3 ; value <<= shift + ldrleb r2, [r8], #1 ; *(bufptr++) + addle r5, r5, #8 ; count += 8 + rsble r3, r5, #24 ; BR_COUNT - count + orrle r4, r4, r2, lsl r3 ; value |= *bufptr << (BR_COUNT - count) + + add r0, r0, #0xB ; Prob += ENTROPY_NODES (11) + + cmn r1, #1 ; t < -ONE_TOKEN + + addlt r0, r0, #0xB ; Prob += 
ENTROPY_NODES (11) + + mvn r1, #1 ; t = ~1 = -2, as in the C code (becomes 0 after the += 2 below) + +SKIP_EOB_CHECK + ldr r7, [sp, #l_c] ; c + ldr r3, [r9, #detok_scan] + add r1, r1, #2 ; t+= 2 + cmp r7, #(0x10 - 1) ; compare before c++: c will be one higher at the branch below + + ldr r3, [r3, +r7, lsl #2] ; scan[c] this needs pre-inc c value + add r7, r7, #1 ; c++ + add r3, r11, r3, lsl #1 ; qcoeff + scan[c] + + str r7, [sp, #l_c] ; store c + strh lr, [r3] ; qcoef_ptr[scan[c]] = v + + blt COEFF_LOOP ; flags still from the cmp above: loop while pre-increment c < 15 + + sub r7, r7, #1 ; if(t != -DCT_EOB_TOKEN) --c ; never stored! no condition! + +END_OF_BLOCK + ldr r3, [sp, #l_type] ; type + ldr r10, [sp, #l_coef_ptr] ; coef_ptr + ldr r0, [sp, #l_qcoeff] ; qcoeff + ldr r11, [sp, #l_i] ; i + ldr r12, [sp, #l_stop] ; stop + + cmp r3, #0 ; type ?= 0 + moveq r1, #1 ; r1 = !type + movne r1, #0 + add r3, r11, r9 ; detok + i + + cmp r7, r1 ; c ?= !type + strb r7, [r3, #detok_eob] ; eob[i] = c + + ldr r7, [sp, #l_l_ptr] ; l + ldr r2, [sp, #l_a_ptr] ; a + movne r3, #1 ; t = (c != !type) + moveq r3, #0 + + add r0, r0, #0x20 ; qcoeff += 32 bytes (16 coefficients stored as halfwords) + add r11, r11, #1 ; i++ + str r3, [r7] ; *l = t + str r3, [r2] ; *a = t + str r0, [sp, #l_qcoeff] ; qcoeff + str r11, [sp, #l_i] ; i + + cmp r11, r12 ; i >= stop ? VERIFY should be strictly LT(<)? + ldr r7, [sp, #l_type] ; type + mov lr, #0xB ; 11 (ENTROPY_NODES?) 
+ + blt BLOCK_LOOP + + cmp r11, #0x19 ; i ?= 25 + bne ln2_decode_mb_to + + ldr r12, [r9, #detok_qcoeff_start_ptr] + ldr r10, [r9, #detok_coef_probs] + mov r7, #0 ; r7 = 0, stored below as both type and i + mov r3, #0x10 ; stop = 16 + str r12, [sp, #l_qcoeff] ; qcoeff_ptr = qcoeff_start_ptr + str r7, [sp, #l_i] + str r7, [sp, #l_type] + str r3, [sp, #l_stop] + + str r10, [sp, #l_coef_ptr] ; coef_probs = coef_probs[type] (0) + + b BLOCK_LOOP ; NOTE(review): r11 still holds 25 here while l_i was reset to 0, and BLOCK_LOOP indexes with r11 - confirm + +ln2_decode_mb_to + cmp r11, #0x10 ; i ?= 16 + bne ln1_decode_mb_to + + mov r10, #detok_coef_probs + add r10, r10, #2*4 ; coef_probs[type] + ldr r10, [r9, r10] ; detok + 48 - THIS IS PROBABLY THE ISSUE: NEW STRUCTURE + + mov r7, #2 ; type = 2 + mov r3, #0x18 ; stop = 24 + + str r7, [sp, #l_type] + str r3, [sp, #l_stop] + + str r10, [sp, #l_coef_ptr] ; coef_probs = coef_probs[type] - didn't want to add 2 to coef_probs + b BLOCK_LOOP + +ln1_decode_mb_to + ldr r2, [sp, #l_bc] ; r2 = bool decoder state pointer saved at entry + mov r0, #0 + nop + + str r8, [r2, #bool_decoder_user_buffer] ; write bufptr, count, value and range back + str r5, [r2, #bool_decoder_count] + str r4, [r2, #bool_decoder_value] + str r6, [r2, #bool_decoder_range] + + add sp, sp, #l_stacksize ; release the l_* local slots + ldmia sp!, {r4 - r11, pc} ; restore r4-r11 and return + + ENDP ; |vp8_decode_mb_tokens_v6| + + END diff --git a/vp8/decoder/arm/detokenize_arm.h b/vp8/decoder/arm/detokenize_arm.h new file mode 100644 index 0000000..1c53f7b --- /dev/null +++ b/vp8/decoder/arm/detokenize_arm.h @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#ifndef DETOKENIZE_ARM_H +#define DETOKENIZE_ARM_H + +#if HAVE_ARMV6 +#if CONFIG_ARM_ASM_DETOK +void vp8_init_detokenizer(VP8D_COMP *dx); +void vp8_decode_mb_tokens_v6(DETOK *detoken, int type); +#endif +#endif + +#endif diff --git a/vp8/decoder/detokenize.c b/vp8/decoder/detokenize.c index 7407417..34faae3 100644 --- a/vp8/decoder/detokenize.c +++ b/vp8/decoder/detokenize.c @@ -14,6 +14,7 @@ #include "onyxd_int.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" +#include "detokenize.h" #define BOOL_DATA UINT8 @@ -103,6 +104,34 @@ void vp8_reset_mb_tokens_context(MACROBLOCKD *x) *l = 0; } } + +#if CONFIG_ARM_ASM_DETOK +DECLARE_ALIGNED(16, const UINT8, vp8_block2context_leftabove[25*3]) = // three consecutive 25-entry rows, one entry per block index +{ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, //end of vp8_block2context + 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 0, 0, 1, 1, 0, 0, 1, 1, 0, //end of vp8_block2left + 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0 //end of vp8_block2above +}; + +void vp8_init_detokenizer(VP8D_COMP *dx) // one-time setup of dx->detoken: table pointers, qcoeff start and per-type coef_probs +{ + const VP8_COMMON *const oc = & dx->common; + MACROBLOCKD *x = & dx->mb; + + dx->detoken.vp8_coef_tree_ptr = vp8_coef_tree; + dx->detoken.ptr_onyxblock2context_leftabove = vp8_block2context_leftabove; + dx->detoken.ptr_onyx_coef_bands_x = vp8_coef_bands_x; + dx->detoken.scan = vp8_default_zig_zag1d; + dx->detoken.teb_base_ptr = vp8d_token_extra_bits2; + dx->detoken.qcoeff_start_ptr = &x->qcoeff[0]; + + dx->detoken.coef_probs[0] = (oc->fc.coef_probs [0] [ 0 ] [0]); + dx->detoken.coef_probs[1] = (oc->fc.coef_probs [1] [ 0 ] [0]); + dx->detoken.coef_probs[2] = (oc->fc.coef_probs [2] [ 0 ] [0]); + dx->detoken.coef_probs[3] = (oc->fc.coef_probs [3] [ 0 ] [0]); +} +#endif + DECLARE_ALIGNED(16, extern const unsigned int, vp8dx_bitreader_norm[256]); #define FILL \ if(count < 0) \ @@ -200,6 +229,35 @@ DECLARE_ALIGNED(16, extern const unsigned int, vp8dx_bitreader_norm[256]); }\ NORMALIZE +#if CONFIG_ARM_ASM_DETOK +int 
vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *x) // wrapper: runs vp8_decode_mb_tokens_v6 and returns the summed eob count +{ + int eobtotal = 0; + int i, type; + + dx->detoken.current_bc = x->current_bc; // per-call inputs for vp8_decode_mb_tokens_v6 + dx->detoken.A = x->above_context; + dx->detoken.L = x->left_context; + + type = 3; // kept for B_PRED and SPLITMV macroblocks + + if (x->mbmi.mode != B_PRED && x->mbmi.mode != SPLITMV) + { + type = 1; + eobtotal -= 16; + } + + vp8_decode_mb_tokens_v6(&dx->detoken, type); + + for (i = 0; i < 25; i++) // copy each eob out of detoken and add it to the total + { + x->block[i].eob = dx->detoken.eob[i]; + eobtotal += dx->detoken.eob[i]; + } + + return eobtotal; +} +#else int vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *x) { ENTROPY_CONTEXT **const A = x->above_context; @@ -395,3 +453,4 @@ BLOCK_FINISHED: return eobtotal; } +#endif //!CONFIG_ARM_ASM_DETOK diff --git a/vp8/decoder/detokenize.h b/vp8/decoder/detokenize.h index 2f6b4a9..aa98dea 100644 --- a/vp8/decoder/detokenize.h +++ b/vp8/decoder/detokenize.h @@ -9,12 +9,16 @@ */ -#ifndef detokenize_h -#define detokenize_h 1 +#ifndef DETOKENIZE_H +#define DETOKENIZE_H #include "onyxd_int.h" +#if ARCH_ARM +#include "arm/detokenize_arm.h" +#endif + void vp8_reset_mb_tokens_context(MACROBLOCKD *x); int vp8_decode_mb_tokens(VP8D_COMP *, MACROBLOCKD *); -#endif /* detokenize_h */ +#endif /* DETOKENIZE_H */ diff --git a/vp8/decoder/onyxd_if.c b/vp8/decoder/onyxd_if.c index 728d5ca..5a88ba0 100644 --- a/vp8/decoder/onyxd_if.c +++ b/vp8/decoder/onyxd_if.c @@ -29,13 +29,11 @@ #include "vpx_scale/vpxscale.h" #include "systemdependent.h" #include "vpx_ports/vpx_timer.h" - +#include "detokenize.h" extern void vp8_init_loop_filter(VP8_COMMON *cm); - extern void vp8cx_init_de_quantizer(VP8D_COMP *pbi); -// DEBUG code #if CONFIG_DEBUG void vp8_recon_write_yuv_frame(unsigned char *name, YV12_BUFFER_CONFIG *s) { @@ -129,6 +127,9 @@ VP8D_PTR vp8dx_create_decompressor(VP8D_CONFIG *oxcf) cm->last_sharpness_level = cm->sharpness_level; } +#if CONFIG_ARM_ASM_DETOK + vp8_init_detokenizer(pbi); +#endif pbi->common.error.setjmp = 0; return (VP8D_PTR) pbi; } diff --git a/vp8/vp8dx_arm.mk b/vp8/vp8dx_arm.mk 
index e9674ca..d40f76e 100644 --- a/vp8/vp8dx_arm.mk +++ b/vp8/vp8dx_arm.mk @@ -11,24 +11,17 @@ #VP8_DX_SRCS list is modified according to different platforms. -#File list for arm -# decoder -#VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/decodframe_arm.c VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/dequantize_arm.c VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/dsystemdependent.c - -#VP8_DX_SRCS_REMOVE-$(HAVE_ARMV6) += decoder/decodframe.c -VP8_DX_SRCS_REMOVE-$(HAVE_ARMV6) += decoder/dequantize.c VP8_DX_SRCS_REMOVE-$(HAVE_ARMV6) += decoder/generic/dsystemdependent.c +VP8_DX_SRCS-$(CONFIG_ARM_ASM_DETOK) += decoder/arm/detokenize$(ASM) #File list for armv6 -# decoder VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequant_dc_idct_v6$(ASM) VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequant_idct_v6$(ASM) VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequantize_v6$(ASM) #File list for neon -# decoder VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequant_dc_idct_neon$(ASM) VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequant_idct_neon$(ASM) VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequantizeb_neon$(ASM) -- 2.7.4