From: Yunqing Wang Date: Wed, 7 Nov 2012 00:06:22 +0000 (-0800) Subject: Optimize 16x16 dequant and idct X-Git-Tag: v1.3.0~1217^2~118^2 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=6c17c9fae0f54942abe480414b4c0db7b2476412;p=platform%2Fupstream%2Flibvpx.git Optimize 16x16 dequant and idct As suggested by Yaowu, simplified 16x16 dequant and idct. In decoder, after detoken step, we know the number of non-zero dct coefficients (eobs) in a macroblock. Idct calculation can be skipped or simplified based on eobs, which improves the decoder performance. Change-Id: I9ffa1cb134bcb5a7d64fcf90c81871a96d1b4018 --- diff --git a/vp9/common/generic/systemdependent.c b/vp9/common/generic/systemdependent.c index ed98d30..a3d6cb4 100644 --- a/vp9/common/generic/systemdependent.c +++ b/vp9/common/generic/systemdependent.c @@ -32,6 +32,7 @@ void vp9_machine_specific_config(VP9_COMMON *ctx) { rtcd->idct.idct1_scalar_add_8x8 = vp9_dc_only_idct_add_8x8_c; rtcd->idct.ihaar2 = vp9_short_ihaar2x2_c; rtcd->idct.idct16x16 = vp9_short_idct16x16_c; + rtcd->idct.idct10_16x16 = vp9_short_idct10_16x16_c; rtcd->subpix.eighttap16x16 = vp9_eighttap_predict16x16_c; rtcd->subpix.eighttap8x8 = vp9_eighttap_predict8x8_c; diff --git a/vp9/common/idct.h b/vp9/common/idct.h index cbe1169..b8d3121 100644 --- a/vp9/common/idct.h +++ b/vp9/common/idct.h @@ -50,6 +50,11 @@ #endif extern prototype_idct(vp9_idct_idct16x16); +#ifndef vp9_idct_idct10_16x16 +#define vp9_idct_idct10_16x16 vp9_short_idct10_16x16_c +#endif +extern prototype_idct(vp9_idct_idct10_16x16); + #ifndef vp9_idct_idct8 #define vp9_idct_idct8 vp9_short_idct8x8_c #endif @@ -133,6 +138,7 @@ typedef struct { vp9_idct_fn_t ihaar2_1; vp9_idct_fn_t idct16x16; + vp9_idct_fn_t idct10_16x16; } vp9_idct_rtcd_vtable_t; #if CONFIG_RUNTIME_CPU_DETECT diff --git a/vp9/common/idctllm.c b/vp9/common/idctllm.c index 4b0ac55..3efc094 100644 --- a/vp9/common/idctllm.c +++ b/vp9/common/idctllm.c @@ -1502,6 +1502,161 @@ void 
vp9_short_idct16x16_c(int16_t *input, int16_t *output, int pitch) { output[j * 16 + i] = temp_out[j]; } } + +/* The following function is called when we know the maximum number of non-zero + * dct coefficients is less or equal 10. + */ +static void butterfly_16x16_idct10_1d(int16_t input[16], int16_t output[16], + int last_shift_bits) { + int16_t step[16] = {0}; + int intermediate[16] = {0}; + int temp1, temp2; + int last_rounding = 0; + + if (last_shift_bits > 0) + last_rounding = 1 << (last_shift_bits - 1); + + // step 1 and 2 + step[ 0] = (input[0] + INITIAL_ROUNDING) >> INITIAL_SHIFT; + step[ 1] = (input[0] + INITIAL_ROUNDING) >> INITIAL_SHIFT; + + temp1 = (2 * (input[2] * C8) + RIGHT_ROUNDING) >> RIGHT_SHIFT; + step[ 4] = (temp1 + INITIAL_ROUNDING) >> INITIAL_SHIFT; + step[ 5] = (temp1 + INITIAL_ROUNDING) >> INITIAL_SHIFT; + + // for odd input + temp1 = (input[3] * C12 + RIGHT_ROUNDING) >> RIGHT_SHIFT; + temp1 *= C8; + intermediate[ 8] = (2 * (temp1) + RIGHT_ROUNDING) >> RIGHT_SHIFT; + + temp1 = (-input[3] * C4 + RIGHT_ROUNDING) >> RIGHT_SHIFT; + temp1 *= C8; + intermediate[ 9] = (2 * (temp1) + RIGHT_ROUNDING) >> RIGHT_SHIFT; + + step[ 8] = (intermediate[ 8] + INITIAL_ROUNDING) >> INITIAL_SHIFT; + step[ 9] = (intermediate[ 9] + INITIAL_ROUNDING) >> INITIAL_SHIFT; + step[10] = (-input[1] + INITIAL_ROUNDING) >> INITIAL_SHIFT; + step[11] = (input[1] + INITIAL_ROUNDING) >> INITIAL_SHIFT; + step[12] = (input[1] + INITIAL_ROUNDING) >> INITIAL_SHIFT; + step[13] = (input[1] + INITIAL_ROUNDING) >> INITIAL_SHIFT; + step[14] = (intermediate[ 8] + INITIAL_ROUNDING) >> INITIAL_SHIFT; + step[15] = (intermediate[ 9] + INITIAL_ROUNDING) >> INITIAL_SHIFT; + + // step 3 + output[0] = step[ 0]; + output[1] = step[ 1]; + output[2] = step[ 1]; + output[3] = step[ 0]; + + temp1 = step[ 4] * C14; + output[4] = (temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT; + + temp1 = step[ 4] * C2; + output[7] = (temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT; + + temp1 = step[ 5] * C10; + output[5] = (temp1 + 
RIGHT_ROUNDING) >> RIGHT_SHIFT; + + temp1 = step[ 5] * C6; + output[6] = (temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT; + + output[8] = step[ 8] + step[11]; + output[9] = step[ 9] + step[10]; + output[10] = step[ 9] - step[10]; + output[11] = step[ 8] - step[11]; + output[12] = step[12] + step[15]; + output[13] = step[13] + step[14]; + output[14] = step[13] - step[14]; + output[15] = step[12] - step[15]; + + // output 4 + step[ 0] = output[0] + output[7]; + step[ 1] = output[1] + output[6]; + step[ 2] = output[2] + output[5]; + step[ 3] = output[3] + output[4]; + step[ 4] = output[3] - output[4]; + step[ 5] = output[2] - output[5]; + step[ 6] = output[1] - output[6]; + step[ 7] = output[0] - output[7]; + + temp1 = output[8] * C7; + temp2 = output[15] * C9; + step[ 8] = (temp1 - temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; + + temp1 = output[9] * C11; + temp2 = output[14] * C5; + step[ 9] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; + + temp1 = output[10] * C3; + temp2 = output[13] * C13; + step[10] = (temp1 - temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; + + temp1 = output[11] * C15; + temp2 = output[12] * C1; + step[11] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; + + temp1 = output[11] * C1; + temp2 = output[12] * C15; + step[12] = (temp2 - temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT; + + temp1 = output[10] * C13; + temp2 = output[13] * C3; + step[13] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; + + temp1 = output[9] * C5; + temp2 = output[14] * C11; + step[14] = (temp2 - temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT; + + temp1 = output[8] * C9; + temp2 = output[15] * C7; + step[15] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; + + // step 5 + output[0] = (step[0] + step[15] + last_rounding) >> last_shift_bits; + output[1] = (step[1] + step[14] + last_rounding) >> last_shift_bits; + output[2] = (step[2] + step[13] + last_rounding) >> last_shift_bits; + output[3] = (step[3] + step[12] + last_rounding) >> last_shift_bits; + output[4] = (step[4] + step[11] + 
last_rounding) >> last_shift_bits; + output[5] = (step[5] + step[10] + last_rounding) >> last_shift_bits; + output[6] = (step[6] + step[ 9] + last_rounding) >> last_shift_bits; + output[7] = (step[7] + step[ 8] + last_rounding) >> last_shift_bits; + + output[15] = (step[0] - step[15] + last_rounding) >> last_shift_bits; + output[14] = (step[1] - step[14] + last_rounding) >> last_shift_bits; + output[13] = (step[2] - step[13] + last_rounding) >> last_shift_bits; + output[12] = (step[3] - step[12] + last_rounding) >> last_shift_bits; + output[11] = (step[4] - step[11] + last_rounding) >> last_shift_bits; + output[10] = (step[5] - step[10] + last_rounding) >> last_shift_bits; + output[9] = (step[6] - step[ 9] + last_rounding) >> last_shift_bits; + output[8] = (step[7] - step[ 8] + last_rounding) >> last_shift_bits; +} + +void vp9_short_idct10_16x16_c(int16_t *input, int16_t *output, int pitch) { + int16_t out[16 * 16]; + int16_t *outptr = &out[0]; + const int short_pitch = pitch >> 1; + int i, j; + int16_t temp_in[16], temp_out[16]; + + /* First transform rows. Since all non-zero dct coefficients are in + * upper-left 4x4 area, we only need to calculate first 4 rows here. 
+ */ + vpx_memset(out, 0, sizeof(out)); + for (i = 0; i < 4; ++i) { + butterfly_16x16_idct10_1d(input, outptr, 0); + input += short_pitch; + outptr += 16; + } + + // Then transform columns + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) + temp_in[j] = out[j*16 + i]; + butterfly_16x16_idct10_1d(temp_in, temp_out, 3); + for (j = 0; j < 16; ++j) + output[j*16 + i] = temp_out[j]; + } +} #undef INITIAL_SHIFT #undef INITIAL_ROUNDING #undef RIGHT_SHIFT diff --git a/vp9/common/rtcd_defs.sh b/vp9/common/rtcd_defs.sh index 3ad845b..2f1edee 100644 --- a/vp9/common/rtcd_defs.sh +++ b/vp9/common/rtcd_defs.sh @@ -54,7 +54,7 @@ specialize vp9_dequant_idct_add_y_block_8x8 prototype void vp9_dequant_idct_add_uv_block_8x8 "short *q, short *dq, unsigned char *pre, unsigned char *dstu, unsigned char *dstv, int stride, unsigned short *eobs, struct macroblockd *xd" specialize vp9_dequant_idct_add_uv_block_8x8 -prototype void vp9_dequant_idct_add_16x16 "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride" +prototype void vp9_dequant_idct_add_16x16 "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, unsigned short eobs" specialize vp9_dequant_idct_add_16x16 prototype void vp9_dequant_idct_add_8x8 "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride" diff --git a/vp9/decoder/decodframe.c b/vp9/decoder/decodframe.c index 60bf7ba..9826f85 100644 --- a/vp9/decoder/decodframe.c +++ b/vp9/decoder/decodframe.c @@ -401,7 +401,7 @@ static void decode_macroblock(VP9D_COMP *pbi, MACROBLOCKD *xd, } else { vp9_dequant_idct_add_16x16(xd->qcoeff, xd->block[0].dequant, xd->predictor, xd->dst.y_buffer, - 16, xd->dst.y_stride); + 16, xd->dst.y_stride, xd->eobs[0]); } } else if (tx_size == TX_8X8) { #if CONFIG_SUPERBLOCKS diff --git a/vp9/decoder/dequantize.c b/vp9/decoder/dequantize.c index d11fe38..e564262 100644 --- a/vp9/decoder/dequantize.c +++ b/vp9/decoder/dequantize.c @@ -19,6 
/* Reconstruct a block of pixels: add a residual to a prediction and store
 * the sum clamped to the 8-bit range [0, 255].
 *
 * diff   - residual values, stored contiguously (rows are `width` apart).
 * pred   - prediction pixels; rows are `pitch` bytes apart.
 * pitch  - row stride of pred.
 * dest   - destination pixels; rows are `stride` bytes apart.
 * stride - row stride of dest.
 * width  - number of columns processed per row.
 * height - number of rows processed.
 *
 * diff and pred are only read, hence const-qualified.  Nothing is written
 * when width or height is not positive.
 */
static void recon(const int16_t *diff, const uint8_t *pred, int pitch,
                  uint8_t *dest, int stride, int width, int height) {
  int r, c;

  for (r = 0; r < height; r++) {
    for (c = 0; c < width; c++) {
      int a = diff[c] + pred[c];

      /* Saturate to the representable pixel range. */
      if (a < 0)
        a = 0;
      else if (a > 255)
        a = 255;

      dest[c] = (uint8_t) a;
    }

    /* The three buffers advance with three different row strides. */
    dest += stride;
    diff += width;
    pred += pitch;
  }
}
*diff_ptr = output; - int r, c; input[0] = (short)Dc; @@ -165,23 +152,7 @@ void vp9_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred, vpx_memset(input, 0, 32); - for (r = 0; r < 4; r++) { - for (c = 0; c < 4; c++) { - int a = diff_ptr[c] + pred[c]; - - if (a < 0) - a = 0; - - if (a > 255) - a = 255; - - dest[c] = (unsigned char) a; - } - - dest += stride; - diff_ptr += 4; - pred += pitch; - } + recon(diff_ptr, pred, pitch, dest, stride, 4, 4); } #if CONFIG_LOSSLESS @@ -190,7 +161,6 @@ void vp9_dequant_idct_add_lossless_c(short *input, short *dq, int pitch, int stride) { short output[16]; short *diff_ptr = output; - int r, c; int i; for (i = 0; i < 16; i++) { @@ -201,23 +171,7 @@ void vp9_dequant_idct_add_lossless_c(short *input, short *dq, vpx_memset(input, 0, 32); - for (r = 0; r < 4; r++) { - for (c = 0; c < 4; c++) { - int a = diff_ptr[c] + pred[c]; - - if (a < 0) - a = 0; - - if (a > 255) - a = 255; - - dest[c] = (unsigned char) a; - } - - dest += stride; - diff_ptr += 4; - pred += pitch; - } + recon(diff_ptr, pred, pitch, dest, stride, 4, 4); } void vp9_dequant_dc_idct_add_lossless_c(short *input, short *dq, @@ -227,7 +181,6 @@ void vp9_dequant_dc_idct_add_lossless_c(short *input, short *dq, int i; short output[16]; short *diff_ptr = output; - int r, c; input[0] = (short)dc; @@ -238,23 +191,7 @@ void vp9_dequant_dc_idct_add_lossless_c(short *input, short *dq, vp9_short_inv_walsh4x4_x8_c(input, output, 4 << 1); vpx_memset(input, 0, 32); - for (r = 0; r < 4; r++) { - for (c = 0; c < 4; c++) { - int a = diff_ptr[c] + pred[c]; - - if (a < 0) - a = 0; - - if (a > 255) - a = 255; - - dest[c] = (unsigned char) a; - } - - dest += stride; - diff_ptr += 4; - pred += pitch; - } + recon(diff_ptr, pred, pitch, dest, stride, 4, 4); } #endif @@ -461,7 +398,7 @@ void vp9_ht_dequant_idct_add_16x16_c(TX_TYPE tx_type, short *input, short *dq, int pitch, int stride) { short output[256]; short *diff_ptr = output; - int r, c, i; + int i; input[0]= input[0] * 
/* Dequantize a 16x16 coefficient block, inverse-transform it, add the result
 * to the prediction and store the clamped pixels in dest.
 *
 * input  - quantized coefficients; the entries that were used are reset to
 *          zero before returning.
 * dq     - dequantization factors: dq[0] scales input[0], dq[1] scales every
 *          other coefficient.
 * pred   - prediction pixels, rows `pitch` bytes apart.
 * dest   - destination pixels, rows `stride` bytes apart.
 * eobs   - selects how much work is needed:
 *            0      all coefficients are zero, the prediction is copied;
 *            1      DC only, a single constant is added to the prediction;
 *            2..10  reduced inverse transform on ten low-frequency positions;
 *            other  full 16x16 inverse transform.
 */
void vp9_dequant_idct_add_16x16_c(int16_t *input, int16_t *dq, uint8_t *pred,
                                  uint8_t *dest, int pitch, int stride,
                                  uint16_t eobs) {
  /* Offsets (row * 16 + column) of the non-DC coefficients that are
   * dequantized and cleared on the reduced (eobs <= 10) path.
   */
  static const int low_freq_ac[] = { 1, 2, 3, 16, 17, 18, 32, 33, 48 };
  const int low_freq_count = (int)(sizeof(low_freq_ac) / sizeof(low_freq_ac[0]));
  int16_t residual[256];
  int row, col, k;

  if (eobs == 0) {
    /* Nothing to add: the prediction is the reconstruction. */
    vp9_copy_mem16x16(pred, pitch, dest, stride);
    return;
  }

  if (eobs == 1) {
    /* DC only: apply the three rounding shifts to the single dequantized
     * coefficient and add the resulting constant to every pixel.
     */
    int16_t dc;

    dc = (input[0] * dq[0] + 2) >> 2;
    dc = (dc + 2) >> 2;
    dc = (dc + 4) >> 3;

    input[0] = 0;

    for (row = 0; row < 16; ++row) {
      for (col = 0; col < 16; ++col) {
        const int sum = dc + pred[col];

        dest[col] = (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
      }

      dest += stride;
      pred += pitch;
    }
    return;
  }

  /* Both remaining paths scale the DC coefficient with dq[0]. */
  input[0] = input[0] * dq[0];

  if (eobs <= 10) {
    for (k = 0; k < low_freq_count; ++k)
      input[low_freq_ac[k]] = input[low_freq_ac[k]] * dq[1];

    /* the idct halves ( >> 1) the pitch */
    vp9_short_idct10_16x16_c(input, residual, 32);

    /* Clear only the coefficients touched above. */
    input[0] = 0;
    for (k = 0; k < low_freq_count; ++k)
      input[low_freq_ac[k]] = 0;
  } else {
    for (k = 1; k < 256; ++k)
      input[k] = input[k] * dq[1];

    /* the idct halves ( >> 1) the pitch */
    vp9_short_idct16x16_c(input, residual, 32);

    vpx_memset(input, 0, 512);
  }

  recon(residual, pred, pitch, dest, stride, 16, 16);
}