From 57e995ff9c01d8c09af50439c8c88876a234d205 Mon Sep 17 00:00:00 2001 From: Jingning Han Date: Fri, 8 Feb 2013 16:19:42 -0800 Subject: [PATCH] butterfly inverse 4x4 ADST fixed format issues. Implement the inverse 4x4 ADST using 9 multiplications. For this particular dimension, the original ADST transform can be factorized into simpler operations, hence is retained. Change-Id: Ie5d9749942468df299ab74e90d92cd899569e960 --- configure | 1 + vp9/common/vp9_blockd.h | 2 +- vp9/common/vp9_idct.h | 8 ++++ vp9/common/vp9_idctllm.c | 97 ++++++++++++++++++++++++++++++++++++++++++- vp9/common/vp9_invtrans.c | 9 +++- vp9/common/vp9_rtcd_defs.sh | 7 +++- vp9/decoder/vp9_dequantize.c | 6 ++- vp9/encoder/vp9_encodeintra.c | 10 ++++- vp9/encoder/vp9_rdopt.c | 4 ++ 9 files changed, 136 insertions(+), 8 deletions(-) diff --git a/configure b/configure index 0409078..dc44175 100755 --- a/configure +++ b/configure @@ -250,6 +250,7 @@ EXPERIMENT_LIST=" enable_6tap abovesprefmv intht + intht4x4 " CONFIG_LIST=" external_build diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h index 0d51f06..b2c15fc 100644 --- a/vp9/common/vp9_blockd.h +++ b/vp9/common/vp9_blockd.h @@ -408,7 +408,7 @@ typedef struct macroblockd { #define ACTIVE_HT8 300 -#define ACTIVE_HT16 300 +#define ACTIVE_HT16 0 // convert MB_PREDICTION_MODE to B_PREDICTION_MODE static B_PREDICTION_MODE pred_mode_conv(MB_PREDICTION_MODE mode) { diff --git a/vp9/common/vp9_idct.h b/vp9/common/vp9_idct.h index 680a206..01e8ea3 100644 --- a/vp9/common/vp9_idct.h +++ b/vp9/common/vp9_idct.h @@ -50,6 +50,14 @@ static const int cospi_29_64 = 2404; static const int cospi_30_64 = 1606; static const int cospi_31_64 = 804; +#if CONFIG_INTHT4X4 +// 16384 * sqrt(2) * sin(kPi/9) * 2 / 3 +static const int sinpi_1_9 = 5283; +static const int sinpi_2_9 = 9929; +static const int sinpi_3_9 = 13377; +static const int sinpi_4_9 = 15212; +#endif + static INLINE int dct_const_round_shift(int input) { int rv = (input + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; assert((rv <= INT16_MAX) && (rv >= INT16_MIN)); diff --git a/vp9/common/vp9_idctllm.c b/vp9/common/vp9_idctllm.c index b27b34c..2f847dc 100644 --- a/vp9/common/vp9_idctllm.c +++ b/vp9/common/vp9_idctllm.c @@ -494,7 +494,6 @@ void vp9_dc_only_inv_walsh_add_c(short input_dc, uint8_t *pred_ptr, } #endif - void idct4_1d(int16_t *input, int16_t *output) { int16_t step[4]; int temp1, temp2; @@ -651,6 +650,100 @@ void vp9_short_idct8x8_c(int16_t *input, int16_t *output, int pitch) { } } +#if CONFIG_INTHT4X4 +static void iadst4_1d(int16_t *input, int16_t *output) { + int x0, x1, x2, x3; + int s0, s1, s2, s3, s4, s5, s6, s7; + + x0 = input[0]; + x1 = input[1]; + x2 = input[2]; + x3 = input[3]; + + if (!(x0 | x1 | x2 | x3)) { + output[0] = output[1] = output[2] = output[3] = 0; + return; + } + + s0 = sinpi_1_9 * x0; + s1 = sinpi_2_9 * x0; + s2 = sinpi_3_9 * x1; + s3 = sinpi_4_9 * x2; + s4 = sinpi_1_9 * x2; + s5 = sinpi_2_9 * x3; + s6 = sinpi_4_9 * x3; + s7 = x0 - x2 + x3; + + x0 = s0 + s3 + s5; + x1 = s1 - s4 - s6; + x2 = sinpi_3_9 * s7; + x3 = s2; + + s0 = x0 + x3; + s1 = x1 + x3; + s2 = x2; + s3 = x0 + x1 - x3; + + // 1-D transform scaling factor is sqrt(2). + // The overall dynamic range is 14b (input) + 14b (multiplication scaling) + // + 1b (addition) = 29b. + // Hence the output bit depth is 15b. + output[0] = dct_const_round_shift(s0); + output[1] = dct_const_round_shift(s1); + output[2] = dct_const_round_shift(s2); + output[3] = dct_const_round_shift(s3); +} + +void vp9_short_iht4x4_c(int16_t *input, int16_t *output, + int pitch, TX_TYPE tx_type) { + int16_t out[16]; + int16_t *outptr = &out[0]; + const int short_pitch = pitch >> 1; + int i, j; + int16_t temp_in[4], temp_out[4]; + + void (*invr)(int16_t*, int16_t*); + void (*invc)(int16_t*, int16_t*); + + switch (tx_type) { + case ADST_ADST: + invc = &iadst4_1d; + invr = &iadst4_1d; + break; + case ADST_DCT: + invc = &iadst4_1d; + invr = &idct4_1d; + break; + case DCT_ADST: + invc = &idct4_1d; + invr = &iadst4_1d; + break; + case DCT_DCT: + invc = &idct4_1d; + invr = &idct4_1d; + break; + default: + assert(0); + } + + // inverse transform row vectors + for (i = 0; i < 4; ++i) { + invr(input, outptr); + input += 4; + outptr += 4; + } + + // inverse transform column vectors + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) + temp_in[j] = out[j * 4 + i]; + invc(temp_in, temp_out); + for (j = 0; j < 4; ++j) + output[j * short_pitch + i] = (temp_out[j] + 8) >> 4; + } +} +#endif + #if CONFIG_INTHT static void iadst8_1d(int16_t *input, int16_t *output) { int x0, x1, x2, x3, x4, x5, x6, x7; @@ -733,7 +826,7 @@ static void iadst8_1d(int16_t *input, int16_t *output) { } void vp9_short_iht8x8_c(int16_t *input, int16_t *output, - TX_TYPE tx_type, int pitch) { + int pitch, TX_TYPE tx_type) { int16_t out[8 * 8]; int16_t *outptr = &out[0]; const int short_pitch = pitch >> 1; diff --git a/vp9/common/vp9_invtrans.c b/vp9/common/vp9_invtrans.c index c81fe2d..e7cfe20 100644 --- a/vp9/common/vp9_invtrans.c +++ b/vp9/common/vp9_invtrans.c @@ -51,8 +51,13 @@ void vp9_inverse_transform_mby_4x4(MACROBLOCKD *xd) { for (i = 0; i < 16; i++) { TX_TYPE tx_type = get_tx_type_4x4(xd, &xd->block[i]); if (tx_type != DCT_DCT) { +#if CONFIG_INTHT4X4 + vp9_short_iht4x4(xd->block[i].dqcoeff, xd->block[i].diff, + 32, tx_type); +#else vp9_ihtllm(xd->block[i].dqcoeff, xd->block[i].diff, 32, tx_type, 4, xd->block[i].eob); +#endif } else { vp9_inverse_transform_b_4x4(xd, i, 32); } @@ -93,7 +98,7 @@ void vp9_inverse_transform_mby_8x8(MACROBLOCKD *xd) { if (tx_type != DCT_DCT) { #if CONFIG_INTHT vp9_short_iht8x8(xd->block[i].dqcoeff, xd->block[i].diff, - tx_type, 32); + 32, tx_type); #else vp9_ihtllm(xd->block[i].dqcoeff, xd->block[i].diff, 32, tx_type, 8, xd->block[i].eob); @@ -108,7 +113,7 @@ void vp9_inverse_transform_mby_8x8(MACROBLOCKD *xd) { if (tx_type != DCT_DCT) { #if CONFIG_INTHT vp9_short_iht8x8(xd->block[i + 2].dqcoeff, xd->block[i].diff, - tx_type, 32); + 32, tx_type); #else vp9_ihtllm(xd->block[i + 2].dqcoeff, xd->block[i].diff, 32, tx_type, 8, xd->block[i + 2].eob); diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index 4dce0c9..0669892 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -300,10 +300,15 @@ prototype void vp9_short_idct1_32x32 "int16_t *input, int16_t *output" specialize vp9_short_idct1_32x32 #if CONFIG_INTHT -prototype void vp9_short_iht8x8 "int16_t *input, int16_t *output, int tx_type, int pitch" +prototype void vp9_short_iht8x8 "int16_t *input, int16_t *output, int pitch, int tx_type" specialize vp9_short_iht8x8 #endif +#if CONFIG_INTHT4X4 +prototype void vp9_short_iht4x4 "int16_t *input, int16_t *output, int pitch, int tx_type" +specialize vp9_short_iht4x4 +#endif + prototype void vp9_ihtllm "const int16_t *input, int16_t *output, int pitch, int tx_type, int tx_dim, int16_t eobs" specialize vp9_ihtllm diff --git a/vp9/decoder/vp9_dequantize.c b/vp9/decoder/vp9_dequantize.c index 839a918..1f64767 100644 --- a/vp9/decoder/vp9_dequantize.c +++ b/vp9/decoder/vp9_dequantize.c @@ -69,7 +69,11 @@ void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, int16_t *input, input[i] = dq[i] * input[i]; } +#if CONFIG_INTHT4X4 + vp9_short_iht4x4(input, output, 8, tx_type); +#else vp9_ihtllm(input, output, 4 << 1, tx_type, 4, eobs); +#endif vpx_memset(input, 0, 32); @@ -93,7 +97,7 @@ void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, int16_t *input, } #if CONFIG_INTHT - vp9_short_iht8x8(input, output, tx_type, 16); + vp9_short_iht8x8(input, output, 16, tx_type); #else vp9_ihtllm(input, output, 16, tx_type, 8, eobs); #endif diff --git a/vp9/encoder/vp9_encodeintra.c b/vp9/encoder/vp9_encodeintra.c index fa72297..1dd3013 100644 --- a/vp9/encoder/vp9_encodeintra.c +++ b/vp9/encoder/vp9_encodeintra.c @@ -56,7 +56,11 @@ void vp9_encode_intra4x4block(MACROBLOCK *x, int ib) { if (tx_type != DCT_DCT) { vp9_fht(be->src_diff, 32, be->coeff, tx_type, 4); vp9_ht_quantize_b_4x4(be, b, tx_type); +#if CONFIG_INTHT4X4 + vp9_short_iht4x4(b->dqcoeff, b->diff, 32, tx_type); +#else vp9_ihtllm(b->dqcoeff, b->diff, 32, tx_type, 4, b->eob); +#endif } else { x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32); x->quantize_b_4x4(be, b) ; @@ -155,7 +159,7 @@ void vp9_encode_intra8x8(MACROBLOCK *x, int ib) { #if CONFIG_INTHT vp9_short_iht8x8(xd->block[idx].dqcoeff, xd->block[ib].diff, - tx_type, 32); + 32, tx_type); #else vp9_ihtllm(xd->block[idx].dqcoeff, xd->block[ib].diff, 32, tx_type, 8, xd->block[idx].eob); @@ -173,7 +177,11 @@ void vp9_encode_intra8x8(MACROBLOCK *x, int ib) { if (tx_type != DCT_DCT) { vp9_fht_c(be->src_diff, 32, be->coeff, tx_type, 4); vp9_ht_quantize_b_4x4(be, b, tx_type); +#if CONFIG_INTHT4X4 + vp9_short_iht4x4(b->dqcoeff, b->diff, 32, tx_type); +#else vp9_ihtllm(b->dqcoeff, b->diff, 32, tx_type, 4, b->eob); +#endif } else if (!(i & 1) && get_tx_type_4x4(xd, b + 1) == DCT_DCT) { x->vp9_short_fdct8x4(be->src_diff, be->coeff, 32); x->quantize_b_4x4_pair(be, be + 1, b, b + 1); diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 317209b..29893b8 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -1170,7 +1170,11 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, BLOCK *be, // inverse transform if (best_tx_type != DCT_DCT) +#if CONFIG_INTHT4X4 + vp9_short_iht4x4(best_dqcoeff, b->diff, 32, best_tx_type); +#else vp9_ihtllm(best_dqcoeff, b->diff, 32, best_tx_type, 4, b->eob); +#endif else xd->inv_xform4x4_x8(best_dqcoeff, b->diff, 32); -- 2.7.4