From 1d5380787a30e8d37b3c925babaffd2d996ddea4 Mon Sep 17 00:00:00 2001 From: Jingning Han Date: Mon, 9 Jul 2018 11:07:52 -0700 Subject: [PATCH] Add 32x32 Hadamard transform Add 32x32 Hadamard transform in C implementation. Replace the forward 32x32 2D-DCT in tpl model with Hadamard transform. This would reduce the overhead encoding time due to running tpl model by ~3x. Change-Id: I1c743dab786b818d89f14928cc3998d056830aa9 --- vp9/encoder/vp9_encoder.c | 4 ++-- vpx_dsp/avg.c | 31 +++++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 6 ++++++ 3 files changed, 39 insertions(+), 2 deletions(-) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index b17ea9f..71bdb6e 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -5822,7 +5822,7 @@ void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, int frame_idx) { vpx_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, dst_stride); - vpx_fdct32x32(src_diff, coeff, bw); + vpx_hadamard_32x32(src_diff, bw, coeff); intra_cost = vpx_satd(coeff, pix_num); @@ -5879,7 +5879,7 @@ void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, int frame_idx) { this_frame->y_buffer + mb_y_offset, this_frame->y_stride, &predictor[0], bw); #endif - vpx_fdct32x32(src_diff, coeff, bw); + vpx_hadamard_32x32(src_diff, bw, coeff); inter_cost = vpx_satd(coeff, pix_num); diff --git a/vpx_dsp/avg.c b/vpx_dsp/avg.c index a7ac6d9..0936e91 100644 --- a/vpx_dsp/avg.c +++ b/vpx_dsp/avg.c @@ -123,6 +123,37 @@ void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, } } +void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int idx; + for (idx = 0; idx < 4; ++idx) { + // src_diff: 9 bit, dynamic range [-255, 255] + const int16_t *src_ptr = + src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16; + vpx_hadamard_16x16(src_ptr, src_stride, coeff + idx * 256); + } + + // coeff: 15 bit, dynamic range [-16320, 16320] + for (idx = 0; idx < 256; ++idx) { + tran_low_t a0 = coeff[0]; + tran_low_t a1 = coeff[256]; + tran_low_t a2 = coeff[512]; + tran_low_t a3 = coeff[768]; + + tran_low_t b0 = (a0 + a1) >> 2; // (a0 + a1): 16 bit, [-32640, 32640] + tran_low_t b1 = (a0 - a1) >> 2; // b0-b3: 15 bit, dynamic range + tran_low_t b2 = (a2 + a3) >> 2; // [-16320, 16320] + tran_low_t b3 = (a2 - a3) >> 2; + + coeff[0] = b0 + b2; // 16 bit, [-32640, 32640] + coeff[256] = b1 + b3; + coeff[512] = b0 - b2; + coeff[768] = b1 - b3; + + ++coeff; + } +} + // coeff: 16 bits, dynamic range [-32640, 32640]. // length: value range {16, 64, 256, 1024}. int vpx_satd_c(const tran_low_t *coeff, int length) { diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 45acaf3..b662c70 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -782,6 +782,9 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; specialize qw/vpx_hadamard_16x16 avx2 sse2 neon vsx/; + add_proto qw/void vpx_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; + specialize qw/vpx_hadamard_32x32/; + add_proto qw/int vpx_satd/, "const tran_low_t *coeff, int length"; specialize qw/vpx_satd avx2 sse2 neon/; } else { @@ -791,6 +794,9 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff"; specialize qw/vpx_hadamard_16x16 avx2 sse2 neon msa vsx/; + add_proto qw/void vpx_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff"; + specialize qw/vpx_hadamard_32x32/; + add_proto qw/int vpx_satd/, "const int16_t *coeff, int length"; specialize qw/vpx_satd avx2 sse2 neon msa/; } -- 2.7.4