From 8e3d0e4d7db867caa110e96fa0fd1ff9ba37cb9f Mon Sep 17 00:00:00 2001
From: Jingning Han
Date: Wed, 15 May 2013 12:19:59 -0700
Subject: [PATCH] Add building blocks for 4x8/8x4 rd search

These building blocks enable rate-distortion optimization search over
block sizes of 8x4 and 4x8. They still need to be converted into
mmx/sse form.

Change-Id: I570ea2d22d14ceec3fe3575128d7dfa172a577de
---
 vp9/common/vp9_blockd.h      |  2 +-
 vp9/common/vp9_rtcd_defs.sh  | 27 ++++++++++++
 vp9/encoder/vp9_onyx_if.c    | 12 ++++--
 vp9/encoder/vp9_rdopt.c      | 97 ++++++++++++++++++++++++--------------------
 vp9/encoder/vp9_sad_c.c      | 45 ++++++++++++++++++++
 vp9/encoder/vp9_variance_c.c | 88 ++++++++++++++++++++++++++++++++++++++++
 6 files changed, 221 insertions(+), 50 deletions(-)

diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index ad38730..b148b18 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -272,7 +272,7 @@ typedef struct {
 
 typedef struct {
   MB_MODE_INFO mbmi;
-  union b_mode_info bmi[4];
+  union b_mode_info bmi[16];
 } MODE_INFO;
 
 struct scale_factors {
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 48ce7db..b55cc74 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -367,6 +367,19 @@ vp9_sub_pixel_variance8x8_sse2=vp9_sub_pixel_variance8x8_wmt
 prototype unsigned int vp9_sub_pixel_avg_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
 specialize vp9_sub_pixel_avg_variance8x8
 
+# TODO(jingning): need to convert 8x4/4x8 functions into mmx/sse form
+prototype unsigned int vp9_sub_pixel_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_sub_pixel_variance8x4
+
+prototype unsigned int vp9_sub_pixel_avg_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance8x4
+
+prototype unsigned int vp9_sub_pixel_variance4x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_sub_pixel_variance4x8
+
+prototype unsigned int vp9_sub_pixel_avg_variance4x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance4x8
+
 prototype unsigned int vp9_sub_pixel_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance4x4 sse2 mmx
 vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt
@@ -404,6 +417,13 @@ specialize vp9_sad8x16 mmx sse2
 prototype unsigned int vp9_sad8x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
 specialize vp9_sad8x8 mmx sse2
 
+# TODO(jingning): need to convert these functions into mmx/sse2 form
+prototype unsigned int vp9_sad8x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
+specialize vp9_sad8x4
+
+prototype unsigned int vp9_sad4x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
+specialize vp9_sad4x8
+
 prototype unsigned int vp9_sad4x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
 specialize vp9_sad4x4 mmx sse
 
@@ -509,6 +529,13 @@ specialize vp9_sad8x16x4d sse2
 prototype void vp9_sad8x8x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"
 specialize vp9_sad8x8x4d sse2
 
+# TODO(jingning): need to convert these 4x8/8x4 functions into sse2 form
+prototype void vp9_sad8x4x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"
+specialize vp9_sad8x4x4d
+
+prototype void vp9_sad4x8x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"
+specialize vp9_sad4x8x4d
+
 prototype void vp9_sad4x4x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"
 specialize vp9_sad4x4x4d sse
 prototype unsigned int vp9_sub_pixel_mse16x16 "const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse"
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index 67d1b67..2d3fea9 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -1597,11 +1597,15 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
       vp9_sub_pixel_avg_variance8x8, NULL, NULL, NULL,
       vp9_sad8x8x3, vp9_sad8x8x8, vp9_sad8x8x4d)
 
-  BFP(BLOCK_4X8, NULL, vp9_variance4x8, NULL,
-      NULL, NULL, NULL, NULL, NULL, NULL, NULL)
+  BFP(BLOCK_8X4, vp9_sad8x4, vp9_variance8x4, vp9_sub_pixel_variance8x4,
+      vp9_sub_pixel_avg_variance8x4, NULL, NULL,
+      NULL, NULL, NULL,
+      vp9_sad8x4x4d)
 
-  BFP(BLOCK_8X4, NULL, vp9_variance8x4, NULL,
-      NULL, NULL, NULL, NULL, NULL, NULL, NULL)
+  BFP(BLOCK_4X8, vp9_sad4x8, vp9_variance4x8, vp9_sub_pixel_variance4x8,
+      vp9_sub_pixel_avg_variance4x8, NULL, NULL,
+      NULL, NULL, NULL,
+      vp9_sad4x8x4d)
 
   BFP(BLOCK_4X4, vp9_sad4x4, vp9_variance4x4, vp9_sub_pixel_variance4x4,
       vp9_sub_pixel_avg_variance4x4, NULL, NULL, NULL,
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 5097664..f928e7a 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -1096,6 +1096,50 @@ static INLINE int mv_check_bounds(MACROBLOCK *x, int_mv *mv) {
   return r;
 }
 
+static enum BlockSize get_block_size(int bw, int bh) {
+  if (bw == 4 && bh == 4)
+    return BLOCK_4X4;
+
+  if (bw == 4 && bh == 8)
+    return BLOCK_4X8;
+
+  if (bw == 8 && bh == 4)
+    return BLOCK_8X4;
+
+  if (bw == 8 && bh == 8)
+    return BLOCK_8X8;
+
+  if (bw == 8 && bh == 16)
+    return BLOCK_8X16;
+
+  if (bw == 16 && bh == 8)
+    return BLOCK_16X8;
+
+  if (bw == 16 && bh == 16)
+    return BLOCK_16X16;
+
+  if (bw == 32 && bh == 32)
+    return BLOCK_32X32;
+
+  if (bw == 32 && bh == 16)
+    return BLOCK_32X16;
+
+  if (bw == 16 && bh == 32)
+    return BLOCK_16X32;
+
+  if (bw == 64 && bh == 32)
+    return BLOCK_64X32;
+
+  if (bw == 32 && bh == 64)
+    return BLOCK_32X64;
+
+  if (bw == 64 && bh == 64)
+    return BLOCK_64X64;
+
+  assert(0);
+  return -1;
+}
+
 static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
                                     BEST_SEG_INFO *bsi,
                                     int_mv seg_mvs[4][MAX_REF_FRAMES - 1]) {
@@ -1111,6 +1155,10 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
   int sbr = 0, sbd = 0;
   int segmentyrate = 0;
   int best_eobs[4] = { 0 };
+#if CONFIG_AB4X4
+  BLOCK_SIZE_TYPE bsize = mbmi->sb_type;
+  int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
+#endif
 
   vp9_variance_fn_ptr_t *v_fn_ptr;
 
@@ -1120,7 +1168,11 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
   vpx_memcpy(t_above, x->e_mbd.plane[0].above_context, sizeof(t_above));
   vpx_memcpy(t_left, x->e_mbd.plane[0].left_context, sizeof(t_left));
 
+#if CONFIG_AB4X4
+  v_fn_ptr = &cpi->fn_ptr[get_block_size(4 << bwl, 4 << bhl)];
+#else
   v_fn_ptr = &cpi->fn_ptr[BLOCK_4X4];
+#endif
 
   // 64 makes this threshold really big effectively
   // making it so that we very rarely check mvs on
@@ -1670,51 +1722,6 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
                       frame_type, block_size);
 }
-
-static enum BlockSize get_block_size(int bw, int bh) {
-  if (bw == 4 && bh == 4)
-    return BLOCK_4X4;
-
-  if (bw == 4 && bh == 8)
-    return BLOCK_4X8;
-
-  if (bw == 8 && bh == 4)
-    return BLOCK_8X4;
-
-  if (bw == 8 && bh == 8)
-    return BLOCK_8X8;
-
-  if (bw == 8 && bh == 16)
-    return BLOCK_8X16;
-
-  if (bw == 16 && bh == 8)
-    return BLOCK_16X8;
-
-  if (bw == 16 && bh == 16)
-    return BLOCK_16X16;
-
-  if (bw == 32 && bh == 32)
-    return BLOCK_32X32;
-
-  if (bw == 32 && bh == 16)
-    return BLOCK_32X16;
-
-  if (bw == 16 && bh == 32)
-    return BLOCK_16X32;
-
-  if (bw == 64 && bh == 32)
-    return BLOCK_64X32;
-
-  if (bw == 32 && bh == 64)
-    return BLOCK_32X64;
-
-  if (bw == 64 && bh == 64)
-    return BLOCK_64X64;
-
-  assert(0);
-  return -1;
-}
-
 static void model_rd_from_var_lapndz(int var, int n, int qstep, int *rate,
                                      int *dist) {
   // This function models the rate and distortion for a Laplacian
diff --git a/vp9/encoder/vp9_sad_c.c b/vp9/encoder/vp9_sad_c.c
index b4cd193..994828f 100644
--- a/vp9/encoder/vp9_sad_c.c
+++ b/vp9/encoder/vp9_sad_c.c
@@ -156,6 +156,21 @@ unsigned int vp9_sad8x16_c(const uint8_t *src_ptr,
   return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 16);
 }
 
+unsigned int vp9_sad8x4_c(const uint8_t *src_ptr,
+                          int src_stride,
+                          const uint8_t *ref_ptr,
+                          int ref_stride,
+                          unsigned int max_sad) {
+  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 4);
+}
+
+unsigned int vp9_sad4x8_c(const uint8_t *src_ptr,
+                          int src_stride,
+                          const uint8_t *ref_ptr,
+                          int ref_stride,
+                          unsigned int max_sad) {
+  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 4, 8);
+}
 
 unsigned int vp9_sad4x4_c(const uint8_t *src_ptr,
                           int src_stride,
@@ -563,6 +578,36 @@ void vp9_sad8x16x4d_c(const uint8_t *src_ptr,
                        ref_ptr[3], ref_stride, 0x7fffffff);
 }
 
+void vp9_sad8x4x4d_c(const uint8_t *src_ptr,
+                     int src_stride,
+                     const uint8_t* const ref_ptr[],
+                     int ref_stride,
+                     unsigned int *sad_array) {
+  sad_array[0] = vp9_sad8x4(src_ptr, src_stride,
+                            ref_ptr[0], ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad8x4(src_ptr, src_stride,
+                            ref_ptr[1], ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad8x4(src_ptr, src_stride,
+                            ref_ptr[2], ref_stride, 0x7fffffff);
+  sad_array[3] = vp9_sad8x4(src_ptr, src_stride,
+                            ref_ptr[3], ref_stride, 0x7fffffff);
+}
+
+void vp9_sad4x8x4d_c(const uint8_t *src_ptr,
+                     int src_stride,
+                     const uint8_t* const ref_ptr[],
+                     int ref_stride,
+                     unsigned int *sad_array) {
+  sad_array[0] = vp9_sad4x8(src_ptr, src_stride,
+                            ref_ptr[0], ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad4x8(src_ptr, src_stride,
+                            ref_ptr[1], ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad4x8(src_ptr, src_stride,
+                            ref_ptr[2], ref_stride, 0x7fffffff);
+  sad_array[3] = vp9_sad4x8(src_ptr, src_stride,
+                            ref_ptr[3], ref_stride, 0x7fffffff);
+}
+
 void vp9_sad4x4x4d_c(const uint8_t *src_ptr,
                      int src_stride,
                      const uint8_t* const ref_ptr[],
diff --git a/vp9/encoder/vp9_variance_c.c b/vp9/encoder/vp9_variance_c.c
index fa53abd..e24a46b 100644
--- a/vp9/encoder/vp9_variance_c.c
+++ b/vp9/encoder/vp9_variance_c.c
@@ -820,3 +820,91 @@ unsigned int vp9_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr,
   comp_avg_pred(temp3, second_pred, 8, 16, temp2, 8);
   return vp9_variance8x16_c(temp3, 8, dst_ptr, dst_pixels_per_line, sse);
 }
+
+unsigned int vp9_sub_pixel_variance8x4_c(const uint8_t *src_ptr,
+                                         int src_pixels_per_line,
+                                         int xoffset,
+                                         int yoffset,
+                                         const uint8_t *dst_ptr,
+                                         int dst_pixels_per_line,
+                                         unsigned int *sse) {
+  uint16_t fdata3[8 * 5];  // Temp data buffer used in filtering
+  uint8_t temp2[20 * 16];
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 5, 8, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 4, 8, vfilter);
+
+  return vp9_variance8x4_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
+}
+
+unsigned int vp9_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr,
+                                             int src_pixels_per_line,
+                                             int xoffset,
+                                             int yoffset,
+                                             const uint8_t *dst_ptr,
+                                             int dst_pixels_per_line,
+                                             unsigned int *sse,
+                                             const uint8_t *second_pred) {
+  uint16_t fdata3[8 * 5];  // Temp data buffer used in filtering
+  uint8_t temp2[20 * 16];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 8 * 4);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 5, 8, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 4, 8, vfilter);
+  comp_avg_pred(temp3, second_pred, 8, 4, temp2, 8);
+  return vp9_variance8x4_c(temp3, 8, dst_ptr, dst_pixels_per_line, sse);
+}
+
+unsigned int vp9_sub_pixel_variance4x8_c(const uint8_t *src_ptr,
+                                         int src_pixels_per_line,
+                                         int xoffset,
+                                         int yoffset,
+                                         const uint8_t *dst_ptr,
+                                         int dst_pixels_per_line,
+                                         unsigned int *sse) {
+  uint16_t fdata3[5 * 8];  // Temp data buffer used in filtering
+  uint8_t temp2[20 * 16];
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 9, 4, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 8, 4, vfilter);
+
+  return vp9_variance4x8_c(temp2, 4, dst_ptr, dst_pixels_per_line, sse);
+}
+
+unsigned int vp9_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr,
+                                             int src_pixels_per_line,
+                                             int xoffset,
+                                             int yoffset,
+                                             const uint8_t *dst_ptr,
+                                             int dst_pixels_per_line,
+                                             unsigned int *sse,
+                                             const uint8_t *second_pred) {
+  uint16_t fdata3[5 * 8];  // Temp data buffer used in filtering
+  uint8_t temp2[20 * 16];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 4 * 8);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 9, 4, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 8, 4, vfilter);
+  comp_avg_pred(temp3, second_pred, 4, 8, temp2, 4);
+  return vp9_variance4x8_c(temp3, 4, dst_ptr, dst_pixels_per_line, sse);
+}
-- 
2.7.4
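
Note (not part of the patch): the new 8x4/4x8 entry points above are C-only, and the TODOs in vp9_rtcd_defs.sh call for converting them to mmx/sse2. Below is a minimal, untested sketch of how vp9_sad8x4 might look in SSE2 using psadbw; the function name vp9_sad8x4_sse2 follows the rtcd naming convention but is an assumption here, not the eventual libvpx implementation.

#include <emmintrin.h>  /* SSE2 intrinsics */
#include <stdint.h>

/* Hypothetical SSE2 counterpart of the vp9_sad8x4 prototype added above.
 * max_sad is accepted but ignored, matching the behavior of vp9_sad8x4_c. */
unsigned int vp9_sad8x4_sse2(const uint8_t *src_ptr, int src_stride,
                             const uint8_t *ref_ptr, int ref_stride,
                             unsigned int max_sad) {
  __m128i sum = _mm_setzero_si128();
  int row;
  (void)max_sad;

  for (row = 0; row < 4; row += 2) {
    /* Pack two 8-byte rows of source and reference into one register each. */
    const __m128i s = _mm_unpacklo_epi64(
        _mm_loadl_epi64((const __m128i *)src_ptr),
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride)));
    const __m128i r = _mm_unpacklo_epi64(
        _mm_loadl_epi64((const __m128i *)ref_ptr),
        _mm_loadl_epi64((const __m128i *)(ref_ptr + ref_stride)));
    /* psadbw produces one partial SAD per 64-bit half; accumulate them. */
    sum = _mm_add_epi32(sum, _mm_sad_epu8(s, r));
    src_ptr += 2 * src_stride;
    ref_ptr += 2 * ref_stride;
  }
  /* Fold the upper partial sum into the lower one and extract the total. */
  sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
  return (unsigned int)_mm_cvtsi128_si32(sum);
}

An actual conversion would also register the optimized version through vp9_rtcd_defs.sh (e.g. "specialize vp9_sad8x4 sse2") so the run-time dispatcher selects it, as the existing 8x8 and 4x4 entries do.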