From 148eb803bbb3de3a8548445fe50c120ea872a882 Mon Sep 17 00:00:00 2001 From: Yunqing Wang Date: Thu, 4 Apr 2013 09:56:02 -0700 Subject: [PATCH] Optimize the scaling calculation In decoder, the scaling calculation, such as (mv * x_num / x_den), is fairly time-consuming. In this patch, we check if the scaling happens or not at frame level, and then decide which function to call to skip scaling calculation when no scaling is needed. Tests showed a 3% decoder performance gain. Change-Id: I270901dd0331048e50368cfd51ce273dd82b8733 --- vp9/common/vp9_blockd.h | 11 +++ vp9/common/vp9_reconinter.c | 181 +++++++++++++++++++------------------------- vp9/common/vp9_reconinter.h | 78 ++++++++++++++++++- 3 files changed, 164 insertions(+), 106 deletions(-) diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h index b971662..6b90b05 100644 --- a/vp9/common/vp9_blockd.h +++ b/vp9/common/vp9_blockd.h @@ -322,6 +322,17 @@ struct scale_factors { int y_den; int y_offset_q4; int y_step_q4; + + int (*scale_value_x)(int val, const struct scale_factors *scale); + int (*scale_value_y)(int val, const struct scale_factors *scale); + void (*set_scaled_offsets)(struct scale_factors *scale, int row, int col); + int_mv32 (*scale_motion_vector_q3_to_q4)(const int_mv *src_mv, + const struct scale_factors *scale); + int32_t (*scale_motion_vector_component_q4)(int mv_q4, + int num, + int den, + int offset_q4); + #if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT convolve_fn_t predict[2][2][8]; // horiz, vert, weight (0 - 7) #else diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c index ee378d2..7167811 100644 --- a/vp9/common/vp9_reconinter.c +++ b/vp9/common/vp9_reconinter.c @@ -33,6 +33,24 @@ void vp9_setup_scale_factors_for_frame(struct scale_factors *scale, scale->y_offset_q4 = 0; // calculated per-mb scale->y_step_q4 = 16 * other_h / this_h; + if (scale->x_num == scale->x_den && scale->y_num == scale->y_den) { + scale->scale_value_x = unscaled_value; + scale->scale_value_y = unscaled_value; + scale->set_scaled_offsets = set_offsets_without_scaling; + scale->scale_motion_vector_q3_to_q4 = + motion_vector_q3_to_q4_without_scaling; + scale->scale_motion_vector_component_q4 = + motion_vector_component_q4_without_scaling; + } else { + scale->scale_value_x = scale_value_x_with_scaling; + scale->scale_value_y = scale_value_y_with_scaling; + scale->set_scaled_offsets = set_offsets_with_scaling; + scale->scale_motion_vector_q3_to_q4 = + motion_vector_q3_to_q4_with_scaling; + scale->scale_motion_vector_component_q4 = + motion_vector_component_q4_with_scaling; + } + // TODO(agrange): Investigate the best choice of functions to use here // for EIGHTTAP_SMOOTH. Since it is not interpolating, need to choose what // to do at full-pel offsets. The current selection, where the filter is @@ -325,60 +343,13 @@ void vp9_copy_mem8x4_c(const uint8_t *src, } } -static void set_scaled_offsets(struct scale_factors *scale, - int row, int col) { - const int x_q4 = 16 * col; - const int y_q4 = 16 * row; - - scale->x_offset_q4 = (x_q4 * scale->x_num / scale->x_den) & 0xf; - scale->y_offset_q4 = (y_q4 * scale->y_num / scale->y_den) & 0xf; -} - -static int32_t scale_motion_vector_component_q3(int mv_q3, - int num, - int den, - int offset_q4) { - // returns the scaled and offset value of the mv component. - const int32_t mv_q4 = mv_q3 << 1; - - /* TODO(jkoleszar): make fixed point, or as a second multiply? 
*/ - return mv_q4 * num / den + offset_q4; -} - -static int32_t scale_motion_vector_component_q4(int mv_q4, - int num, - int den, - int offset_q4) { - // returns the scaled and offset value of the mv component. - - /* TODO(jkoleszar): make fixed point, or as a second multiply? */ - return mv_q4 * num / den + offset_q4; -} - -static int_mv32 scale_motion_vector_q3_to_q4( - const int_mv *src_mv, - const struct scale_factors *scale) { - // returns mv * scale + offset - int_mv32 result; - - result.as_mv.row = scale_motion_vector_component_q3(src_mv->as_mv.row, - scale->y_num, - scale->y_den, - scale->y_offset_q4); - result.as_mv.col = scale_motion_vector_component_q3(src_mv->as_mv.col, - scale->x_num, - scale->x_den, - scale->x_offset_q4); - return result; -} - void vp9_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int_mv *mv_q3, const struct scale_factors *scale, int w, int h, int weight, const struct subpix_fn_table *subpix) { - int_mv32 mv = scale_motion_vector_q3_to_q4(mv_q3, scale); + int_mv32 mv = scale->scale_motion_vector_q3_to_q4(mv_q3, scale); src += (mv.as_mv.row >> 4) * src_stride + (mv.as_mv.col >> 4); scale->predict[!!(mv.as_mv.col & 15)][!!(mv.as_mv.row & 15)][weight]( src, src_stride, dst, dst_stride, @@ -402,11 +373,11 @@ void vp9_build_inter_predictor_q4(const uint8_t *src, int src_stride, const int mv_col_q4 = ((fullpel_mv_q3->as_mv.col >> 3) << 4) + (frac_mv_q4->as_mv.col & 0xf); const int scaled_mv_row_q4 = - scale_motion_vector_component_q4(mv_row_q4, scale->y_num, scale->y_den, - scale->y_offset_q4); + scale->scale_motion_vector_component_q4(mv_row_q4, scale->y_num, + scale->y_den, scale->y_offset_q4); const int scaled_mv_col_q4 = - scale_motion_vector_component_q4(mv_col_q4, scale->x_num, scale->x_den, - scale->x_offset_q4); + scale->scale_motion_vector_component_q4(mv_col_q4, scale->x_num, + scale->x_den, scale->x_offset_q4); const int subpel_x = scaled_mv_col_q4 & 15; const int subpel_y = scaled_mv_row_q4 & 15; @@ -419,17 +390,19 @@ void vp9_build_inter_predictor_q4(const uint8_t *src, int src_stride, } static void build_2x1_inter_predictor_wh(const BLOCKD *d0, const BLOCKD *d1, - struct scale_factors *scale, + struct scale_factors *s, uint8_t *predictor, int block_size, int stride, int which_mv, int weight, int width, int height, const struct subpix_fn_table *subpix, int row, int col) { + struct scale_factors * scale = &s[which_mv]; + assert(d1->predictor - d0->predictor == block_size); assert(d1->pre == d0->pre + block_size); - set_scaled_offsets(&scale[which_mv], row, col); + scale->set_scaled_offsets(scale, row, col); if (d0->bmi.as_mv[which_mv].as_int == d1->bmi.as_mv[which_mv].as_int) { uint8_t **base_pre = which_mv ? d0->base_second_pre : d0->base_pre; @@ -438,7 +411,7 @@ static void build_2x1_inter_predictor_wh(const BLOCKD *d0, const BLOCKD *d1, d0->pre_stride, predictor, stride, &d0->bmi.as_mv[which_mv], - &scale[which_mv], + scale, width, height, weight, subpix); @@ -450,37 +423,39 @@ static void build_2x1_inter_predictor_wh(const BLOCKD *d0, const BLOCKD *d1, d0->pre_stride, predictor, stride, &d0->bmi.as_mv[which_mv], - &scale[which_mv], + scale, width > block_size ? 
block_size : width, height, weight, subpix); if (width <= block_size) return; - set_scaled_offsets(&scale[which_mv], row, col + block_size); + scale->set_scaled_offsets(scale, row, col + block_size); vp9_build_inter_predictor(*base_pre1 + d1->pre, d1->pre_stride, predictor + block_size, stride, &d1->bmi.as_mv[which_mv], - &scale[which_mv], + scale, width - block_size, height, weight, subpix); } } static void build_2x1_inter_predictor(const BLOCKD *d0, const BLOCKD *d1, - struct scale_factors *scale, + struct scale_factors *s, int block_size, int stride, int which_mv, int weight, const struct subpix_fn_table *subpix, int row, int col, int use_dst) { uint8_t *d0_predictor = use_dst ? *(d0->base_dst) + d0->dst : d0->predictor; uint8_t *d1_predictor = use_dst ? *(d1->base_dst) + d1->dst : d1->predictor; + struct scale_factors * scale = &s[which_mv]; stride = use_dst ? d0->dst_stride : stride; + assert(d1_predictor - d0_predictor == block_size); assert(d1->pre == d0->pre + block_size); - set_scaled_offsets(&scale[which_mv], row, col); + scale->set_scaled_offsets(scale, row, col); if (d0->bmi.as_mv[which_mv].as_int == d1->bmi.as_mv[which_mv].as_int) { uint8_t **base_pre = which_mv ? d0->base_second_pre : d0->base_pre; @@ -489,7 +464,7 @@ static void build_2x1_inter_predictor(const BLOCKD *d0, const BLOCKD *d1, d0->pre_stride, d0_predictor, stride, &d0->bmi.as_mv[which_mv], - &scale[which_mv], + scale, 2 * block_size, block_size, weight, subpix); } else { @@ -500,17 +475,17 @@ static void build_2x1_inter_predictor(const BLOCKD *d0, const BLOCKD *d1, d0->pre_stride, d0_predictor, stride, &d0->bmi.as_mv[which_mv], - &scale[which_mv], + scale, block_size, block_size, weight, subpix); - set_scaled_offsets(&scale[which_mv], row, col + block_size); + scale->set_scaled_offsets(scale, row, col + block_size); vp9_build_inter_predictor(*base_pre1 + d1->pre, d1->pre_stride, d1_predictor, stride, &d1->bmi.as_mv[which_mv], - &scale[which_mv], + scale, block_size, block_size, weight, subpix); } @@ -774,6 +749,7 @@ static int get_implicit_compoundinter_weight(MACROBLOCKD *xd, int weight; int edge[4]; int block_size = 16 << xd->mode_info_context->mbmi.sb_type; + struct scale_factors *scale; if (!use_second_ref) return 0; if (!(xd->up_available || xd->left_available)) @@ -789,17 +765,17 @@ static int get_implicit_compoundinter_weight(MACROBLOCKD *xd, pre_stride = xd->second_pre.y_stride; ymv.as_int = xd->mode_info_context->mbmi.mv[1].as_int; // First generate the second predictor + scale = &xd->scale_factor[1]; for (n = 0; n < block_size; n += 16) { xd->mb_to_left_edge = edge[2] - (n << 3); xd->mb_to_right_edge = edge[3] + ((16 - n) << 3); if (clamp_mvs) clamp_mv_to_umv_border(&ymv.as_mv, xd); - set_scaled_offsets(&xd->scale_factor[1], mb_row * 16, mb_col * 16 + n); + scale->set_scaled_offsets(scale, mb_row * 16, mb_col * 16 + n); // predict a single row of pixels - vp9_build_inter_predictor( - base_pre + scaled_buffer_offset(n, 0, pre_stride, &xd->scale_factor[1]), - pre_stride, tmp_y + n, tmp_ystride, &ymv, &xd->scale_factor[1], - 16, 1, 0, &xd->subpix); + vp9_build_inter_predictor(base_pre + + scaled_buffer_offset(n, 0, pre_stride, scale), + pre_stride, tmp_y + n, tmp_ystride, &ymv, scale, 16, 1, 0, &xd->subpix); } xd->mb_to_left_edge = edge[2]; xd->mb_to_right_edge = edge[3]; @@ -808,12 +784,12 @@ static int get_implicit_compoundinter_weight(MACROBLOCKD *xd, xd->mb_to_bottom_edge = edge[1] + ((16 - n) << 3); if (clamp_mvs) clamp_mv_to_umv_border(&ymv.as_mv, xd); - set_scaled_offsets(&xd->scale_factor[1], 
mb_row * 16 + n, mb_col * 16); + scale->set_scaled_offsets(scale, mb_row * 16 + n, mb_col * 16); // predict a single col of pixels - vp9_build_inter_predictor( - base_pre + scaled_buffer_offset(0, n, pre_stride, &xd->scale_factor[1]), + vp9_build_inter_predictor(base_pre + + scaled_buffer_offset(0, n, pre_stride, scale), pre_stride, tmp_y + n * tmp_ystride, tmp_ystride, &ymv, - &xd->scale_factor[1], 1, 16, 0, &xd->subpix); + scale, 1, 16, 0, &xd->subpix); } xd->mb_to_top_edge = edge[0]; xd->mb_to_bottom_edge = edge[1]; @@ -825,17 +801,17 @@ static int get_implicit_compoundinter_weight(MACROBLOCKD *xd, pre_stride = xd->pre.y_stride; ymv.as_int = xd->mode_info_context->mbmi.mv[0].as_int; // Now generate the first predictor + scale = &xd->scale_factor[0]; for (n = 0; n < block_size; n += 16) { xd->mb_to_left_edge = edge[2] - (n << 3); xd->mb_to_right_edge = edge[3] + ((16 - n) << 3); if (clamp_mvs) clamp_mv_to_umv_border(&ymv.as_mv, xd); - set_scaled_offsets(&xd->scale_factor[0], mb_row * 16, mb_col * 16 + n); + scale->set_scaled_offsets(scale, mb_row * 16, mb_col * 16 + n); // predict a single row of pixels - vp9_build_inter_predictor( - base_pre + scaled_buffer_offset(n, 0, pre_stride, &xd->scale_factor[0]), - pre_stride, tmp_y + n, tmp_ystride, &ymv, &xd->scale_factor[0], - 16, 1, 0, &xd->subpix); + vp9_build_inter_predictor(base_pre + + scaled_buffer_offset(n, 0, pre_stride, scale), + pre_stride, tmp_y + n, tmp_ystride, &ymv, scale, 16, 1, 0, &xd->subpix); } xd->mb_to_left_edge = edge[2]; xd->mb_to_right_edge = edge[3]; @@ -844,12 +820,12 @@ static int get_implicit_compoundinter_weight(MACROBLOCKD *xd, xd->mb_to_bottom_edge = edge[1] + ((16 - n) << 3); if (clamp_mvs) clamp_mv_to_umv_border(&ymv.as_mv, xd); - set_scaled_offsets(&xd->scale_factor[0], mb_row * 16 + n, mb_col * 16); + scale->set_scaled_offsets(scale, mb_row * 16 + n, mb_col * 16); // predict a single col of pixels - vp9_build_inter_predictor( - base_pre + scaled_buffer_offset(0, n, pre_stride, &xd->scale_factor[0]), + vp9_build_inter_predictor(base_pre + + scaled_buffer_offset(0, n, pre_stride, scale), pre_stride, tmp_y + n * tmp_ystride, tmp_ystride, &ymv, - &xd->scale_factor[0], 1, 16, 0, &xd->subpix); + scale, 1, 16, 0, &xd->subpix); } xd->mb_to_top_edge = edge[0]; xd->mb_to_bottom_edge = edge[1]; @@ -877,17 +853,18 @@ static void build_inter16x16_predictors_mby_w(MACROBLOCKD *xd, uint8_t *base_pre = which_mv ? xd->second_pre.y_buffer : xd->pre.y_buffer; int pre_stride = which_mv ? xd->second_pre.y_stride : xd->pre.y_stride; int_mv ymv; + struct scale_factors *scale = &xd->scale_factor[which_mv]; + ymv.as_int = xd->mode_info_context->mbmi.mv[which_mv].as_int; if (clamp_mvs) clamp_mv_to_umv_border(&ymv.as_mv, xd); - set_scaled_offsets(&xd->scale_factor[which_mv], mb_row * 16, mb_col * 16); + scale->set_scaled_offsets(scale, mb_row * 16, mb_col * 16); - vp9_build_inter_predictor(base_pre, pre_stride, - dst_y, dst_ystride, - &ymv, &xd->scale_factor[which_mv], - 16, 16, which_mv ? weight : 0, &xd->subpix); + vp9_build_inter_predictor(base_pre, pre_stride, dst_y, dst_ystride, + &ymv, scale, 16, 16, + which_mv ? weight : 0, &xd->subpix); } } @@ -920,17 +897,17 @@ void vp9_build_inter16x16_predictors_mby(MACROBLOCKD *xd, uint8_t *base_pre = which_mv ? xd->second_pre.y_buffer : xd->pre.y_buffer; int pre_stride = which_mv ? 
xd->second_pre.y_stride : xd->pre.y_stride; int_mv ymv; + struct scale_factors *scale = &xd->scale_factor[which_mv]; + ymv.as_int = xd->mode_info_context->mbmi.mv[which_mv].as_int; if (clamp_mvs) clamp_mv_to_umv_border(&ymv.as_mv, xd); - set_scaled_offsets(&xd->scale_factor[which_mv], mb_row * 16, mb_col * 16); + scale->set_scaled_offsets(scale, mb_row * 16, mb_col * 16); - vp9_build_inter_predictor(base_pre, pre_stride, - dst_y, dst_ystride, - &ymv, &xd->scale_factor[which_mv], - 16, 16, which_mv, &xd->subpix); + vp9_build_inter_predictor(base_pre, pre_stride, dst_y, dst_ystride, + &ymv, scale, 16, 16, which_mv, &xd->subpix); } } #endif @@ -956,6 +933,8 @@ static void build_inter16x16_predictors_mbuv_w(MACROBLOCKD *xd, int_mv _o16x16mv; int_mv _16x16mv; + struct scale_factors *scale = &xd->scale_factor_uv[which_mv]; + _16x16mv.as_int = xd->mode_info_context->mbmi.mv[which_mv].as_int; if (clamp_mvs) @@ -979,18 +958,15 @@ static void build_inter16x16_predictors_mbuv_w(MACROBLOCKD *xd, uptr = (which_mv ? xd->second_pre.u_buffer : xd->pre.u_buffer); vptr = (which_mv ? xd->second_pre.v_buffer : xd->pre.v_buffer); - set_scaled_offsets(&xd->scale_factor_uv[which_mv], - mb_row * 16, mb_col * 16); + scale->set_scaled_offsets(scale, mb_row * 16, mb_col * 16); vp9_build_inter_predictor_q4( uptr, pre_stride, dst_u, dst_uvstride, &_16x16mv, &_o16x16mv, - &xd->scale_factor_uv[which_mv], 8, 8, - which_mv ? weight : 0, &xd->subpix); + scale, 8, 8, which_mv ? weight : 0, &xd->subpix); vp9_build_inter_predictor_q4( vptr, pre_stride, dst_v, dst_uvstride, &_16x16mv, &_o16x16mv, - &xd->scale_factor_uv[which_mv], 8, 8, - which_mv ? weight : 0, &xd->subpix); + scale, 8, 8, which_mv ? weight : 0, &xd->subpix); } } @@ -1030,6 +1006,8 @@ void vp9_build_inter16x16_predictors_mbuv(MACROBLOCKD *xd, int_mv _o16x16mv; int_mv _16x16mv; + struct scale_factors *scale = &xd->scale_factor_uv[which_mv]; + _16x16mv.as_int = xd->mode_info_context->mbmi.mv[which_mv].as_int; if (clamp_mvs) @@ -1053,17 +1031,16 @@ void vp9_build_inter16x16_predictors_mbuv(MACROBLOCKD *xd, uptr = (which_mv ? xd->second_pre.u_buffer : xd->pre.u_buffer); vptr = (which_mv ? 
xd->second_pre.v_buffer : xd->pre.v_buffer); - set_scaled_offsets(&xd->scale_factor_uv[which_mv], - mb_row * 16, mb_col * 16); + scale->set_scaled_offsets(scale, mb_row * 16, mb_col * 16); vp9_build_inter_predictor_q4( uptr, pre_stride, dst_u, dst_uvstride, &_16x16mv, &_o16x16mv, - &xd->scale_factor_uv[which_mv], 8, 8, + scale, 8, 8, which_mv << (2 * CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT), &xd->subpix); vp9_build_inter_predictor_q4( vptr, pre_stride, dst_v, dst_uvstride, &_16x16mv, &_o16x16mv, - &xd->scale_factor_uv[which_mv], 8, 8, + scale, 8, 8, which_mv << (2 * CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT), &xd->subpix); } } diff --git a/vp9/common/vp9_reconinter.h b/vp9/common/vp9_reconinter.h index 585fcfd..068853d 100644 --- a/vp9/common/vp9_reconinter.h +++ b/vp9/common/vp9_reconinter.h @@ -77,20 +77,27 @@ void vp9_build_inter_predictor_q4(const uint8_t *src, int src_stride, int w, int h, int do_avg, const struct subpix_fn_table *subpix); -static int scale_value_x(int val, const struct scale_factors *scale) { +static int scale_value_x_with_scaling(int val, + const struct scale_factors *scale) { return val * scale->x_num / scale->x_den; } -static int scale_value_y(int val, const struct scale_factors *scale) { +static int scale_value_y_with_scaling(int val, + const struct scale_factors *scale) { return val * scale->y_num / scale->y_den; } +static int unscaled_value(int val, const struct scale_factors *scale) { + (void) scale; + return val; +} + static int scaled_buffer_offset(int x_offset, int y_offset, int stride, const struct scale_factors *scale) { - return scale_value_y(y_offset, scale) * stride + - scale_value_x(x_offset, scale); + return scale->scale_value_y(y_offset, scale) * stride + + scale->scale_value_x(x_offset, scale); } static void setup_pred_block(YV12_BUFFER_CONFIG *dst, @@ -112,6 +119,7 @@ static void setup_pred_block(YV12_BUFFER_CONFIG *dst, recon_yoffset = 16 * mb_row * recon_y_stride + 16 * mb_col; recon_uvoffset = 8 * mb_row * recon_uv_stride + 8 * mb_col; } + *dst = *src; dst->y_buffer += recon_yoffset; dst->u_buffer += recon_uvoffset; @@ -128,4 +136,66 @@ static void set_scale_factors(MACROBLOCKD *xd, xd->scale_factor_uv[1] = xd->scale_factor[1]; } +static void set_offsets_with_scaling(struct scale_factors *scale, + int row, int col) { + const int x_q4 = 16 * col; + const int y_q4 = 16 * row; + + scale->x_offset_q4 = (x_q4 * scale->x_num / scale->x_den) & 0xf; + scale->y_offset_q4 = (y_q4 * scale->y_num / scale->y_den) & 0xf; +} + +static void set_offsets_without_scaling(struct scale_factors *scale, + int row, int col) { + scale->x_offset_q4 = 0; + scale->y_offset_q4 = 0; +} + +static int_mv32 motion_vector_q3_to_q4_with_scaling( + const int_mv *src_mv, + const struct scale_factors *scale) { + // returns mv * scale + offset + int_mv32 result; + const int32_t mv_row_q4 = src_mv->as_mv.row << 1; + const int32_t mv_col_q4 = src_mv->as_mv.col << 1; + + /* TODO(jkoleszar): make fixed point, or as a second multiply? 
*/ + result.as_mv.row = mv_row_q4 * scale->y_num / scale->y_den + + scale->y_offset_q4; + result.as_mv.col = mv_col_q4 * scale->x_num / scale->x_den + + scale->x_offset_q4; + return result; +} + +static int_mv32 motion_vector_q3_to_q4_without_scaling( + const int_mv *src_mv, + const struct scale_factors *scale) { + // returns mv * scale + offset + int_mv32 result; + + result.as_mv.row = src_mv->as_mv.row << 1; + result.as_mv.col = src_mv->as_mv.col << 1; + return result; +} + +static int32_t motion_vector_component_q4_with_scaling(int mv_q4, + int num, + int den, + int offset_q4) { + // returns the scaled and offset value of the mv component. + + /* TODO(jkoleszar): make fixed point, or as a second multiply? */ + return mv_q4 * num / den + offset_q4; +} + +static int32_t motion_vector_component_q4_without_scaling(int mv_q4, + int num, + int den, + int offset_q4) { + // returns the scaled and offset value of the mv component. + (void)num; + (void)den; + (void)offset_q4; + return mv_q4; +} #endif // VP9_COMMON_VP9_RECONINTER_H_ -- 2.7.4
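
The core of the change is a per-frame function-pointer dispatch: vp9_setup_scale_factors_for_frame() compares x_num/x_den and y_num/y_den once, then installs either the full scaling helpers (scale_value_x_with_scaling, motion_vector_component_q4_with_scaling, ...) or trivial pass-through versions (unscaled_value, motion_vector_component_q4_without_scaling, ...). The hot per-block code then calls through the stored pointers, so the multiply/divide per motion-vector component is skipped entirely when the reference frame has the same size as the current frame. The standalone sketch below illustrates that pattern only; the names (struct scale, setup_scale, scale_value) are invented for the example and are not the libvpx API.

/* Minimal sketch of the frame-level dispatch used by this patch
 * (simplified, invented names; not the actual libvpx code). */
#include <stdio.h>

struct scale {
  int num, den;
  /* chosen once per frame in setup_scale() */
  int (*scale_value)(int val, const struct scale *s);
};

static int scale_with_scaling(int val, const struct scale *s) {
  return val * s->num / s->den;   /* multiply + divide on every call */
}

static int scale_without_scaling(int val, const struct scale *s) {
  (void)s;
  return val;                     /* identity: no arithmetic at all */
}

/* Frame-level setup: decide once whether scaling is needed. */
static void setup_scale(struct scale *s, int num, int den) {
  s->num = num;
  s->den = den;
  s->scale_value = (num == den) ? scale_without_scaling
                                : scale_with_scaling;
}

int main(void) {
  struct scale s;

  setup_scale(&s, 1, 1);                   /* reference same size as frame */
  printf("%d\n", s.scale_value(640, &s));  /* 640: division skipped */

  setup_scale(&s, 1, 2);                   /* reference at half resolution */
  printf("%d\n", s.scale_value(640, &s));  /* 320: full calculation */
  return 0;
}

The num == den test is paid once per frame instead of once per motion-vector component, which is where the reported ~3% decoder speedup comes from in the common case of unscaled reference frames.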