From 987ed6937bf27ec5a4e4cb136aa653104adfb068 Mon Sep 17 00:00:00 2001 From: Deepa K G Date: Mon, 3 Apr 2023 23:21:56 +0530 Subject: [PATCH] Avoid redundant start MV SAD calculation Avoided repeated calculation of start MV SAD during full pixel motion search. Instruction Count cpu Resolution Reduction(%) 0 LOWRES2 0.162 0 MIDRES2 0.246 0 HDRES2 0.325 0 Average 0.245 Change-Id: I2b4786901f254ce32ee8ca8a3d56f1c9f112f1d4 --- vp9/common/vp9_rtcd_defs.pl | 2 +- vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c | 40 ++++-------------- vp9/encoder/vp9_encoder.h | 34 +++++++++++++++ vp9/encoder/vp9_firstpass.c | 18 +++++--- vp9/encoder/vp9_mcomp.c | 48 ++++++++-------------- vp9/encoder/vp9_mcomp.h | 6 +-- vp9/encoder/x86/vp9_diamond_search_sad_avx.c | 40 ++++-------------- 7 files changed, 82 insertions(+), 106 deletions(-) diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 7f77a36..5e60792 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -171,7 +171,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") { # # Motion search # -add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv"; +add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv"; specialize qw/vp9_diamond_search_sad avx neon/; # diff --git a/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c b/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c index 997775a..15334b4 100644 --- a/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c +++ b/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c @@ -30,30 +30,6 @@ static INLINE 
int_mv pack_int_mv(int16_t row, int16_t col) { return result; } -static INLINE MV_JOINT_TYPE get_mv_joint(const int_mv mv) { - // This is simplified from the C implementation to utilise that - // x->nmvjointsadcost[1] == x->nmvjointsadcost[2] and - // x->nmvjointsadcost[1] == x->nmvjointsadcost[3] - return mv.as_int == 0 ? 0 : 1; -} - -static INLINE int mv_cost(const int_mv mv, const int *joint_cost, - int *const comp_cost[2]) { - assert(mv.as_mv.row >= -MV_MAX && mv.as_mv.row < MV_MAX); - assert(mv.as_mv.col >= -MV_MAX && mv.as_mv.col < MV_MAX); - return joint_cost[get_mv_joint(mv)] + comp_cost[0][mv.as_mv.row] + - comp_cost[1][mv.as_mv.col]; -} - -static int mvsad_err_cost(const MACROBLOCK *x, const int_mv mv, const MV *ref, - int sad_per_bit) { - const int_mv diff = - pack_int_mv(mv.as_mv.row - ref->row, mv.as_mv.col - ref->col); - return ROUND_POWER_OF_TWO( - (unsigned)mv_cost(diff, x->nmvjointsadcost, x->nmvsadcost) * sad_per_bit, - VP9_PROB_COST_SHIFT); -} - /***************************************************************************** * This function utilizes 3 properties of the cost function lookup tables, * * constructed in using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in * @@ -71,8 +47,9 @@ static int mvsad_err_cost(const MACROBLOCK *x, const int_mv mv, const MV *ref, *****************************************************************************/ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, - MV *best_mv, int search_param, int sad_per_bit, - int *num00, const vp9_variance_fn_ptr_t *fn_ptr, + uint32_t start_mv_sad, MV *best_mv, + int search_param, int sad_per_bit, int *num00, + const vp9_variance_fn_ptr_t *fn_ptr, const MV *center_mv) { static const uint32_t data[4] = { 0, 1, 2, 3 }; const uint32x4_t v_idx_d = vld1q_u32((const uint32_t *)data); @@ -101,8 +78,8 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, pack_int_mv(center_mv->row >> 3, center_mv->col >> 3); const int16x8_t vfcmv = 
vreinterpretq_s16_s32(vdupq_n_s32(fcenter_mv.as_int)); - const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row); - const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col); + const int ref_row = ref_mv->row; + const int ref_col = ref_mv->col; int_mv bmv = pack_int_mv(ref_row, ref_col); int_mv new_bmv = bmv; @@ -122,7 +99,8 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, #else int32x4_t v_ba_d = vdupq_n_s32((intptr_t)best_address); #endif - unsigned int best_sad = INT_MAX; + // Starting position + unsigned int best_sad = start_mv_sad; int i, j, step; // Check the prerequisite cost function properties that are easy to check @@ -131,10 +109,6 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]); assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]); - // Check the starting position - best_sad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride); - best_sad += mvsad_err_cost(x, bmv, &fcenter_mv.as_mv, sad_per_bit); - *num00 = 0; for (i = 0, step = 0; step < tot_steps; step++) { diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 442ef18..9e5e646 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -1479,6 +1479,40 @@ static INLINE void alloc_frame_mvs(VP9_COMMON *const cm, int buffer_idx) { } } +static INLINE int mv_cost(const MV *mv, const int *joint_cost, + int *const comp_cost[2]) { + assert(mv->row >= -MV_MAX && mv->row < MV_MAX); + assert(mv->col >= -MV_MAX && mv->col < MV_MAX); + return joint_cost[vp9_get_mv_joint(mv)] + comp_cost[0][mv->row] + + comp_cost[1][mv->col]; +} + +static INLINE int mvsad_err_cost(const MACROBLOCK *x, const MV *mv, + const MV *ref, int sad_per_bit) { + MV diff; + diff.row = mv->row - ref->row; + diff.col = mv->col - ref->col; + return ROUND_POWER_OF_TWO( + (unsigned)mv_cost(&diff, x->nmvjointsadcost, x->nmvsadcost) * sad_per_bit, + VP9_PROB_COST_SHIFT); +} + +static INLINE uint32_t 
get_start_mv_sad(const MACROBLOCK *x, const MV *mvp_full, + const MV *ref_mv_full, + vpx_sad_fn_t sad_fn_ptr, int sadpb) { + const int src_buf_stride = x->plane[0].src.stride; + const uint8_t *const src_buf = x->plane[0].src.buf; + const MACROBLOCKD *const xd = &x->e_mbd; + const int pred_buf_stride = xd->plane[0].pre[0].stride; + const uint8_t *const pred_buf = + xd->plane[0].pre[0].buf + mvp_full->row * pred_buf_stride + mvp_full->col; + uint32_t start_mv_sad = + sad_fn_ptr(src_buf, src_buf_stride, pred_buf, pred_buf_stride); + start_mv_sad += mvsad_err_cost(x, mvp_full, ref_mv_full, sadpb); + + return start_mv_sad; +} + static INLINE int num_4x4_to_edge(int plane_4x4_dim, int mb_to_edge_dim, int subsampling_dim, int blk_dim) { return plane_4x4_dim + (mb_to_edge_dim >> (5 + subsampling_dim)) - blk_dim; diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 08b68c9..0efa836 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -435,6 +435,8 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, const BLOCK_SIZE bsize = xd->mi[0]->sb_type; vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[bsize]; const int new_mv_mode_penalty = NEW_MV_MODE_PENALTY; + MV center_mv_full = ref_mv_full; + unsigned int start_mv_sad; int step_param = 3; int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param; @@ -455,9 +457,15 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, } #endif // CONFIG_VP9_HIGHBITDEPTH + // Calculate SAD of the start mv + clamp_mv(&ref_mv_full, x->mv_limits.col_min, x->mv_limits.col_max, + x->mv_limits.row_min, x->mv_limits.row_max); + start_mv_sad = get_start_mv_sad(x, &ref_mv_full, &center_mv_full, + cpi->fn_ptr[bsize].sdf, x->sadperbit16); + // Center the initial step/diamond search on best mv.
- tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, &tmp_mv, - step_param, x->sadperbit16, &num00, + tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, start_mv_sad, + &tmp_mv, step_param, x->sadperbit16, &num00, &v_fn_ptr, ref_mv); if (tmp_err < INT_MAX) tmp_err = vp9_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1); @@ -478,9 +486,9 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, if (num00) { --num00; } else { - tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, &tmp_mv, - step_param + n, x->sadperbit16, &num00, - &v_fn_ptr, ref_mv); + tmp_err = cpi->diamond_search_sad( + x, &cpi->ss_cfg, &ref_mv_full, start_mv_sad, &tmp_mv, step_param + n, + x->sadperbit16, &num00, &v_fn_ptr, ref_mv); if (tmp_err < INT_MAX) tmp_err = vp9_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1); if (tmp_err < INT_MAX - new_mv_mode_penalty) diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c index 207eb43..4ff685b 100644 --- a/vp9/encoder/vp9_mcomp.c +++ b/vp9/encoder/vp9_mcomp.c @@ -77,14 +77,6 @@ int vp9_init_search_range(int size) { return sr; } -static INLINE int mv_cost(const MV *mv, const int *joint_cost, - int *const comp_cost[2]) { - assert(mv->row >= -MV_MAX && mv->row < MV_MAX); - assert(mv->col >= -MV_MAX && mv->col < MV_MAX); - return joint_cost[vp9_get_mv_joint(mv)] + comp_cost[0][mv->row] + - comp_cost[1][mv->col]; -} - int vp9_mv_bit_cost(const MV *mv, const MV *ref, const int *mvjcost, int *mvcost[2], int weight) { const MV diff = { mv->row - ref->row, mv->col - ref->col }; @@ -103,15 +95,6 @@ static int mv_err_cost(const MV *mv, const MV *ref, const int *mvjcost, } return 0; } - -static int mvsad_err_cost(const MACROBLOCK *x, const MV *mv, const MV *ref, - int sad_per_bit) { - const MV diff = { mv->row - ref->row, mv->col - ref->col }; - return ROUND_POWER_OF_TWO( - (unsigned)mv_cost(&diff, x->nmvjointsadcost, x->nmvsadcost) * sad_per_bit, - VP9_PROB_COST_SHIFT); -} - void 
vp9_init_dsmotion_compensation(search_site_config *cfg, int stride) { int len; int ss_count = 0; @@ -2070,8 +2053,8 @@ int vp9_prepare_nb_full_mvs(const MotionField *motion_field, int mi_row, #endif // CONFIG_NON_GREEDY_MV int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg, - MV *ref_mv, MV *best_mv, int search_param, - int sad_per_bit, int *num00, + MV *ref_mv, uint32_t start_mv_sad, MV *best_mv, + int search_param, int sad_per_bit, int *num00, const vp9_variance_fn_ptr_t *fn_ptr, const MV *center_mv) { int i, j, step; @@ -2083,7 +2066,7 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg, const int in_what_stride = xd->plane[0].pre[0].stride; const uint8_t *best_address; - unsigned int bestsad = INT_MAX; + unsigned int bestsad = start_mv_sad; int best_site = -1; int last_site = -1; @@ -2101,8 +2084,6 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg, const int tot_steps = cfg->total_steps - search_param; const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 }; - clamp_mv(ref_mv, x->mv_limits.col_min, x->mv_limits.col_max, - x->mv_limits.row_min, x->mv_limits.row_max); ref_row = ref_mv->row; ref_col = ref_mv->col; *num00 = 0; @@ -2113,10 +2094,6 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg, in_what = xd->plane[0].pre[0].buf + ref_row * in_what_stride + ref_col; best_address = in_what; - // Check the starting position - bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride) + - mvsad_err_cost(x, best_mv, &fcenter_mv, sad_per_bit); - i = 0; for (step = 0; step < tot_steps; step++) { @@ -2514,8 +2491,17 @@ static int full_pixel_diamond(const VP9_COMP *const cpi, const MV *ref_mv, MV *dst_mv) { MV temp_mv; int thissme, n, num00 = 0; - int bestsme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, &temp_mv, - step_param, sadpb, &n, fn_ptr, ref_mv); + int bestsme; + unsigned int start_mv_sad; + const MV ref_mv_full = 
{ ref_mv->row >> 3, ref_mv->col >> 3 }; + clamp_mv(mvp_full, x->mv_limits.col_min, x->mv_limits.col_max, + x->mv_limits.row_min, x->mv_limits.row_max); + start_mv_sad = + get_start_mv_sad(x, mvp_full, &ref_mv_full, fn_ptr->sdf, sadpb); + + bestsme = + cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, start_mv_sad, &temp_mv, + step_param, sadpb, &n, fn_ptr, ref_mv); if (bestsme < INT_MAX) bestsme = vp9_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1); *dst_mv = temp_mv; @@ -2530,9 +2516,9 @@ static int full_pixel_diamond(const VP9_COMP *const cpi, if (num00) { num00--; } else { - thissme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, &temp_mv, - step_param + n, sadpb, &num00, fn_ptr, - ref_mv); + thissme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, start_mv_sad, + &temp_mv, step_param + n, sadpb, &num00, + fn_ptr, ref_mv); if (thissme < INT_MAX) thissme = vp9_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1); diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h index bdaf2ce..62a7a04 100644 --- a/vp9/encoder/vp9_mcomp.h +++ b/vp9/encoder/vp9_mcomp.h @@ -94,9 +94,9 @@ extern fractional_mv_step_fp vp9_return_max_sub_pixel_mv; extern fractional_mv_step_fp vp9_return_min_sub_pixel_mv; typedef int (*vp9_diamond_search_fn_t)( - const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, MV *best_mv, - int search_param, int sad_per_bit, int *num00, - const vp9_variance_fn_ptr_t *fn_ptr, const MV *center_mv); + const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, + uint32_t start_mv_sad, MV *best_mv, int search_param, int sad_per_bit, + int *num00, const vp9_variance_fn_ptr_t *fn_ptr, const MV *center_mv); int vp9_refining_search_8p_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, int search_range, diff --git a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c index 0e04a2f..719ab40 100644 --- a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c +++ b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c @@ 
-32,29 +32,6 @@ static INLINE int_mv pack_int_mv(int16_t row, int16_t col) { result.as_mv.col = col; return result; } - -static INLINE MV_JOINT_TYPE get_mv_joint(const int_mv mv) { - // This is simplified from the C implementation to utilise that - // x->nmvjointsadcost[1] == x->nmvjointsadcost[2] and - // x->nmvjointsadcost[1] == x->nmvjointsadcost[3] - return mv.as_int == 0 ? 0 : 1; -} - -static INLINE int mv_cost(const int_mv mv, const int *joint_cost, - int *const comp_cost[2]) { - return joint_cost[get_mv_joint(mv)] + comp_cost[0][mv.as_mv.row] + - comp_cost[1][mv.as_mv.col]; -} - -static int mvsad_err_cost(const MACROBLOCK *x, const int_mv mv, const MV *ref, - int sad_per_bit) { - const int_mv diff = - pack_int_mv(mv.as_mv.row - ref->row, mv.as_mv.col - ref->col); - return ROUND_POWER_OF_TWO( - (unsigned)mv_cost(diff, x->nmvjointsadcost, x->nmvsadcost) * sad_per_bit, - VP9_PROB_COST_SHIFT); -} - /***************************************************************************** * This function utilizes 3 properties of the cost function lookup tables, * * constructed in using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in * @@ -72,8 +49,9 @@ static int mvsad_err_cost(const MACROBLOCK *x, const int_mv mv, const MV *ref, *****************************************************************************/ int vp9_diamond_search_sad_avx(const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, - MV *best_mv, int search_param, int sad_per_bit, - int *num00, const vp9_variance_fn_ptr_t *fn_ptr, + uint32_t start_mv_sad, MV *best_mv, + int search_param, int sad_per_bit, int *num00, + const vp9_variance_fn_ptr_t *fn_ptr, const MV *center_mv) { const int_mv maxmv = pack_int_mv(x->mv_limits.row_max, x->mv_limits.col_max); const __m128i v_max_mv_w = _mm_set1_epi32((int)maxmv.as_int); @@ -98,8 +76,8 @@ int vp9_diamond_search_sad_avx(const MACROBLOCK *x, pack_int_mv(center_mv->row >> 3, center_mv->col >> 3); const __m128i vfcmv = _mm_set1_epi32((int)fcenter_mv.as_int); - const 
int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row); - const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col); + const int ref_row = ref_mv->row; + const int ref_col = ref_mv->col; int_mv bmv = pack_int_mv(ref_row, ref_col); int_mv new_bmv = bmv; @@ -119,8 +97,8 @@ int vp9_diamond_search_sad_avx(const MACROBLOCK *x, #else __m128i v_ba_d = _mm_set1_epi32((intptr_t)best_address); #endif - - unsigned int best_sad; + // Starting position + unsigned int best_sad = start_mv_sad; int i, j, step; // Check the prerequisite cost function properties that are easy to check @@ -129,10 +107,6 @@ int vp9_diamond_search_sad_avx(const MACROBLOCK *x, assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]); assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]); - // Check the starting position - best_sad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride); - best_sad += mvsad_err_cost(x, bmv, &fcenter_mv.as_mv, sad_per_bit); - *num00 = 0; for (i = 0, step = 0; step < tot_steps; step++) { -- 2.7.4