From 1ba91a84ad1e1d2486e2aa68822ed147084cd4c8 Mon Sep 17 00:00:00 2001
From: Deb Mukherjee
Date: Wed, 7 Aug 2013 17:01:43 -0700
Subject: [PATCH] Adds a new subpel motion function

Adds a new subpel motion estimation function that uses a 2-level
tree-structured search to eliminate redundant computations. It searches
fewer points than the iterative search (which can visit the same point
multiple times) but gives roughly the same quality. This is made the
default at speeds 0 and 1; at speeds 2 and above, only a 1-level search
is used.

Also includes various cleanups for consistency and redundancy removal.

Results:
derf: +0.012% psnr
stdhd: +0.09% psnr
Speedup of about 2-3%

Change-Id: Iedde4866f5475586dea0f0ba4cb7428fba24eee9
---
 vp9/encoder/vp9_mcomp.c    | 421 ++++++++++++++++++++++++++++++---------------
 vp9/encoder/vp9_mcomp.h    |  27 +--
 vp9/encoder/vp9_onyx_if.c  |  13 +-
 vp9/encoder/vp9_onyx_int.h |   2 +
 vp9/encoder/vp9_rdopt.c    |   2 +-
 5 files changed, 305 insertions(+), 160 deletions(-)

diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 014f54a..96e66f0 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -245,6 +245,71 @@ void vp9_init3smotion_compensation(MACROBLOCK *x, int stride) {
       },                                                     \
       v = INT_MAX;)
 
+#define FIRST_LEVEL_CHECKS                                   \
+  {                                                          \
+    unsigned int left, right, up, down, diag;                \
+    CHECK_BETTER(left, tr, tc - hstep);                      \
+    CHECK_BETTER(right, tr, tc + hstep);                     \
+    CHECK_BETTER(up, tr - hstep, tc);                        \
+    CHECK_BETTER(down, tr + hstep, tc);                      \
+    whichdir = (left < right ? 0 : 1) +                      \
+               (up < down ? 0 : 2);                          \
+    switch (whichdir) {                                      \
+      case 0:                                                \
+        CHECK_BETTER(diag, tr - hstep, tc - hstep);          \
+        break;                                               \
+      case 1:                                                \
+        CHECK_BETTER(diag, tr - hstep, tc + hstep);          \
+        break;                                               \
+      case 2:                                                \
+        CHECK_BETTER(diag, tr + hstep, tc - hstep);          \
+        break;                                               \
+      case 3:                                                \
+        CHECK_BETTER(diag, tr + hstep, tc + hstep);          \
+        break;                                               \
+    }                                                        \
+  }
+
+#define SECOND_LEVEL_CHECKS                                  \
+  {                                                          \
+    int kr, kc;                                              \
+    unsigned int second;                                     \
+    if (tr != br && tc != bc) {                              \
+      kr = br - tr;                                          \
+      kc = bc - tc;                                          \
+      CHECK_BETTER(second, tr + kr, tc + 2 * kc);            \
+      CHECK_BETTER(second, tr + 2 * kr, tc + kc);            \
+    } else if (tr == br && tc != bc) {                       \
+      kc = bc - tc;                                          \
+      CHECK_BETTER(second, tr + hstep, tc + 2 * kc);         \
+      CHECK_BETTER(second, tr - hstep, tc + 2 * kc);         \
+      switch (whichdir) {                                    \
+        case 0:                                              \
+        case 1:                                              \
+          CHECK_BETTER(second, tr + hstep, tc + kc);         \
+          break;                                             \
+        case 2:                                              \
+        case 3:                                              \
+          CHECK_BETTER(second, tr - hstep, tc + kc);         \
+          break;                                             \
+      }                                                      \
+    } else if (tr != br && tc == bc) {                       \
+      kr = br - tr;                                          \
+      CHECK_BETTER(second, tr + 2 * kr, tc + hstep);         \
+      CHECK_BETTER(second, tr + 2 * kr, tc - hstep);         \
+      switch (whichdir) {                                    \
+        case 0:                                              \
+        case 2:                                              \
+          CHECK_BETTER(second, tr + kr, tc + hstep);         \
+          break;                                             \
+        case 1:                                              \
+        case 3:                                              \
+          CHECK_BETTER(second, tr + kr, tc - hstep);         \
+          break;                                             \
+      }                                                      \
+    }                                                        \
+  }
+
 int vp9_find_best_sub_pixel_iterative(MACROBLOCK *x,
                                       int_mv *bestmv, int_mv *ref_mv,
                                       int error_per_bit,
@@ -261,7 +326,6 @@ int vp9_find_best_sub_pixel_iterative(MACROBLOCK *x,
   int rr, rc, br, bc, hstep;
   int tr, tc;
   unsigned int besterr = INT_MAX;
-  unsigned int left, right, up, down, diag;
   unsigned int sse;
   unsigned int whichdir;
   unsigned int halfiters = iters_per_step;
@@ -306,32 +370,10 @@ int vp9_find_best_sub_pixel_iterative(MACROBLOCK *x,
   // common with the last iteration could be 2 ( if diag selected)
   while (halfiters--) {
     // 1/2 pel
-    CHECK_BETTER(left, tr, tc - hstep);
-    CHECK_BETTER(right, tr, tc + hstep);
-    CHECK_BETTER(up, tr - hstep, tc);
-    CHECK_BETTER(down, tr + hstep, tc);
-
-    whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
-
-    switch (whichdir) {
-      case 0:
-        CHECK_BETTER(diag, tr - hstep, tc - hstep);
-        break;
-      case 1:
-        CHECK_BETTER(diag, tr - hstep, tc + hstep);
-        break;
-      case 2:
-        CHECK_BETTER(diag, tr + hstep, tc - hstep);
-        break;
-      case 3:
-        CHECK_BETTER(diag, tr + hstep, tc + hstep);
-        break;
-    }
-
+    FIRST_LEVEL_CHECKS;
     // no reason to check the same one again.
     if (tr == br && tc == bc)
       break;
-
     tr = br;
     tc = bc;
   }
@@ -343,32 +385,10 @@ int vp9_find_best_sub_pixel_iterative(MACROBLOCK *x,
   if (forced_stop != 2) {
     hstep >>= 1;
     while (quarteriters--) {
-      CHECK_BETTER(left, tr, tc - hstep);
-      CHECK_BETTER(right, tr, tc + hstep);
-      CHECK_BETTER(up, tr - hstep, tc);
-      CHECK_BETTER(down, tr + hstep, tc);
-
-      whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
-
-      switch (whichdir) {
-        case 0:
-          CHECK_BETTER(diag, tr - hstep, tc - hstep);
-          break;
-        case 1:
-          CHECK_BETTER(diag, tr - hstep, tc + hstep);
-          break;
-        case 2:
-          CHECK_BETTER(diag, tr + hstep, tc - hstep);
-          break;
-        case 3:
-          CHECK_BETTER(diag, tr + hstep, tc + hstep);
-          break;
-      }
-
+      FIRST_LEVEL_CHECKS;
       // no reason to check the same one again.
       if (tr == br && tc == bc)
        break;
-
      tr = br;
      tc = bc;
    }
@@ -378,32 +398,10 @@ int vp9_find_best_sub_pixel_iterative(MACROBLOCK *x,
       forced_stop == 0) {
     hstep >>= 1;
     while (eighthiters--) {
-      CHECK_BETTER(left, tr, tc - hstep);
-      CHECK_BETTER(right, tr, tc + hstep);
-      CHECK_BETTER(up, tr - hstep, tc);
-      CHECK_BETTER(down, tr + hstep, tc);
-
-      whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
-
-      switch (whichdir) {
-        case 0:
-          CHECK_BETTER(diag, tr - hstep, tc - hstep);
-          break;
-        case 1:
-          CHECK_BETTER(diag, tr - hstep, tc + hstep);
-          break;
-        case 2:
-          CHECK_BETTER(diag, tr + hstep, tc - hstep);
-          break;
-        case 3:
-          CHECK_BETTER(diag, tr + hstep, tc + hstep);
-          break;
-      }
-
+      FIRST_LEVEL_CHECKS;
       // no reason to check the same one again.
       if (tr == br && tc == bc)
         break;
-
       tr = br;
       tc = bc;
     }
@@ -419,6 +417,105 @@ int vp9_find_best_sub_pixel_iterative(MACROBLOCK *x,
   return besterr;
 }
 
+int vp9_find_best_sub_pixel_tree(MACROBLOCK *x,
+                                 int_mv *bestmv, int_mv *ref_mv,
+                                 int error_per_bit,
+                                 const vp9_variance_fn_ptr_t *vfp,
+                                 int forced_stop,
+                                 int iters_per_step,
+                                 int *mvjcost, int *mvcost[2],
+                                 int *distortion,
+                                 unsigned int *sse1) {
+  uint8_t *z = x->plane[0].src.buf;
+  int src_stride = x->plane[0].src.stride;
+  MACROBLOCKD *xd = &x->e_mbd;
+  int rr, rc, br, bc, hstep;
+  int tr, tc;
+  unsigned int besterr = INT_MAX;
+  unsigned int sse;
+  unsigned int whichdir;
+  int thismse;
+  int maxc, minc, maxr, minr;
+  int y_stride;
+  int offset;
+  unsigned int halfiters = iters_per_step;
+  unsigned int quarteriters = iters_per_step;
+  unsigned int eighthiters = iters_per_step;
+
+  uint8_t *y = xd->plane[0].pre[0].buf +
+               (bestmv->as_mv.row) * xd->plane[0].pre[0].stride +
+               bestmv->as_mv.col;
+
+  y_stride = xd->plane[0].pre[0].stride;
+
+  rr = ref_mv->as_mv.row;
+  rc = ref_mv->as_mv.col;
+  br = bestmv->as_mv.row << 3;
+  bc = bestmv->as_mv.col << 3;
+  hstep = 4;
+  minc = MAX(x->mv_col_min << 3,
+             (ref_mv->as_mv.col) - ((1 << MV_MAX_BITS) - 1));
+  maxc = MIN(x->mv_col_max << 3,
+             (ref_mv->as_mv.col) + ((1 << MV_MAX_BITS) - 1));
+  minr = MAX(x->mv_row_min << 3,
+             (ref_mv->as_mv.row) - ((1 << MV_MAX_BITS) - 1));
+  maxr = MIN(x->mv_row_max << 3,
+             (ref_mv->as_mv.row) + ((1 << MV_MAX_BITS) - 1));
+
+  tr = br;
+  tc = bc;
+
+  offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col;
+
+  // central mv
+  bestmv->as_mv.row <<= 3;
+  bestmv->as_mv.col <<= 3;
+
+  // calculate central point error
+  besterr = vfp->vf(y, y_stride, z, src_stride, sse1);
+  *distortion = besterr;
+  besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+
+  // 1/2 pel
+  FIRST_LEVEL_CHECKS;
+  if (halfiters > 1) {
+    SECOND_LEVEL_CHECKS;
+  }
+  tr = br;
+  tc = bc;
+
+  // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
+  if (forced_stop != 2) {
+    hstep >>= 1;
+    FIRST_LEVEL_CHECKS;
+    if (quarteriters > 1) {
+      SECOND_LEVEL_CHECKS;
+    }
+    tr = br;
+    tc = bc;
+  }
+
+  if (xd->allow_high_precision_mv && vp9_use_mv_hp(&ref_mv->as_mv) &&
+      forced_stop == 0) {
+    hstep >>= 1;
+    FIRST_LEVEL_CHECKS;
+    if (eighthiters > 1) {
+      SECOND_LEVEL_CHECKS;
+    }
+    tr = br;
+    tc = bc;
+  }
+
+  bestmv->as_mv.row = br;
+  bestmv->as_mv.col = bc;
+
+  if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL << 3)) ||
+      (abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL << 3)))
+    return INT_MAX;
+
+  return besterr;
+}
+
 #undef DIST
 /* returns subpixel variance error function */
 #define DIST(r, c) \
@@ -443,7 +540,6 @@ int vp9_find_best_sub_pixel_comp_iterative(MACROBLOCK *x,
   int rr, rc, br, bc, hstep;
   int tr, tc;
   unsigned int besterr = INT_MAX;
-  unsigned int left, right, up, down, diag;
   unsigned int sse;
   unsigned int whichdir;
   unsigned int halfiters = iters_per_step;
@@ -478,7 +574,6 @@ int vp9_find_best_sub_pixel_comp_iterative(MACROBLOCK *x,
   tr = br;
   tc = bc;
 
-
   offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col;
 
   // central mv
@@ -497,32 +592,10 @@ int vp9_find_best_sub_pixel_comp_iterative(MACROBLOCK *x,
   // common with the last iteration could be 2 ( if diag selected)
   while (halfiters--) {
     // 1/2 pel
-    CHECK_BETTER(left, tr, tc - hstep);
-    CHECK_BETTER(right, tr, tc + hstep);
-    CHECK_BETTER(up, tr - hstep, tc);
-    CHECK_BETTER(down, tr + hstep, tc);
-
-    whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
-
-    switch (whichdir) {
-      case 0:
-        CHECK_BETTER(diag, tr - hstep, tc - hstep);
-        break;
-      case 1:
-        CHECK_BETTER(diag, tr - hstep, tc + hstep);
-        break;
-      case 2:
-        CHECK_BETTER(diag, tr + hstep, tc - hstep);
-        break;
-      case 3:
-        CHECK_BETTER(diag, tr + hstep, tc + hstep);
-        break;
-    }
-
+    FIRST_LEVEL_CHECKS;
     // no reason to check the same one again.
     if (tr == br && tc == bc)
       break;
-
     tr = br;
     tc = bc;
   }
@@ -534,32 +607,10 @@ int vp9_find_best_sub_pixel_comp_iterative(MACROBLOCK *x,
   if (forced_stop != 2) {
     hstep >>= 1;
     while (quarteriters--) {
-      CHECK_BETTER(left, tr, tc - hstep);
-      CHECK_BETTER(right, tr, tc + hstep);
-      CHECK_BETTER(up, tr - hstep, tc);
-      CHECK_BETTER(down, tr + hstep, tc);
-
-      whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
-
-      switch (whichdir) {
-        case 0:
-          CHECK_BETTER(diag, tr - hstep, tc - hstep);
-          break;
-        case 1:
-          CHECK_BETTER(diag, tr - hstep, tc + hstep);
-          break;
-        case 2:
-          CHECK_BETTER(diag, tr + hstep, tc - hstep);
-          break;
-        case 3:
-          CHECK_BETTER(diag, tr + hstep, tc + hstep);
-          break;
-      }
-
+      FIRST_LEVEL_CHECKS;
       // no reason to check the same one again.
       if (tr == br && tc == bc)
         break;
-
       tr = br;
       tc = bc;
     }
@@ -569,32 +620,10 @@ int vp9_find_best_sub_pixel_comp_iterative(MACROBLOCK *x,
       forced_stop == 0) {
     hstep >>= 1;
     while (eighthiters--) {
-      CHECK_BETTER(left, tr, tc - hstep);
-      CHECK_BETTER(right, tr, tc + hstep);
-      CHECK_BETTER(up, tr - hstep, tc);
-      CHECK_BETTER(down, tr + hstep, tc);
-
-      whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
-
-      switch (whichdir) {
-        case 0:
-          CHECK_BETTER(diag, tr - hstep, tc - hstep);
-          break;
-        case 1:
-          CHECK_BETTER(diag, tr - hstep, tc + hstep);
-          break;
-        case 2:
-          CHECK_BETTER(diag, tr + hstep, tc - hstep);
-          break;
-        case 3:
-          CHECK_BETTER(diag, tr + hstep, tc + hstep);
-          break;
-      }
-
+      FIRST_LEVEL_CHECKS;
       // no reason to check the same one again.
       if (tr == br && tc == bc)
         break;
-
       tr = br;
       tc = bc;
     }
@@ -609,6 +638,116 @@ int vp9_find_best_sub_pixel_comp_iterative(MACROBLOCK *x,
   return besterr;
 }
 
+int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x,
+                                      int_mv *bestmv, int_mv *ref_mv,
+                                      int error_per_bit,
+                                      const vp9_variance_fn_ptr_t *vfp,
+                                      int forced_stop,
+                                      int iters_per_step,
+                                      int *mvjcost, int *mvcost[2],
+                                      int *distortion,
+                                      unsigned int *sse1,
+                                      const uint8_t *second_pred,
+                                      int w, int h) {
+  uint8_t *z = x->plane[0].src.buf;
+  int src_stride = x->plane[0].src.stride;
+  MACROBLOCKD *xd = &x->e_mbd;
+  int rr, rc, br, bc, hstep;
+  int tr, tc;
+  unsigned int besterr = INT_MAX;
+  unsigned int sse;
+  unsigned int whichdir;
+  int thismse;
+  int maxc, minc, maxr, minr;
+  int y_stride;
+  int offset;
+  unsigned int halfiters = iters_per_step;
+  unsigned int quarteriters = iters_per_step;
+  unsigned int eighthiters = iters_per_step;
+
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64);
+  uint8_t *y = xd->plane[0].pre[0].buf +
+               (bestmv->as_mv.row) * xd->plane[0].pre[0].stride +
+               bestmv->as_mv.col;
+
+  y_stride = xd->plane[0].pre[0].stride;
+
+  rr = ref_mv->as_mv.row;
+  rc = ref_mv->as_mv.col;
+  br = bestmv->as_mv.row << 3;
+  bc = bestmv->as_mv.col << 3;
+  hstep = 4;
+  minc = MAX(x->mv_col_min << 3, (ref_mv->as_mv.col) -
+             ((1 << MV_MAX_BITS) - 1));
+  maxc = MIN(x->mv_col_max << 3, (ref_mv->as_mv.col) +
+             ((1 << MV_MAX_BITS) - 1));
+  minr = MAX(x->mv_row_min << 3, (ref_mv->as_mv.row) -
+             ((1 << MV_MAX_BITS) - 1));
+  maxr = MIN(x->mv_row_max << 3, (ref_mv->as_mv.row) +
+             ((1 << MV_MAX_BITS) - 1));
+
+  tr = br;
+  tc = bc;
+
+
+  offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col;
+
+  // central mv
+  bestmv->as_mv.row <<= 3;
+  bestmv->as_mv.col <<= 3;
+
+  // calculate central point error
+  // TODO(yunqingwang): central point error was already calculated in full-
+  // pixel search, and can be passed in this function.
+  comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride);
+  besterr = vfp->vf(comp_pred, w, z, src_stride, sse1);
+  *distortion = besterr;
+  besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+
+  // Each subsequent iteration checks at least one point in
+  // common with the last iteration (could be 2 if the diag is selected).
+  // 1/2 pel
+  FIRST_LEVEL_CHECKS;
+  if (halfiters > 1) {
+    SECOND_LEVEL_CHECKS;
+  }
+  tr = br;
+  tc = bc;
+
+  // Each subsequent iteration checks at least one point in common with
+  // the last iteration (could be 2 if the diag is selected). 1/4 pel
+
+  // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
+  if (forced_stop != 2) {
+    hstep >>= 1;
+    FIRST_LEVEL_CHECKS;
+    if (quarteriters > 1) {
+      SECOND_LEVEL_CHECKS;
+    }
+    tr = br;
+    tc = bc;
+  }
+
+  if (xd->allow_high_precision_mv && vp9_use_mv_hp(&ref_mv->as_mv) &&
+      forced_stop == 0) {
+    hstep >>= 1;
+    FIRST_LEVEL_CHECKS;
+    if (eighthiters > 1) {
+      SECOND_LEVEL_CHECKS;
+    }
+    tr = br;
+    tc = bc;
+  }
+  bestmv->as_mv.row = br;
+  bestmv->as_mv.col = bc;
+
+  if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL << 3)) ||
+      (abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL << 3)))
+    return INT_MAX;
+
+  return besterr;
+}
+
 #undef MVC
 #undef PRE
 #undef DIST
diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h
index b91e6fd..35ef483 100644
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h
@@ -80,6 +80,21 @@ typedef int (fractional_mv_step_fp) (
     int *distortion, unsigned int *sse);
 extern fractional_mv_step_fp vp9_find_best_sub_pixel_iterative;
+extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree;
+
+typedef int (fractional_mv_step_comp_fp) (
+    MACROBLOCK *x,
+    int_mv *bestmv, int_mv *ref_mv,
+    int error_per_bit,
+    const vp9_variance_fn_ptr_t *vfp,
+    int forced_stop,  // 0 - full, 1 - qtr only, 2 - half only
+    int iters_per_step,
+    int *mvjcost, int *mvcost[2],
+    int *distortion, unsigned int *sse1,
+    const uint8_t *second_pred,
+    int w, int h);
+extern fractional_mv_step_comp_fp vp9_find_best_sub_pixel_comp_iterative;
+extern fractional_mv_step_comp_fp vp9_find_best_sub_pixel_comp_tree;
 
 typedef int (*vp9_full_search_fn_t)(MACROBLOCK *x,
                                     int_mv *ref_mv, int sad_per_bit,
@@ -102,18 +117,6 @@ typedef int (*vp9_diamond_search_fn_t)(MACROBLOCK *x,
                                        int *mvjcost, int *mvcost[2],
                                        int_mv *center_mv);
 
-int vp9_find_best_sub_pixel_comp_iterative(
-    MACROBLOCK *x,
-    int_mv *bestmv, int_mv *ref_mv,
-    int error_per_bit,
-    const vp9_variance_fn_ptr_t *vfp,
-    int forced_stop,  // 0 - full, 1 - qtr only, 2 - half only
-    int iters_per_step,
-    int *mvjcost, int *mvcost[2],
-    int *distortion, unsigned int *sse1,
-    const uint8_t *second_pred,
-    int w, int h);
-
 int vp9_refining_search_8p_c(MACROBLOCK *x, int_mv *ref_mv, int error_per_bit,
                              int search_range,
                              vp9_variance_fn_ptr_t *fn_ptr,
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index cf5ae52..8b74031 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -713,8 +713,8 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
   sf->search_method = NSTEP;
   sf->auto_filter = 1;
   sf->recode_loop = 1;
-  sf->subpel_search_method = SUBPEL_ITERATIVE;
-  sf->subpel_iters_per_step = 3;
+  sf->subpel_search_method = SUBPEL_TREE;
+  sf->subpel_iters_per_step = 2;
   sf->optimize_coefficients = !cpi->oxcf.lossless;
   sf->reduce_first_step_size = 0;
   sf->auto_mv_step_size = 0;
@@ -830,7 +830,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
          (MIN(cpi->common.width, cpi->common.height) >= 720)? 1 : 0;
       sf->auto_mv_step_size = 1;
       sf->search_method = SQUARE;
-      sf->subpel_iters_per_step = 2;
+      sf->subpel_iters_per_step = 1;
     }
     if (speed == 3) {
       sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES;
@@ -922,9 +922,10 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
 
   if (cpi->sf.subpel_search_method == SUBPEL_ITERATIVE) {
     cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_iterative;
-  } else {
-    // TODO(debargha): Other methods to come
-    assert(0);
+    cpi->find_fractional_mv_step_comp = vp9_find_best_sub_pixel_comp_iterative;
+  } else if (cpi->sf.subpel_search_method == SUBPEL_TREE) {
+    cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree;
+    cpi->find_fractional_mv_step_comp = vp9_find_best_sub_pixel_comp_tree;
   }
 
   cpi->mb.optimize = cpi->sf.optimize_coefficients == 1 && cpi->pass != 1;
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index 1249107..7fdfc24 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -234,6 +234,7 @@ typedef enum {
 
 typedef enum {
   SUBPEL_ITERATIVE = 0,
+  SUBPEL_TREE = 1,
   // Other methods to come
 } SUBPEL_SEARCH_METHODS;
 
@@ -533,6 +534,7 @@ typedef struct VP9_COMP {
   unsigned int active_map_enabled;
 
   fractional_mv_step_fp *find_fractional_mv_step;
+  fractional_mv_step_comp_fp *find_fractional_mv_step_comp;
   vp9_full_search_fn_t full_search_sad;
   vp9_refining_search_fn_t refining_search_sad;
   vp9_diamond_search_fn_t diamond_search_sad;
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 90d35f8..4c23dfb 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -2675,7 +2675,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
       int dis; /* TODO: use dis in distortion calculation later. */
       unsigned int sse;
 
-      bestsme = vp9_find_best_sub_pixel_comp_iterative(
+      bestsme = cpi->find_fractional_mv_step_comp(
           x, &tmp_mv,
          &ref_mv[id], x->errorperbit,
-- 
2.7.4
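
Note on the search pattern (an illustrative sketch, not part of the patch):
at each precision level (1/2, then 1/4, then 1/8 pel), FIRST_LEVEL_CHECKS
evaluates the four cardinal neighbors of the current best vector, derives
the winning quadrant from the horizontal and vertical winners, and evaluates
the single matching diagonal: five points instead of eight.
SECOND_LEVEL_CHECKS then probes up to two further points past whichever
candidate survived. The standalone program below reproduces that decision
structure on a toy quadratic error surface. Everything here is invented for
illustration: err() stands in for the encoder's variance plus MV-cost
computation, the search-range clamping done by CHECK_BETTER is omitted, and
the axis-only special cases of SECOND_LEVEL_CHECKS are collapsed into the
general case.

#include <stdio.h>

/* Toy error surface with its minimum at (3, -5), in 1/8-pel units. */
static unsigned int err(int r, int c) {
  return (unsigned int)((r - 3) * (r - 3) + (c + 5) * (c + 5));
}

/* Evaluate (r, c); keep it if it beats the current best. Returns the
 * error so the caller can compare directions, as CHECK_BETTER does. */
static unsigned int check_better(int r, int c, int *br, int *bc,
                                 unsigned int *best) {
  const unsigned int e = err(r, c);
  if (e < *best) {
    *best = e;
    *br = r;
    *bc = c;
  }
  return e;
}

int main(void) {
  int br = 0, bc = 0;                /* current best, 1/8-pel units */
  unsigned int best = err(br, bc);   /* central point error */
  int hstep;

  for (hstep = 4; hstep >= 1; hstep >>= 1) {  /* 1/2, 1/4, 1/8 pel */
    const int tr = br, tc = bc;
    unsigned int left, right, up, down;
    int whichdir;

    /* First level: 4 cardinal points, then the 1 diagonal selected by
     * the winning quadrant. */
    left = check_better(tr, tc - hstep, &br, &bc, &best);
    right = check_better(tr, tc + hstep, &br, &bc, &best);
    up = check_better(tr - hstep, tc, &br, &bc, &best);
    down = check_better(tr + hstep, tc, &br, &bc, &best);
    whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
    switch (whichdir) {
      case 0: check_better(tr - hstep, tc - hstep, &br, &bc, &best); break;
      case 1: check_better(tr - hstep, tc + hstep, &br, &bc, &best); break;
      case 2: check_better(tr + hstep, tc - hstep, &br, &bc, &best); break;
      case 3: check_better(tr + hstep, tc + hstep, &br, &bc, &best); break;
    }

    /* Second level: probe past the survivor in the direction it moved,
     * a simplified stand-in for SECOND_LEVEL_CHECKS. */
    if (br != tr || bc != tc) {
      const int kr = br - tr, kc = bc - tc;
      check_better(tr + kr, tc + 2 * kc, &br, &bc, &best);
      check_better(tr + 2 * kr, tc + kc, &br, &bc, &best);
    }
    printf("hstep=%d: best=(%d,%d) err=%u\n", hstep, br, bc, best);
  }
  return 0;
}

On this surface the sketch converges to the minimum at (3, -5) within the
three level passes while never evaluating a full 8-point neighborhood at any
level, which is the saving the commit message refers to.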
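
Note on forced_stop and iters_per_step (illustrative, not part of libvpx):
all four subpel functions share the forced_stop contract: 0 refines down to
1/8 pel (when high-precision motion vectors are allowed and the reference MV
passes vp9_use_mv_hp()), 1 stops after the 1/4-pel level, and 2 stops after
the 1/2-pel level. A hypothetical helper summarizing the mapping:

/* Hypothetical summary of the forced_stop handling above; hp stands for
 * the combined allow_high_precision_mv / vp9_use_mv_hp() test. */
static int num_levels(int forced_stop, int hp) {
  if (forced_stop == 2)
    return 1;          /* 1/2 pel only */
  if (forced_stop == 1 || !hp)
    return 2;          /* 1/2 then 1/4 pel */
  return 3;            /* 1/2, 1/4, then 1/8 pel */
}

Under SUBPEL_TREE, iters_per_step then picks the depth at each level: 1 runs
FIRST_LEVEL_CHECKS only (speed 2 and above), while 2 or more also runs
SECOND_LEVEL_CHECKS (the new default at speeds 0 and 1).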