From a0043c6d3027339eec6db415a88af975e0274bab Mon Sep 17 00:00:00 2001 From: Yunqing Wang Date: Wed, 25 Mar 2015 14:19:29 -0700 Subject: [PATCH] Enhance the transform skipping decision-making in non-rd mode For large partition blocks(block_size > 32x32), the variance calculation is modified so that every 8x8 block's variance is stored during the calculation, which is used in the following transform skipping test. Also, the variance for every tx block is calculated. The skipping test checks all tx blocks in the partition, and sets the skip flag only if all tx blocks are skippable. If the skip flag of Y plane is 1, a quick evaluation is done on UV planes. If the current partition block is skippable in YUV planes, the mode search checks fewer inter modes and doesn't check intra modes. The rtc set borg test(at speed 6) showed that: Overall psnr: -0.527%; Avg psnr: -0.510%; ssim: -0.573%. Average single-thread speedup on rtc set was 3.5%. For 720p clips, more speedups were seen. gipsrecmotion: 13% gipsrestat: 12% vidyo: 5 - 9% dark: 15% niklas: 6% Change-Id: I8d8ebec0cb305f1de016516400bf007c3042666e --- vp9/encoder/vp9_pickmode.c | 266 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 264 insertions(+), 2 deletions(-) diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index f8a5e6a..8607162 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -202,6 +202,248 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x, return rv; } +static void block_variance(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int w, int h, unsigned int *sse, int *sum, + int block_size, unsigned int *sse8x8, + int *sum8x8, unsigned int *var8x8) { + int i, j, k = 0; + + *sse = 0; + *sum = 0; + + for (i = 0; i < h; i += block_size) { + for (j = 0; j < w; j += block_size) { + vp9_get8x8var(src + src_stride * i + j, src_stride, + ref + ref_stride * i + j, ref_stride, + &sse8x8[k], &sum8x8[k]); + *sse += sse8x8[k]; + *sum += sum8x8[k]; + var8x8[k] = sse8x8[k] - (((unsigned int)sum8x8[k] * sum8x8[k]) >> 6); + k++; + } + } +} + +static void calculate_variance(int bw, int bh, TX_SIZE tx_size, + unsigned int *sse_i, int *sum_i, + unsigned int *var_o, unsigned int *sse_o, + int *sum_o) { + const BLOCK_SIZE unit_size = txsize_to_bsize[tx_size]; + const int nw = 1 << (bw - b_width_log2_lookup[unit_size]); + const int nh = 1 << (bh - b_height_log2_lookup[unit_size]); + int i, j, k = 0; + + for (i = 0; i < nh; i += 2) { + for (j = 0; j < nw; j += 2) { + sse_o[k] = sse_i[i * nw + j] + sse_i[i * nw + j + 1] + + sse_i[(i + 1) * nw + j] + sse_i[(i + 1) * nw + j + 1]; + sum_o[k] = sum_i[i * nw + j] + sum_i[i * nw + j + 1] + + sum_i[(i + 1) * nw + j] + sum_i[(i + 1) * nw + j + 1]; + var_o[k] = sse_o[k] - (((unsigned int)sum_o[k] * sum_o[k]) >> + (b_width_log2_lookup[unit_size] + + b_height_log2_lookup[unit_size] + 6)); + k++; + } + } +} + +static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize, + MACROBLOCK *x, MACROBLOCKD *xd, + int *out_rate_sum, int64_t *out_dist_sum, + unsigned int *var_y, unsigned int *sse_y, + int mi_row, int mi_col, int *early_term) { + // Note our transform coeffs are 8 times an orthogonal transform. + // Hence quantizer step is also 8 times. To get effective quantizer + // we need to divide by 8 before sending to modeling function. + unsigned int sse; + int rate; + int64_t dist; + struct macroblock_plane *const p = &x->plane[0]; + struct macroblockd_plane *const pd = &xd->plane[0]; + const uint32_t dc_quant = pd->dequant[0]; + const uint32_t ac_quant = pd->dequant[1]; + const int64_t dc_thr = dc_quant * dc_quant >> 6; + const int64_t ac_thr = ac_quant * ac_quant >> 6; + unsigned int var; + int sum; + int skip_dc = 0; + + const int bw = b_width_log2_lookup[bsize]; + const int bh = b_height_log2_lookup[bsize]; + const int num8x8 = 1 << (bw + bh - 2); + unsigned int sse8x8[64] = {0}; + int sum8x8[64] = {0}; + unsigned int var8x8[64] = {0}; + TX_SIZE tx_size; + int i, k; + + // Calculate variance for whole partition, and also save 8x8 blocks' variance + // to be used in following transform skipping test. + block_variance(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, + 4 << bw, 4 << bh, &sse, &sum, 8, sse8x8, sum8x8, var8x8); + var = sse - (((int64_t)sum * sum) >> (bw + bh + 4)); + + *var_y = var; + *sse_y = sse; + + if (cpi->common.tx_mode == TX_MODE_SELECT) { + if (sse > (var << 2)) + tx_size = MIN(max_txsize_lookup[bsize], + tx_mode_to_biggest_tx_size[cpi->common.tx_mode]); + else + tx_size = TX_8X8; + + if (cpi->sf.partition_search_type == VAR_BASED_PARTITION) { + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && + cyclic_refresh_segment_id_boosted(xd->mi[0].src_mi->mbmi.segment_id)) + tx_size = TX_8X8; + else if (tx_size > TX_16X16) + tx_size = TX_16X16; + } + } else { + tx_size = MIN(max_txsize_lookup[bsize], + tx_mode_to_biggest_tx_size[cpi->common.tx_mode]); + } + + assert(tx_size >= TX_8X8); + xd->mi[0].src_mi->mbmi.tx_size = tx_size; + + // Evaluate if the partition block is a skippable block in Y plane. + { + unsigned int sse16x16[16] = {0}; + int sum16x16[16] = {0}; + unsigned int var16x16[16] = {0}; + const int num16x16 = num8x8 >> 2; + + unsigned int sse32x32[4] = {0}; + int sum32x32[4] = {0}; + unsigned int var32x32[4] = {0}; + const int num32x32 = num8x8 >> 4; + + int ac_test = 1; + int dc_test = 1; + const int num = (tx_size == TX_8X8) ? num8x8 : + ((tx_size == TX_16X16) ? num16x16 : num32x32); + const unsigned int *sse_tx = (tx_size == TX_8X8) ? sse8x8 : + ((tx_size == TX_16X16) ? sse16x16 : sse32x32); + const unsigned int *var_tx = (tx_size == TX_8X8) ? var8x8 : + ((tx_size == TX_16X16) ? var16x16 : var32x32); + + // Calculate variance if tx_size > TX_8X8 + if (tx_size >= TX_16X16) + calculate_variance(bw, bh, TX_8X8, sse8x8, sum8x8, var16x16, sse16x16, + sum16x16); + if (tx_size == TX_32X32) + calculate_variance(bw, bh, TX_16X16, sse16x16, sum16x16, var32x32, + sse32x32, sum32x32); + + // Skipping test + x->skip_txfm[0] = 0; + for (k = 0; k < num; k++) + // Check if all ac coefficients can be quantized to zero. + if (!(var_tx[k] < ac_thr || var == 0)) { + ac_test = 0; + break; + } + + for (k = 0; k < num; k++) + // Check if dc coefficient can be quantized to zero. + if (!(sse_tx[k] - var_tx[k] < dc_thr || sse == var)) { + dc_test = 0; + break; + } + + if (ac_test) { + x->skip_txfm[0] = 2; + + if (dc_test) + x->skip_txfm[0] = 1; + } else if (dc_test) { + skip_dc = 1; + } + } + + if (x->skip_txfm[0] == 1) { + int skip_uv[2] = {0}; + unsigned int var_uv[2]; + unsigned int sse_uv[2]; + + *out_rate_sum = 0; + *out_dist_sum = sse << 4; + + // Transform skipping test in UV planes. + for (i = 1; i <= 2; i++) { + struct macroblock_plane *const p = &x->plane[i]; + struct macroblockd_plane *const pd = &xd->plane[i]; + const TX_SIZE uv_tx_size = get_uv_tx_size(&xd->mi[0].src_mi->mbmi, pd); + const BLOCK_SIZE unit_size = txsize_to_bsize[uv_tx_size]; + const int sf = (bw - b_width_log2_lookup[unit_size]) + + (bh - b_height_log2_lookup[unit_size]); + const BLOCK_SIZE bs = get_plane_block_size(bsize, pd); + const uint32_t uv_dc_thr = pd->dequant[0] * pd->dequant[0] >> (6 - sf); + const uint32_t uv_ac_thr = pd->dequant[1] * pd->dequant[1] >> (6 - sf); + int j = i - 1; + + vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, i); + var_uv[j] = cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, + pd->dst.buf, pd->dst.stride, &sse_uv[j]); + + if (var_uv[j] < uv_ac_thr || var_uv[j] == 0) { + if (sse_uv[j] - var_uv[j] < uv_dc_thr || sse_uv[j] == var_uv[j]) + skip_uv[j] = 1; + } + } + + // If the transform in YUV planes are skippable, the mode search checks + // fewer inter modes and doesn't check intra modes. + if (skip_uv[0] & skip_uv[1]) { + *early_term = 1; + } + + return; + } + + if (!skip_dc) { +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize], + dc_quant >> (xd->bd - 5), &rate, &dist); + } else { + vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize], + dc_quant >> 3, &rate, &dist); + } +#else + vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize], + dc_quant >> 3, &rate, &dist); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + + if (!skip_dc) { + *out_rate_sum = rate >> 1; + *out_dist_sum = dist << 3; + } else { + *out_rate_sum = 0; + *out_dist_sum = (sse - var) << 4; + } + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize], + ac_quant >> (xd->bd - 5), &rate, &dist); + } else { + vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize], + ac_quant >> 3, &rate, &dist); + } +#else + vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize], + ac_quant >> 3, &rate, &dist); +#endif // CONFIG_VP9_HIGHBITDEPTH + + *out_rate_sum += rate; + *out_dist_sum += dist << 4; +} + static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, int *out_rate_sum, int64_t *out_dist_sum, @@ -799,6 +1041,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, int ref_frame_skip_mask = 0; int idx; int best_pred_sad = INT_MAX; + int best_early_term = 0; int ref_frame_cost[MAX_REF_FRAMES]; vp9_prob intra_inter_p = vp9_get_intra_inter_prob(cm, xd); vp9_prob ref_single_p1 = vp9_get_pred_prob_single_ref_p1(cm, xd); @@ -906,6 +1149,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, PREDICTION_MODE this_mode = ref_mode_set[idx].pred_mode; int64_t this_sse; int is_skippable; + int this_early_term = 0; if (!(cpi->sf.inter_mode_mask[bsize] & (1 << this_mode))) continue; @@ -1066,8 +1310,18 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, } else { mbmi->interp_filter = (filter_ref == SWITCHABLE) ? EIGHTTAP : filter_ref; vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize); - model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist, - &var_y, &sse_y); + + // For large partition blocks, extra testing is done. + if (bsize > BLOCK_32X32 && xd->mi[0].src_mi->mbmi.segment_id != 1 && + cm->base_qindex) { + model_rd_for_sb_y_large(cpi, bsize, x, xd, &this_rdc.rate, + &this_rdc.dist, &var_y, &sse_y, mi_row, mi_col, + &this_early_term); + } else { + model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist, + &var_y, &sse_y); + } + this_rdc.rate += cm->interp_filter == SWITCHABLE ? vp9_get_switchable_rate(cpi, xd) : 0; @@ -1143,6 +1397,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, best_tx_size = mbmi->tx_size; best_ref_frame = ref_frame; best_mode_skip_txfm = x->skip_txfm[0]; + best_early_term = this_early_term; if (reuse_inter_pred) { free_pred_buffer(best_pred); @@ -1155,6 +1410,13 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, if (x->skip) break; + + // If early termination flag is 1 and at least 2 modes are checked, + // the mode search is terminated. + if (best_early_term && idx > 0) { + x->skip = 1; + break; + } } mbmi->mode = best_mode; -- 2.7.4