From: Jerome Jiang Date: Tue, 14 Mar 2017 20:45:20 +0000 (-0700) Subject: vp9: Enable adaptive_rd_threshold for row mt for realtime speed 8. X-Git-Tag: v1.7.0~603^2 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=20c2892693b3be46cb53b8c9262d41d7a61999aa;p=platform%2Fupstream%2Flibvpx.git vp9: Enable adaptive_rd_threshold for row mt for realtime speed 8. Change it to row-based array to avoid the slowdown caused by sync. row-mt on, speed 8, 2 threads: ~4% speedup for VGA on ARM benefited from adaptive_rd_threshold. Change-Id: I887e65a53af20a6c4f48d293daaee09dab3512cf --- diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 103680f..613a7f7 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -4285,13 +4285,14 @@ void vp9_init_tile_data(VP9_COMP *cpi) { int i, j; for (i = 0; i < BLOCK_SIZES; ++i) { for (j = 0; j < MAX_MODES; ++j) { - tile_data->thresh_freq_fact[i][j] = 32; + tile_data->thresh_freq_fact[i][j] = RD_THRESH_INIT_FACT; tile_data->mode_map[i][j] = j; } } #if CONFIG_MULTITHREAD tile_data->search_count_mutex = NULL; tile_data->enc_row_mt_mutex = NULL; + tile_data->row_base_thresh_freq_fact = NULL; #endif } } diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 72c0ba7..346a6d8 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -284,6 +284,9 @@ typedef struct TileDataEnc { int ex_search_count; FIRSTPASS_DATA fp_data; VP9RowMTSync row_mt_sync; + + // Used for adaptive_rd_thresh with row multithreading + int *row_base_thresh_freq_fact; #if CONFIG_MULTITHREAD pthread_mutex_t *search_count_mutex; pthread_mutex_t *enc_row_mt_mutex; diff --git a/vp9/encoder/vp9_multi_thread.c b/vp9/encoder/vp9_multi_thread.c index e27b1ed..f5d8e43 100644 --- a/vp9/encoder/vp9_multi_thread.c +++ b/vp9/encoder/vp9_multi_thread.c @@ -82,6 +82,16 @@ void vp9_row_mt_mem_alloc(VP9_COMP *cpi) { for (tile_col = 0; tile_col < tile_cols; tile_col++) { TileDataEnc *this_tile = 
&cpi->tile_data[tile_col]; vp9_row_mt_sync_mem_alloc(&this_tile->row_mt_sync, cm, jobs_per_tile_col); + if (cpi->sf.adaptive_rd_thresh_row_mt) { + const int sb_rows = + (mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2) + 1; + int i; + this_tile->row_base_thresh_freq_fact = + (int *)vpx_calloc(sb_rows * BLOCK_SIZES * MAX_MODES, + sizeof(*(this_tile->row_base_thresh_freq_fact))); + for (i = 0; i < sb_rows * BLOCK_SIZES * MAX_MODES; i++) + this_tile->row_base_thresh_freq_fact[i] = RD_THRESH_INIT_FACT; + } } // Assign the sync pointer of tile row zero for every tile row > 0 @@ -154,10 +164,15 @@ void vp9_row_mt_mem_dealloc(VP9_COMP *cpi) { TileDataEnc *this_tile = &cpi->tile_data[tile_row * multi_thread_ctxt->allocated_tile_cols + tile_col]; + if (cpi->sf.adaptive_rd_thresh_row_mt) { + if (this_tile->row_base_thresh_freq_fact != NULL) { + vpx_free(this_tile->row_base_thresh_freq_fact); + this_tile->row_base_thresh_freq_fact = NULL; + } + } pthread_mutex_destroy(this_tile->search_count_mutex); vpx_free(this_tile->search_count_mutex); this_tile->search_count_mutex = NULL; - pthread_mutex_destroy(this_tile->enc_row_mt_mutex); vpx_free(this_tile->enc_row_mt_mutex); this_tile->enc_row_mt_mutex = NULL; diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index ecc9afb..f34facf 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -1016,6 +1016,32 @@ static int mode_offset(const PREDICTION_MODE mode) { } } +static INLINE int rd_less_than_thresh_row_mt(int64_t best_rd, int thresh, + const int *const thresh_fact) { + int is_rd_less_than_thresh; + is_rd_less_than_thresh = + best_rd < ((int64_t)thresh * (*thresh_fact) >> 5) || thresh == INT_MAX; + return is_rd_less_than_thresh; +} + +static INLINE void update_thresh_freq_fact_row_mt( + VP9_COMP *cpi, TileDataEnc *tile_data, int source_variance, + int thresh_freq_fact_idx, MV_REFERENCE_FRAME ref_frame, + THR_MODES best_mode_idx, PREDICTION_MODE mode) { + THR_MODES thr_mode_idx = 
mode_idx[ref_frame][mode_offset(mode)]; + int freq_fact_idx = thresh_freq_fact_idx + thr_mode_idx; + int *freq_fact = &tile_data->row_base_thresh_freq_fact[freq_fact_idx]; + if (thr_mode_idx == best_mode_idx) + *freq_fact -= (*freq_fact >> 4); + else if (cpi->sf.limit_newmv_early_exit && mode == NEWMV && + ref_frame == LAST_FRAME && source_variance < 5) { + *freq_fact = VPXMIN(*freq_fact + RD_THRESH_INC, 32); + } else { + *freq_fact = VPXMIN(*freq_fact + RD_THRESH_INC, + cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT); + } +} + static INLINE void update_thresh_freq_fact( VP9_COMP *cpi, TileDataEnc *tile_data, int source_variance, BLOCK_SIZE bsize, MV_REFERENCE_FRAME ref_frame, THR_MODES best_mode_idx, @@ -1398,7 +1424,13 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, int64_t inter_mode_thresh = RDCOST(x->rdmult, x->rddiv, intra_cost_penalty, 0); const int *const rd_threshes = cpi->rd.threshes[mi->segment_id][bsize]; - const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize]; + const int sb_row = mi_row >> MI_BLOCK_SIZE_LOG2; + int thresh_freq_fact_idx = (sb_row * BLOCK_SIZES + bsize) * MAX_MODES; + const int *const rd_thresh_freq_fact = + (cpi->sf.adaptive_rd_thresh_row_mt) + ? 
&(tile_data->row_base_thresh_freq_fact[thresh_freq_fact_idx]) + : tile_data->thresh_freq_fact[bsize]; + INTERP_FILTER filter_ref; const int bsl = mi_width_log2_lookup[bsize]; const int pred_filter_search = @@ -1687,14 +1719,19 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, cpi->rc.frames_since_golden > 4) mode_rd_thresh = mode_rd_thresh << 3; - if (rd_less_than_thresh( - best_rdc.rdcost, mode_rd_thresh, + if ((cpi->sf.adaptive_rd_thresh_row_mt && + rd_less_than_thresh_row_mt(best_rdc.rdcost, mode_rd_thresh, + &rd_thresh_freq_fact[mode_index])) || + (!cpi->sf.adaptive_rd_thresh_row_mt && + rd_less_than_thresh( + best_rdc.rdcost, mode_rd_thresh, #if CONFIG_MULTITHREAD - // Synchronization of this function is only necessary when - // adaptive_rd_thresh is > 0. - cpi->sf.adaptive_rd_thresh ? tile_data->enc_row_mt_mutex : NULL, + // Synchronization of this function + // is only necessary when + // adaptive_rd_thresh is > 0. + cpi->sf.adaptive_rd_thresh ? tile_data->enc_row_mt_mutex : NULL, #endif - &rd_thresh_freq_fact[mode_index])) + &rd_thresh_freq_fact[mode_index]))) continue; if (this_mode == NEWMV) { @@ -2053,14 +2090,19 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (!((1 << this_mode) & cpi->sf.intra_y_mode_bsize_mask[bsize])) continue; - if (rd_less_than_thresh( - best_rdc.rdcost, mode_rd_thresh, + if ((cpi->sf.adaptive_rd_thresh_row_mt && + rd_less_than_thresh_row_mt(best_rdc.rdcost, mode_rd_thresh, + &rd_thresh_freq_fact[mode_index])) || + (!cpi->sf.adaptive_rd_thresh_row_mt && + rd_less_than_thresh( + best_rdc.rdcost, mode_rd_thresh, #if CONFIG_MULTITHREAD - // Synchronization of this function is only necessary when - // adaptive_rd_thresh is > 0. - cpi->sf.adaptive_rd_thresh ? tile_data->enc_row_mt_mutex : NULL, + // Synchronization of this function + // is only necessary when + // adaptive_rd_thresh is > 0. + cpi->sf.adaptive_rd_thresh ? 
tile_data->enc_row_mt_mutex : NULL, #endif - &rd_thresh_freq_fact[mode_index])) + &rd_thresh_freq_fact[mode_index]))) continue; mi->mode = this_mode; @@ -2168,16 +2210,27 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // TODO(yunqingwang): Check intra mode mask and only update freq_fact // for those valid modes. for (i = 0; i < intra_modes; i++) { - update_thresh_freq_fact(cpi, tile_data, x->source_variance, bsize, - INTRA_FRAME, best_mode_idx, intra_mode_list[i]); + if (cpi->sf.adaptive_rd_thresh_row_mt) + update_thresh_freq_fact_row_mt(cpi, tile_data, x->source_variance, + thresh_freq_fact_idx, INTRA_FRAME, + best_mode_idx, intra_mode_list[i]); + else + update_thresh_freq_fact(cpi, tile_data, x->source_variance, bsize, + INTRA_FRAME, best_mode_idx, + intra_mode_list[i]); } } else { for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ++ref_frame) { PREDICTION_MODE this_mode; if (best_ref_frame != ref_frame) continue; for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) { - update_thresh_freq_fact(cpi, tile_data, x->source_variance, bsize, - ref_frame, best_mode_idx, this_mode); + if (cpi->sf.adaptive_rd_thresh_row_mt) + update_thresh_freq_fact_row_mt(cpi, tile_data, x->source_variance, + thresh_freq_fact_idx, ref_frame, + best_mode_idx, this_mode); + else + update_thresh_freq_fact(cpi, tile_data, x->source_variance, bsize, + ref_frame, best_mode_idx, this_mode); } } } diff --git a/vp9/encoder/vp9_rd.h b/vp9/encoder/vp9_rd.h index 9daa0c3..aae47dc 100644 --- a/vp9/encoder/vp9_rd.h +++ b/vp9/encoder/vp9_rd.h @@ -38,6 +38,7 @@ extern "C" { #define MAX_MODES 30 #define MAX_REFS 6 +#define RD_THRESH_INIT_FACT 32 #define RD_THRESH_MAX_FACT 64 #define RD_THRESH_INC 1 diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index d796187..a3efc1e 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -323,6 +323,7 @@ static void 
set_rt_speed_feature_framesize_independent( const int frames_since_key = is_keyframe ? 0 : cpi->rc.frames_since_key; sf->static_segmentation = 0; sf->adaptive_rd_thresh = 1; + sf->adaptive_rd_thresh_row_mt = 0; sf->use_fast_coef_costing = 1; sf->allow_exhaustive_searches = 0; sf->exhaustive_searches_thresh = INT_MAX; @@ -551,6 +552,9 @@ static void set_rt_speed_feature_framesize_independent( } } + if (cpi->row_mt && cpi->oxcf.max_threads > 1) + sf->adaptive_rd_thresh_row_mt = 1; + sf->mv.subpel_force_stop = (content == VP9E_CONTENT_SCREEN) ? 3 : 2; if (content == VP9E_CONTENT_SCREEN) sf->lpf_pick = LPF_PICK_MINIMAL_LPF; // Only keep INTRA_DC mode for speed 8. @@ -578,11 +582,10 @@ static void set_rt_speed_feature_framesize_independent( sf->limit_newmv_early_exit = 0; if (cm->width > 640 && cm->height > 480) sf->use_simple_block_yrd = 1; } - // Turn off adaptive_rd_thresh if row_mt is on for all the non-rd paths. This - // causes too many locks in realtime mode in certain platforms (Android ARM, - // Mac). - if (speed >= 5 && cpi->row_mt && cpi->num_workers > 1) { + // Turn off adaptive_rd_thresh if row_mt is on for speed 5, 6, 7. + if (speed >= 5 && speed < 8 && cpi->row_mt && cpi->num_workers > 1) { sf->adaptive_rd_thresh = 0; + sf->adaptive_rd_thresh_row_mt = 0; } } diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h index 96898ee..cbdf8bc 100644 --- a/vp9/encoder/vp9_speed_features.h +++ b/vp9/encoder/vp9_speed_features.h @@ -233,6 +233,9 @@ typedef struct SPEED_FEATURES { // mode to be evaluated. A high value means we will be faster. int adaptive_rd_thresh; + // Flag to use adaptive_rd_thresh when row-mt is enabled. + int adaptive_rd_thresh_row_mt; + // Enables skipping the reconstruction step (idct, recon) in the // intermediate steps assuming the last frame didn't have too many intra // blocks and the q is less than a threshold.