vp9: Enable adaptive_rd_threshold for row mt for realtime speed 8.
author Jerome Jiang <jianj@google.com>
Tue, 14 Mar 2017 20:45:20 +0000 (13:45 -0700)
committer Jerome Jiang <jianj@google.com>
Wed, 22 Mar 2017 01:49:47 +0000 (18:49 -0700)
Change it to a row-based array to avoid the slowdown caused by sync.
With row-mt on, speed 8, 2 threads: ~4% speedup for VGA on ARM, gained
from adaptive_rd_threshold.

Change-Id: I887e65a53af20a6c4f48d293daaee09dab3512cf

vp9/encoder/vp9_encodeframe.c
vp9/encoder/vp9_encoder.h
vp9/encoder/vp9_multi_thread.c
vp9/encoder/vp9_pickmode.c
vp9/encoder/vp9_rd.h
vp9/encoder/vp9_speed_features.c
vp9/encoder/vp9_speed_features.h

index 103680f..613a7f7 100644 (file)
@@ -4285,13 +4285,14 @@ void vp9_init_tile_data(VP9_COMP *cpi) {
         int i, j;
         for (i = 0; i < BLOCK_SIZES; ++i) {
           for (j = 0; j < MAX_MODES; ++j) {
-            tile_data->thresh_freq_fact[i][j] = 32;
+            tile_data->thresh_freq_fact[i][j] = RD_THRESH_INIT_FACT;
             tile_data->mode_map[i][j] = j;
           }
         }
 #if CONFIG_MULTITHREAD
         tile_data->search_count_mutex = NULL;
         tile_data->enc_row_mt_mutex = NULL;
+        tile_data->row_base_thresh_freq_fact = NULL;
 #endif
       }
   }
index 72c0ba7..346a6d8 100644 (file)
@@ -284,6 +284,9 @@ typedef struct TileDataEnc {
   int ex_search_count;
   FIRSTPASS_DATA fp_data;
   VP9RowMTSync row_mt_sync;
+
+  // Used for adaptive_rd_thresh with row multithreading
+  int *row_base_thresh_freq_fact;
 #if CONFIG_MULTITHREAD
   pthread_mutex_t *search_count_mutex;
   pthread_mutex_t *enc_row_mt_mutex;
index e27b1ed..f5d8e43 100644 (file)
@@ -82,6 +82,16 @@ void vp9_row_mt_mem_alloc(VP9_COMP *cpi) {
   for (tile_col = 0; tile_col < tile_cols; tile_col++) {
     TileDataEnc *this_tile = &cpi->tile_data[tile_col];
     vp9_row_mt_sync_mem_alloc(&this_tile->row_mt_sync, cm, jobs_per_tile_col);
+    if (cpi->sf.adaptive_rd_thresh_row_mt) {
+      const int sb_rows =
+          (mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2) + 1;
+      int i;
+      this_tile->row_base_thresh_freq_fact =
+          (int *)vpx_calloc(sb_rows * BLOCK_SIZES * MAX_MODES,
+                            sizeof(*(this_tile->row_base_thresh_freq_fact)));
+      for (i = 0; i < sb_rows * BLOCK_SIZES * MAX_MODES; i++)
+        this_tile->row_base_thresh_freq_fact[i] = RD_THRESH_INIT_FACT;
+    }
   }
 
   // Assign the sync pointer of tile row zero for every tile row > 0
@@ -154,10 +164,15 @@ void vp9_row_mt_mem_dealloc(VP9_COMP *cpi) {
       TileDataEnc *this_tile =
           &cpi->tile_data[tile_row * multi_thread_ctxt->allocated_tile_cols +
                           tile_col];
+      if (cpi->sf.adaptive_rd_thresh_row_mt) {
+        if (this_tile->row_base_thresh_freq_fact != NULL) {
+          vpx_free(this_tile->row_base_thresh_freq_fact);
+          this_tile->row_base_thresh_freq_fact = NULL;
+        }
+      }
       pthread_mutex_destroy(this_tile->search_count_mutex);
       vpx_free(this_tile->search_count_mutex);
       this_tile->search_count_mutex = NULL;
-
       pthread_mutex_destroy(this_tile->enc_row_mt_mutex);
       vpx_free(this_tile->enc_row_mt_mutex);
       this_tile->enc_row_mt_mutex = NULL;
index ecc9afb..f34facf 100644 (file)
@@ -1016,6 +1016,32 @@ static int mode_offset(const PREDICTION_MODE mode) {
   }
 }
 
+static INLINE int rd_less_than_thresh_row_mt(int64_t best_rd, int thresh,
+                                             const int *const thresh_fact) {
+  int is_rd_less_than_thresh;
+  is_rd_less_than_thresh =
+      best_rd < ((int64_t)thresh * (*thresh_fact) >> 5) || thresh == INT_MAX;
+  return is_rd_less_than_thresh;
+}
+
+static INLINE void update_thresh_freq_fact_row_mt(
+    VP9_COMP *cpi, TileDataEnc *tile_data, int source_variance,
+    int thresh_freq_fact_idx, MV_REFERENCE_FRAME ref_frame,
+    THR_MODES best_mode_idx, PREDICTION_MODE mode) {
+  THR_MODES thr_mode_idx = mode_idx[ref_frame][mode_offset(mode)];
+  int freq_fact_idx = thresh_freq_fact_idx + thr_mode_idx;
+  int *freq_fact = &tile_data->row_base_thresh_freq_fact[freq_fact_idx];
+  if (thr_mode_idx == best_mode_idx)
+    *freq_fact -= (*freq_fact >> 4);
+  else if (cpi->sf.limit_newmv_early_exit && mode == NEWMV &&
+           ref_frame == LAST_FRAME && source_variance < 5) {
+    *freq_fact = VPXMIN(*freq_fact + RD_THRESH_INC, 32);
+  } else {
+    *freq_fact = VPXMIN(*freq_fact + RD_THRESH_INC,
+                        cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT);
+  }
+}
+
 static INLINE void update_thresh_freq_fact(
     VP9_COMP *cpi, TileDataEnc *tile_data, int source_variance,
     BLOCK_SIZE bsize, MV_REFERENCE_FRAME ref_frame, THR_MODES best_mode_idx,
@@ -1398,7 +1424,13 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
   int64_t inter_mode_thresh =
       RDCOST(x->rdmult, x->rddiv, intra_cost_penalty, 0);
   const int *const rd_threshes = cpi->rd.threshes[mi->segment_id][bsize];
-  const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize];
+  const int sb_row = mi_row >> MI_BLOCK_SIZE_LOG2;
+  int thresh_freq_fact_idx = (sb_row * BLOCK_SIZES + bsize) * MAX_MODES;
+  const int *const rd_thresh_freq_fact =
+      (cpi->sf.adaptive_rd_thresh_row_mt)
+          ? &(tile_data->row_base_thresh_freq_fact[thresh_freq_fact_idx])
+          : tile_data->thresh_freq_fact[bsize];
+
   INTERP_FILTER filter_ref;
   const int bsl = mi_width_log2_lookup[bsize];
   const int pred_filter_search =
@@ -1687,14 +1719,19 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
         cpi->rc.frames_since_golden > 4)
       mode_rd_thresh = mode_rd_thresh << 3;
 
-    if (rd_less_than_thresh(
-            best_rdc.rdcost, mode_rd_thresh,
+    if ((cpi->sf.adaptive_rd_thresh_row_mt &&
+         rd_less_than_thresh_row_mt(best_rdc.rdcost, mode_rd_thresh,
+                                    &rd_thresh_freq_fact[mode_index])) ||
+        (!cpi->sf.adaptive_rd_thresh_row_mt &&
+         rd_less_than_thresh(
+             best_rdc.rdcost, mode_rd_thresh,
 #if CONFIG_MULTITHREAD
-            // Synchronization of this function is only necessary when
-            // adaptive_rd_thresh is > 0.
-            cpi->sf.adaptive_rd_thresh ? tile_data->enc_row_mt_mutex : NULL,
+             // Synchronization of this function
+             // is only necessary when
+             // adaptive_rd_thresh is > 0.
+             cpi->sf.adaptive_rd_thresh ? tile_data->enc_row_mt_mutex : NULL,
 #endif
-            &rd_thresh_freq_fact[mode_index]))
+             &rd_thresh_freq_fact[mode_index])))
       continue;
 
     if (this_mode == NEWMV) {
@@ -2053,14 +2090,19 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
       if (!((1 << this_mode) & cpi->sf.intra_y_mode_bsize_mask[bsize]))
         continue;
 
-      if (rd_less_than_thresh(
-              best_rdc.rdcost, mode_rd_thresh,
+      if ((cpi->sf.adaptive_rd_thresh_row_mt &&
+           rd_less_than_thresh_row_mt(best_rdc.rdcost, mode_rd_thresh,
+                                      &rd_thresh_freq_fact[mode_index])) ||
+          (!cpi->sf.adaptive_rd_thresh_row_mt &&
+           rd_less_than_thresh(
+               best_rdc.rdcost, mode_rd_thresh,
 #if CONFIG_MULTITHREAD
-              // Synchronization of this function is only necessary when
-              // adaptive_rd_thresh is > 0.
-              cpi->sf.adaptive_rd_thresh ? tile_data->enc_row_mt_mutex : NULL,
+               // Synchronization of this function
+               // is only necessary when
+               // adaptive_rd_thresh is > 0.
+               cpi->sf.adaptive_rd_thresh ? tile_data->enc_row_mt_mutex : NULL,
 #endif
-              &rd_thresh_freq_fact[mode_index]))
+               &rd_thresh_freq_fact[mode_index])))
         continue;
 
       mi->mode = this_mode;
@@ -2168,16 +2210,27 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
       // TODO(yunqingwang): Check intra mode mask and only update freq_fact
       // for those valid modes.
       for (i = 0; i < intra_modes; i++) {
-        update_thresh_freq_fact(cpi, tile_data, x->source_variance, bsize,
-                                INTRA_FRAME, best_mode_idx, intra_mode_list[i]);
+        if (cpi->sf.adaptive_rd_thresh_row_mt)
+          update_thresh_freq_fact_row_mt(cpi, tile_data, x->source_variance,
+                                         thresh_freq_fact_idx, INTRA_FRAME,
+                                         best_mode_idx, intra_mode_list[i]);
+        else
+          update_thresh_freq_fact(cpi, tile_data, x->source_variance, bsize,
+                                  INTRA_FRAME, best_mode_idx,
+                                  intra_mode_list[i]);
       }
     } else {
       for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ++ref_frame) {
         PREDICTION_MODE this_mode;
         if (best_ref_frame != ref_frame) continue;
         for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
-          update_thresh_freq_fact(cpi, tile_data, x->source_variance, bsize,
-                                  ref_frame, best_mode_idx, this_mode);
+          if (cpi->sf.adaptive_rd_thresh_row_mt)
+            update_thresh_freq_fact_row_mt(cpi, tile_data, x->source_variance,
+                                           thresh_freq_fact_idx, ref_frame,
+                                           best_mode_idx, this_mode);
+          else
+            update_thresh_freq_fact(cpi, tile_data, x->source_variance, bsize,
+                                    ref_frame, best_mode_idx, this_mode);
         }
       }
     }
index 9daa0c3..aae47dc 100644 (file)
@@ -38,6 +38,7 @@ extern "C" {
 #define MAX_MODES 30
 #define MAX_REFS 6
 
+#define RD_THRESH_INIT_FACT 32
 #define RD_THRESH_MAX_FACT 64
 #define RD_THRESH_INC 1
 
index d796187..a3efc1e 100644 (file)
@@ -323,6 +323,7 @@ static void set_rt_speed_feature_framesize_independent(
   const int frames_since_key = is_keyframe ? 0 : cpi->rc.frames_since_key;
   sf->static_segmentation = 0;
   sf->adaptive_rd_thresh = 1;
+  sf->adaptive_rd_thresh_row_mt = 0;
   sf->use_fast_coef_costing = 1;
   sf->allow_exhaustive_searches = 0;
   sf->exhaustive_searches_thresh = INT_MAX;
@@ -551,6 +552,9 @@ static void set_rt_speed_feature_framesize_independent(
       }
     }
 
+    if (cpi->row_mt && cpi->oxcf.max_threads > 1)
+      sf->adaptive_rd_thresh_row_mt = 1;
+
     sf->mv.subpel_force_stop = (content == VP9E_CONTENT_SCREEN) ? 3 : 2;
     if (content == VP9E_CONTENT_SCREEN) sf->lpf_pick = LPF_PICK_MINIMAL_LPF;
     // Only keep INTRA_DC mode for speed 8.
@@ -578,11 +582,10 @@ static void set_rt_speed_feature_framesize_independent(
     sf->limit_newmv_early_exit = 0;
     if (cm->width > 640 && cm->height > 480) sf->use_simple_block_yrd = 1;
   }
-  // Turn off adaptive_rd_thresh if row_mt is on for all the non-rd paths. This
-  // causes too many locks in realtime mode in certain platforms (Android ARM,
-  // Mac).
-  if (speed >= 5 && cpi->row_mt && cpi->num_workers > 1) {
+  // Turn off adaptive_rd_thresh if row_mt is on for speed 5, 6, 7.
+  if (speed >= 5 && speed < 8 && cpi->row_mt && cpi->num_workers > 1) {
     sf->adaptive_rd_thresh = 0;
+    sf->adaptive_rd_thresh_row_mt = 0;
   }
 }
 
index 96898ee..cbdf8bc 100644 (file)
@@ -233,6 +233,9 @@ typedef struct SPEED_FEATURES {
   // mode to be evaluated. A high value means we will be faster.
   int adaptive_rd_thresh;
 
+  // Flag to use adaptive_rd_thresh when row-mt is enabled.
+  int adaptive_rd_thresh_row_mt;
+
   // Enables skipping the reconstruction step (idct, recon) in the
   // intermediate steps assuming the last frame didn't have too many intra
   // blocks and the q is less than a threshold.