From 238cf66eb5e376062ee13198ef537e848866b8aa Mon Sep 17 00:00:00 2001 From: Hui Su Date: Tue, 12 Jun 2018 11:56:09 -0700 Subject: [PATCH] Improve the partition search breakout speed feature Use a linear model to make partition search breakout decisions. Currently the model is tuned for large quantizers and small resolutions. So it is only used when q-index is at least 200 and frame width/height is smaller than 720. Also it's not yet supported for high bit depth. Tested speed 1 and 2 on lowres and midres. Compression performance is neutral. At low bitrates, encoding speedup is up to 50% for speed 1; up to 30% for speed 2. Some sample numbers: into_tree_480p, speed 1 QP=60 before: 35.228 dB, 3488b/f, 7.78 fps now: 35.217 dB, 3475b/f, 11.57 fps QP=50 before: 37.492 dB, 7983b/f, 6.24 fps now: 37.491 dB, 7974b/f, 7.55 fps PartyScene_832x480_50, speed 1 QP=60 before: 30.104 dB, 22426b/f, 3.28 fps now: 30.109 dB, 22410b/f, 4.43 fps QP=50 before: 33.016 dB, 46984b/f, 2.78 fps now: 33.018 dB, 46998b/f, 3.35 fps into_tree_480p, speed 2 QP=60 before: 35.175 dB, 3506b/f, 10.96 fps now: 35.185 dB, 3510b/f, 13.47 fps QP=50 before: 37.448 dB, 8016b/f, 9.04 fps now: 37.459 dB, 8048b/f, 9.81 fps PartyScene_832x480_50, speed 2 QP=60 before: 30.060 dB, 22537b/f, 4.42 fps now: 30.061 dB, 22541b/f, 5.38 fps QP=50 before: 32.923 dB, 47134b/f, 3.85 fps now: 32.920 dB, 47073b/f, 4.31 fps Change-Id: I674cba4f027c4c65f7837d5ec9179d6201e6ba86 --- vp9/encoder/vp9_encodeframe.c | 100 ++++++++++++++++++++++++++++++++++++--- vp9/encoder/vp9_speed_features.c | 3 ++ vp9/encoder/vp9_speed_features.h | 3 ++ 3 files changed, 100 insertions(+), 6 deletions(-) diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index cfa8322..b151ede 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -3314,6 +3314,79 @@ static int ml_pruning_partition(VP9_COMMON *const cm, MACROBLOCKD *const xd, return 0; } +#define FEATURES 4 +static const float
partition_breakout_weights_64[FEATURES + 1] = { + -0.016673f, -0.001025f, -0.000032f, 0.000833f, 1.94261885f - 2.1f, +}; + +static const float partition_breakout_weights_32[FEATURES + 1] = { + -0.010554f, -0.003081f, -0.000134f, 0.004491f, 1.68445992f - 3.5f, +}; + +static const float partition_breakout_weights_16[FEATURES + 1] = { + -0.013154f, -0.002404f, -0.000977f, 0.008450f, 2.57404566f - 5.5f, +}; + +static const float partition_breakout_weights_8[FEATURES + 1] = { + -0.011807f, -0.009873f, -0.000931f, 0.034768f, 1.32254851f - 2.0f, +}; + +// ML-based partition search breakout. +static int ml_predict_breakout(const VP9_COMP *const cpi, BLOCK_SIZE bsize, + const MACROBLOCK *const x, + const RD_COST *const rd_cost) { + DECLARE_ALIGNED(16, static const uint8_t, vp9_64_zeros[64]) = { 0 }; + float features[FEATURES]; + const float *linear_weights = NULL; // Linear model weights. + float linear_score = 0.0f; + + switch (bsize) { + case BLOCK_64X64: linear_weights = partition_breakout_weights_64; break; + case BLOCK_32X32: linear_weights = partition_breakout_weights_32; break; + case BLOCK_16X16: linear_weights = partition_breakout_weights_16; break; + case BLOCK_8X8: linear_weights = partition_breakout_weights_8; break; + default: assert(0 && "Unexpected block size."); return 0; + } + if (!linear_weights) return 0; + + { // Generate feature values. 
+ const VP9_COMMON *const cm = &cpi->common; + const int ac_q = vp9_ac_quant(cm->base_qindex, 0, cm->bit_depth); + const int num_pels_log2 = num_pels_log2_lookup[bsize]; + int feature_index = 0; + unsigned int var, sse; + float rate_f, dist_f; + + var = cpi->fn_ptr[bsize].vf(x->plane[0].src.buf, x->plane[0].src.stride, + vp9_64_zeros, 0, &sse); + var = var >> num_pels_log2; + + vpx_clear_system_state(); + + rate_f = (float)VPXMIN(rd_cost->rate, INT_MAX); + dist_f = (float)(VPXMIN(rd_cost->dist, INT_MAX) >> num_pels_log2); + rate_f = + ((float)x->rdmult / 128.0f / 512.0f / (float)(1 << num_pels_log2)) * + rate_f; + + features[feature_index++] = rate_f; + features[feature_index++] = dist_f; + features[feature_index++] = (float)var; + features[feature_index++] = (float)ac_q; + assert(feature_index == FEATURES); + } + + { // Calculate the output score. + int i; + linear_score = linear_weights[FEATURES]; + for (i = 0; i < FEATURES; ++i) + linear_score += linear_weights[i] * features[i]; + } + + return linear_score >= 0; +} +#undef FEATURES + // TODO(jingning,jimbankoski,rbultje): properly skip partition types that are // unlikely to be selected depending on previous rate-distortion optimization // results, for encoding speed-up. @@ -3499,12 +3572,27 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, // If all y, u, v transform blocks in this partition are skippable, // and the dist & rate are within the thresholds, the partition search // is terminated for current branch of the partition search tree. 
- if (!x->e_mbd.lossless && ctx->skippable && - ((best_rdc.dist < (dist_breakout_thr >> 2)) || - (best_rdc.dist < dist_breakout_thr && - best_rdc.rate < rate_breakout_thr))) { - do_split = 0; - do_rect = 0; + if (!x->e_mbd.lossless && ctx->skippable) { + int use_ml_based_breakout = + cpi->sf.use_ml_partition_search_breakout && + cm->base_qindex >= 200; +#if CONFIG_VP9_HIGHBITDEPTH + if (x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + use_ml_based_breakout = 0; +#endif // CONFIG_VP9_HIGHBITDEPTH + if (use_ml_based_breakout) { + if (ml_predict_breakout(cpi, bsize, x, &this_rdc)) { + do_split = 0; + do_rect = 0; + } + } else { + if ((best_rdc.dist < (dist_breakout_thr >> 2)) || + (best_rdc.dist < dist_breakout_thr && + best_rdc.rate < rate_breakout_thr)) { + do_split = 0; + do_rect = 0; + } + } } } else { // Currently, the machine-learning based partition search early diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index f7e97ce..d1f1250 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -82,6 +82,7 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi, } else { sf->disable_split_mask = DISABLE_COMPOUND_SPLIT; sf->partition_search_breakout_thr.dist = (1 << 21); + sf->use_ml_partition_search_breakout = 1; } } @@ -112,6 +113,7 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi, } if (speed >= 3) { + sf->use_ml_partition_search_breakout = 0; if (VPXMIN(cm->width, cm->height) >= 720) { sf->disable_split_mask = DISABLE_ALL_SPLIT; sf->schedule_mode_search = cm->base_qindex < 220 ? 
1 : 0; @@ -712,6 +714,7 @@ void vp9_set_speed_features_framesize_dependent(VP9_COMP *cpi) { sf->partition_search_breakout_thr.dist = (1 << 19); sf->partition_search_breakout_thr.rate = 80; sf->ml_partition_search_early_termination = 0; + sf->use_ml_partition_search_breakout = 0; if (oxcf->mode == REALTIME) { set_rt_speed_feature_framesize_dependent(cpi, sf, oxcf->speed); diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h index 251cfdb..09edd1b 100644 --- a/vp9/encoder/vp9_speed_features.h +++ b/vp9/encoder/vp9_speed_features.h @@ -467,6 +467,9 @@ typedef struct SPEED_FEATURES { // Partition search early breakout thresholds. PARTITION_SEARCH_BREAKOUT_THR partition_search_breakout_thr; + // Use ML-based partition search early breakout. + int use_ml_partition_search_breakout; + // Machine-learning based partition search early termination int ml_partition_search_early_termination; -- 2.7.4