Skip trellis coeff opt based on tx block properties
author Deepa K G <deepa.kg@ittiam.com>
Thu, 2 Mar 2023 08:09:55 +0000 (13:39 +0530)
committer Deepa K G <deepa.kg@ittiam.com>
Sun, 19 Mar 2023 17:42:04 +0000 (23:12 +0530)
The trellis coefficient optimization is skipped for transform blocks
whose residual MSE is large relative to the squared quantizer step size.

                 Instruction Count        BD-Rate Loss(%)
cpu   Resolution   Reduction(%)    avg.psnr   ovr.psnr    ssim
 0       LOWRES2      9.467        0.0921     0.1057    0.0362
 0       MIDRES2      4.328       -0.0155     0.0694    0.0178
 0        HDRES2      1.858        0.0231     0.0214   -0.0034
 0       Average      5.218        0.0332     0.0655    0.0169

STATS_CHANGED

Change-Id: I321a9b1a34ebb59b7b6a065b5b2d717c8767a4a5

vp9/encoder/vp9_encodemb.c
vp9/encoder/vp9_encodemb.h
vp9/encoder/vp9_encoder.h
vp9/encoder/vp9_rdopt.c
vp9/encoder/vp9_speed_features.c
vp9/encoder/vp9_speed_features.h

index 7c61419..c079aa0 100644 (file)
@@ -763,6 +763,8 @@ void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col,
   struct encode_b_args arg = { x,
                                1,     // enable_trellis_opt
                                0.0,   // trellis_opt_thresh
+                               NULL,  // &sse_calc_done
+                               NULL,  // &sse
                                NULL,  // above entropy context
                                NULL,  // left entropy context
                                &mi->skip, mi_row, mi_col, output_enabled };
@@ -770,6 +772,8 @@ void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col,
   struct encode_b_args arg = { x,
                                1,     // enable_trellis_opt
                                0.0,   // trellis_opt_thresh
+                               NULL,  // &sse_calc_done
+                               NULL,  // &sse
                                NULL,  // above entropy context
                                NULL,  // left entropy context
                                &mi->skip };
@@ -869,7 +873,8 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
     vpx_subtract_block(tx_size_in_pixels, tx_size_in_pixels, src_diff,
                        diff_stride, src, src_stride, dst, dst_stride);
 #endif
-    enable_trellis_opt = do_trellis_opt(args);
+    enable_trellis_opt = do_trellis_opt(pd, src_diff, diff_stride, row, col,
+                                        plane_bsize, tx_size, args);
   }
 
   if (enable_trellis_opt) {
@@ -1041,7 +1046,9 @@ void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane,
   struct encode_b_args arg = {
     x,
     enable_trellis_opt,
-    0.0,  // trellis_opt_thresh
+    0.0,   // trellis_opt_thresh
+    NULL,  // &sse_calc_done
+    NULL,  // &sse
     ctx.ta[plane],
     ctx.tl[plane],
     &xd->mi[0]->skip,
@@ -1052,7 +1059,9 @@ void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane,
 #else
   struct encode_b_args arg = { x,
                                enable_trellis_opt,
-                               0.0,  // trellis_opt_thresh
+                               0.0,   // trellis_opt_thresh
+                               NULL,  // &sse_calc_done
+                               NULL,  // &sse
                                ctx.ta[plane],
                                ctx.tl[plane],
                                &xd->mi[0]->skip };
index 4091b02..1391446 100644 (file)
@@ -22,6 +22,8 @@ struct encode_b_args {
   MACROBLOCK *x;
   int enable_trellis_opt;
   double trellis_opt_thresh;
+  int *sse_calc_done;
+  int64_t *sse;
   ENTROPY_CONTEXT *ta;
   ENTROPY_CONTEXT *tl;
   int8_t *skip;
index 0e95037..442ef18 100644 (file)
@@ -14,6 +14,7 @@
 #include <stdio.h>
 
 #include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vpx/internal/vpx_codec_internal.h"
 #include "vpx/vpx_ext_ratectrl.h"
 #include "vpx/vp8cx.h"
@@ -1478,21 +1479,101 @@ static INLINE void alloc_frame_mvs(VP9_COMMON *const cm, int buffer_idx) {
   }
 }
 
+static INLINE int num_4x4_to_edge(int plane_4x4_dim, int mb_to_edge_dim,
+                                  int subsampling_dim, int blk_dim) {
+  return plane_4x4_dim + (mb_to_edge_dim >> (5 + subsampling_dim)) - blk_dim;
+}
+
+// Compute the sum of squares on all visible 4x4s in the transform block.
+static int64_t sum_squares_visible(const MACROBLOCKD *xd,
+                                   const struct macroblockd_plane *const pd,
+                                   const int16_t *diff, const int diff_stride,
+                                   int blk_row, int blk_col,
+                                   const BLOCK_SIZE plane_bsize,
+                                   const BLOCK_SIZE tx_bsize,
+                                   int *visible_width, int *visible_height) {
+  int64_t sse;
+  const int plane_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
+  const int plane_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
+  const int tx_4x4_w = num_4x4_blocks_wide_lookup[tx_bsize];
+  const int tx_4x4_h = num_4x4_blocks_high_lookup[tx_bsize];
+  const int b4x4s_to_right_edge = num_4x4_to_edge(
+      plane_4x4_w, xd->mb_to_right_edge, pd->subsampling_x, blk_col);
+  const int b4x4s_to_bottom_edge = num_4x4_to_edge(
+      plane_4x4_h, xd->mb_to_bottom_edge, pd->subsampling_y, blk_row);
+  if (tx_bsize == BLOCK_4X4 ||
+      (b4x4s_to_right_edge >= tx_4x4_w && b4x4s_to_bottom_edge >= tx_4x4_h)) {
+    assert(tx_4x4_w == tx_4x4_h);
+    sse = (int64_t)vpx_sum_squares_2d_i16(diff, diff_stride, tx_4x4_w << 2);
+    *visible_width = tx_4x4_w << 2;
+    *visible_height = tx_4x4_h << 2;
+  } else {
+    int r, c;
+    const int max_r = VPXMIN(b4x4s_to_bottom_edge, tx_4x4_h);
+    const int max_c = VPXMIN(b4x4s_to_right_edge, tx_4x4_w);
+    sse = 0;
+    // if we are in the unrestricted motion border.
+    for (r = 0; r < max_r; ++r) {
+      // Skip visiting the sub blocks that are wholly within the UMV.
+      for (c = 0; c < max_c; ++c) {
+        sse += (int64_t)vpx_sum_squares_2d_i16(
+            diff + r * diff_stride * 4 + c * 4, diff_stride, 4);
+      }
+    }
+    *visible_width = max_c << 2;
+    *visible_height = max_r << 2;
+  }
+  return sse;
+}
+
 // Check if trellis coefficient optimization of the transform block is enabled.
-static INLINE int do_trellis_opt(void *arg) {
+static INLINE int do_trellis_opt(const struct macroblockd_plane *pd,
+                                 const int16_t *src_diff, int diff_stride,
+                                 int blk_row, int blk_col,
+                                 BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+                                 void *arg) {
   const struct encode_b_args *const args = (struct encode_b_args *)arg;
   const MACROBLOCK *const x = args->x;
-  const int enable_trellis_opt = args->enable_trellis_opt;
-  const double trellis_opt_thresh = args->trellis_opt_thresh;
 
-  switch (enable_trellis_opt) {
+  switch (args->enable_trellis_opt) {
     case DISABLE_TRELLIS_OPT: return 0;
     case ENABLE_TRELLIS_OPT: return 1;
     case ENABLE_TRELLIS_OPT_TX_RD_SRC_VAR: {
-      return (trellis_opt_thresh > 0.0)
-                 ? (x->log_block_src_var <= trellis_opt_thresh)
+      vpx_clear_system_state();
+
+      return (args->trellis_opt_thresh > 0.0)
+                 ? (x->log_block_src_var <= args->trellis_opt_thresh)
                  : 1;
     }
+    case ENABLE_TRELLIS_OPT_TX_RD_RESIDUAL_MSE: {
+      const MACROBLOCKD *const xd = &x->e_mbd;
+      const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
+#if CONFIG_VP9_HIGHBITDEPTH
+      const int dequant_shift =
+          (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3;
+#else
+      const int dequant_shift = 3;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      const int qstep = pd->dequant[1] >> dequant_shift;
+      int *sse_calc_done = args->sse_calc_done;
+      int64_t *sse = args->sse;
+      int visible_width = 0, visible_height = 0;
+
+      // TODO: Enable the sf for high bit-depth case
+      if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) || !sse ||
+          !sse_calc_done)
+        return 1;
+
+      *sse = sum_squares_visible(xd, pd, src_diff, diff_stride, blk_row,
+                                 blk_col, plane_bsize, tx_bsize, &visible_width,
+                                 &visible_height);
+      *sse_calc_done = 1;
+
+      vpx_clear_system_state();
+
+      return (*(sse) <= (int64_t)visible_width * visible_height * qstep *
+                            qstep * args->trellis_opt_thresh);
+    }
     default: assert(0 && "Invalid trellis optimization method."); return 1;
   }
 }
index 3a68952..88e7b53 100644 (file)
@@ -457,11 +457,6 @@ static int cost_coeffs(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size,
   return cost;
 }
 
-static INLINE int num_4x4_to_edge(int plane_4x4_dim, int mb_to_edge_dim,
-                                  int subsampling_dim, int blk_dim) {
-  return plane_4x4_dim + (mb_to_edge_dim >> (5 + subsampling_dim)) - blk_dim;
-}
-
 // Copy all visible 4x4s in the transform block.
 static void copy_block_visible(const MACROBLOCKD *xd,
                                const struct macroblockd_plane *const pd,
@@ -562,47 +557,11 @@ static unsigned pixel_sse(const VP9_COMP *const cpi, const MACROBLOCKD *xd,
   return sse;
 }
 
-// Compute the sum of squares on all visible 4x4s in the transform block.
-static int64_t sum_squares_visible(const MACROBLOCKD *xd,
-                                   const struct macroblockd_plane *const pd,
-                                   const int16_t *diff, const int diff_stride,
-                                   int blk_row, int blk_col,
-                                   const BLOCK_SIZE plane_bsize,
-                                   const BLOCK_SIZE tx_bsize) {
-  int64_t sse;
-  const int plane_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
-  const int plane_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
-  const int tx_4x4_w = num_4x4_blocks_wide_lookup[tx_bsize];
-  const int tx_4x4_h = num_4x4_blocks_high_lookup[tx_bsize];
-  int b4x4s_to_right_edge = num_4x4_to_edge(plane_4x4_w, xd->mb_to_right_edge,
-                                            pd->subsampling_x, blk_col);
-  int b4x4s_to_bottom_edge = num_4x4_to_edge(plane_4x4_h, xd->mb_to_bottom_edge,
-                                             pd->subsampling_y, blk_row);
-  if (tx_bsize == BLOCK_4X4 ||
-      (b4x4s_to_right_edge >= tx_4x4_w && b4x4s_to_bottom_edge >= tx_4x4_h)) {
-    assert(tx_4x4_w == tx_4x4_h);
-    sse = (int64_t)vpx_sum_squares_2d_i16(diff, diff_stride, tx_4x4_w << 2);
-  } else {
-    int r, c;
-    int max_r = VPXMIN(b4x4s_to_bottom_edge, tx_4x4_h);
-    int max_c = VPXMIN(b4x4s_to_right_edge, tx_4x4_w);
-    sse = 0;
-    // if we are in the unrestricted motion border.
-    for (r = 0; r < max_r; ++r) {
-      // Skip visiting the sub blocks that are wholly within the UMV.
-      for (c = 0; c < max_c; ++c) {
-        sse += (int64_t)vpx_sum_squares_2d_i16(
-            diff + r * diff_stride * 4 + c * 4, diff_stride, 4);
-      }
-    }
-  }
-  return sse;
-}
-
 static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane,
                        BLOCK_SIZE plane_bsize, int block, int blk_row,
                        int blk_col, TX_SIZE tx_size, int64_t *out_dist,
-                       int64_t *out_sse, struct buf_2d *out_recon) {
+                       int64_t *out_sse, struct buf_2d *out_recon,
+                       int sse_calc_done) {
   MACROBLOCKD *const xd = &x->e_mbd;
   const struct macroblock_plane *const p = &x->plane[plane];
   const struct macroblockd_plane *const pd = &xd->plane[plane];
@@ -652,8 +611,12 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane,
     const tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
     unsigned int tmp;
 
-    tmp = pixel_sse(cpi, xd, pd, src, src_stride, dst, dst_stride, blk_row,
-                    blk_col, plane_bsize, tx_bsize);
+    if (sse_calc_done) {
+      tmp = (unsigned int)(*out_sse);
+    } else {
+      tmp = pixel_sse(cpi, xd, pd, src, src_stride, dst, dst_stride, blk_row,
+                      blk_col, plane_bsize, tx_bsize);
+    }
     *out_sse = (int64_t)tmp * 16;
     if (out_recon) {
       const int out_recon_idx = 4 * (blk_row * out_recon->stride + blk_col);
@@ -751,25 +714,20 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
   const uint8_t *dst = &pd->dst.buf[4 * (blk_row * dst_stride + blk_col)];
   const int enable_trellis_opt = args->cpi->sf.trellis_opt_tx_rd.method;
   const double trellis_opt_thresh = args->cpi->sf.trellis_opt_tx_rd.thresh;
+  int sse_calc_done = 0;
 #if CONFIG_MISMATCH_DEBUG
   struct encode_b_args encode_b_arg = {
-    x,
-    enable_trellis_opt,
-    trellis_opt_thresh,
-    args->t_above,
-    args->t_left,
-    &mi->skip,
+    x,    enable_trellis_opt, trellis_opt_thresh, &sse_calc_done,
+    &sse, args->t_above,      args->t_left,       &mi->skip,
     0,  // mi_row
     0,  // mi_col
     0   // output_enabled
   };
 #else
-  struct encode_b_args encode_b_arg = { x,
-                                        enable_trellis_opt,
-                                        trellis_opt_thresh,
-                                        args->t_above,
-                                        args->t_left,
-                                        &mi->skip };
+  struct encode_b_args encode_b_arg = {
+    x,    enable_trellis_opt, trellis_opt_thresh, &sse_calc_done,
+    &sse, args->t_above,      args->t_left,       &mi->skip
+  };
 #endif
 
   if (args->exit_early) return;
@@ -784,16 +742,21 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
     }
     if (x->block_tx_domain) {
       dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col,
-                 tx_size, &dist, &sse, /*recon =*/0);
+                 tx_size, &dist, &sse, /*recon =*/0, sse_calc_done);
     } else {
       const struct macroblock_plane *const p = &x->plane[plane];
       const int src_stride = p->src.stride;
-      const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
       const uint8_t *src = &p->src.buf[4 * (blk_row * src_stride + blk_col)];
-      const int16_t *diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
       unsigned int tmp;
-      sse = sum_squares_visible(xd, pd, diff, diff_stride, blk_row, blk_col,
-                                plane_bsize, tx_bsize);
+      if (!sse_calc_done) {
+        const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+        const int16_t *diff =
+            &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
+        int visible_width, visible_height;
+        sse = sum_squares_visible(xd, pd, diff, diff_stride, blk_row, blk_col,
+                                  plane_bsize, tx_bsize, &visible_width,
+                                  &visible_height);
+      }
 #if CONFIG_VP9_HIGHBITDEPTH
       if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) && (xd->bd > 8))
         sse = ROUND64_POWER_OF_TWO(sse, (xd->bd - 8) * 2);
@@ -817,13 +780,19 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
 
     if (skip_txfm_flag == SKIP_TXFM_NONE ||
         (recon && skip_txfm_flag == SKIP_TXFM_AC_ONLY)) {
-      const int enable_trellis_opt = do_trellis_opt(&encode_b_arg);
+      const struct macroblock_plane *const p = &x->plane[plane];
+      const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+      const int16_t *const diff =
+          &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
+      const int enable_trellis_opt =
+          do_trellis_opt(pd, diff, diff_stride, blk_row, blk_col, plane_bsize,
+                         tx_size, &encode_b_arg);
       // full forward transform and quantization
       vp9_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size);
       if (enable_trellis_opt)
         vp9_optimize_b(x, plane, block, tx_size, coeff_ctx);
       dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col,
-                 tx_size, &dist, &sse, recon);
+                 tx_size, &dist, &sse, recon, sse_calc_done);
     } else if (skip_txfm_flag == SKIP_TXFM_AC_ONLY) {
       // compute DC coefficient
       tran_low_t *const coeff = BLOCK_OFFSET(x->plane[plane].coeff, block);
index d07bb34..3e121b7 100644 (file)
@@ -229,6 +229,10 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi,
   sf->use_square_partition_only = !boosted;
   sf->early_term_interp_search_plane_rd = 1;
   sf->cb_pred_filter_search = 1;
+  sf->trellis_opt_tx_rd.method = sf->optimize_coefficients
+                                     ? ENABLE_TRELLIS_OPT_TX_RD_RESIDUAL_MSE
+                                     : DISABLE_TRELLIS_OPT;
+  sf->trellis_opt_tx_rd.thresh = boosted ? 4.0 : 3.0;
 
   sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
 
index fceeb94..d32bf09 100644 (file)
@@ -254,6 +254,9 @@ typedef enum {
   // Enable trellis coefficient optimization based on source variance of the
   // prediction block during transform RD
   ENABLE_TRELLIS_OPT_TX_RD_SRC_VAR,
+  // Enable trellis coefficient optimization based on residual mse of the
+  // transform block during transform RD
+  ENABLE_TRELLIS_OPT_TX_RD_RESIDUAL_MSE,
 } ENABLE_TRELLIS_OPT_METHOD;
 
 typedef struct TRELLIS_OPT_CONTROL {