Refactor block_yrd function for RTC coding mode

author Jingning Han <jingning@google.com>

Wed, 1 Apr 2015 00:46:41 +0000 (17:46 -0700)

committer Jingning Han <jingning@google.com>

Wed, 1 Apr 2015 19:00:43 +0000 (12:00 -0700)
author Jingning Han <jingning@google.com>
Wed, 1 Apr 2015 00:46:41 +0000 (17:46 -0700)
committer Jingning Han <jingning@google.com>
Wed, 1 Apr 2015 19:00:43 +0000 (12:00 -0700)
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl

index 887f407..b1a33f5 100644 (file)
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -1171,6 +1171,9 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
    add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
    specialize qw/vp9_block_error avx2/, "$sse2_x86inc";
  
+  add_proto qw/int64_t vp9_block_error_fp/, "const int16_t *coeff, const int16_t *dqcoeff, int block_size";
+  specialize qw/vp9_block_error_fp sse2/;
+
    add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
    specialize qw/vp9_quantize_fp neon sse2/, "$ssse3_x86_64";
  
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c

index f8a5e6a..e26b4c0 100644 (file)
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -315,6 +315,20 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize,
    *out_dist_sum += dist << 4;
  }
  
+#if CONFIG_VP9_HIGHBITDEPTH
+// High-bitdepth variant of block_yrd(): rate and distortion are filled in by
+// model_rd_for_sb_y(); no per-block transform work is done here.
+//   rate, dist     - written by model_rd_for_sb_y().
+//   skippable      - always set to 0.
+//   sse            - always set to INT_MAX.
+//   plane, tx_size - accepted so the signature matches the
+//                    non-high-bitdepth block_yrd(); ignored.
+static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
+                      int *skippable, int64_t *sse, int plane,
+                      BLOCK_SIZE bsize, TX_SIZE tx_size) {
+  // model_rd_for_sb_y() needs two unsigned int out-parameters; the values
+  // it stores there are not used by this function.
+  unsigned int model_var;
+  unsigned int model_sse;
+  MACROBLOCKD *const mbd = &x->e_mbd;
+
+  (void)tx_size;
+  (void)plane;
+
+  model_rd_for_sb_y(cpi, bsize, x, mbd, rate, dist, &model_var, &model_sse);
+  *skippable = 0;
+  *sse = INT_MAX;
+}
+#else
  static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
                        int *skippable, int64_t *sse, int plane,
                        BLOCK_SIZE bsize, TX_SIZE tx_size) {
@@ -332,23 +346,9 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
    const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 :
        xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
  
-#if CONFIG_VP9_HIGHBITDEPTH
-  unsigned int var_y, sse_y;
-  model_rd_for_sb_y(cpi, bsize, x, xd, rate, dist, &var_y, &sse_y);
-  *sse = INT_MAX;
-  *skippable = 0;
-  return;
-#else
    (void)cpi;
-#endif
-
    vp9_subtract_plane(x, bsize, plane);
-
    *skippable = 1;
-  *rate = 0;
-  *dist = 0;
-  *sse = 0;
-
    // Keep track of the row and column of the blocks we use so that we know
    // if we are in the unrestricted motion border.
    for (r = 0; r < max_blocks_high; r += block_step) {
@@ -362,7 +362,6 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
          const int diff_stride = 4 * num_4x4_blocks_wide_lookup[bsize];
          int i, j;
          const int16_t *src_diff;
-        int64_t this_sse;
          txfrm_block_to_raster_xy(bsize, tx_size, block, &i, &j);
          src_diff = &p->src_diff[4 * (j * diff_stride + i)];
  
@@ -399,16 +398,36 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
              assert(0);
              break;
          }
+        *skippable &= (*eob == 0);
+      }
+      block += step;
+    }
+  }
+
+  if (*skippable && *sse < INT64_MAX) {
+    *dist = (*sse << 6) >> shift;
+    *sse = *dist;
+    return;
+  }
  
-        *dist += vp9_block_error(coeff, dqcoeff, step << 4, &this_sse) >> shift;
+  block = 0;
+  *rate = 0;
+  *dist = 0;
+  *sse = (*sse << 6) >> shift;
+  for (r = 0; r < max_blocks_high; r += block_step) {
+    for (c = 0; c < num_4x4_w; c += block_step) {
+      if (c < max_blocks_wide) {
+        tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+        tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+        tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+        uint16_t *const eob = &p->eobs[block];
  
          if (*eob == 1)
            *rate += (int)abs(qcoeff[0]);
          else if (*eob > 1)
            *rate += (int)vp9_satd((const int16_t *)qcoeff, step << 4);
  
-        *sse += (this_sse >> shift);
-        *skippable &= (*eob == 0);
+        *dist += vp9_block_error_fp(coeff, dqcoeff, step << 4) >> shift;
        }
        block += step;
      }
@@ -417,6 +436,7 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
    *rate <<= 8;
    *rate *= 6;
  }
+#endif
  
  static void model_rd_for_sb_uv(VP9_COMP *cpi, BLOCK_SIZE bsize,
                                 MACROBLOCK *x, MACROBLOCKD *xd,
@@ -624,7 +644,7 @@ static void estimate_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
    int i, j;
    int rate;
    int64_t dist;
-  int64_t this_sse;
+  int64_t this_sse = INT64_MAX;
    int is_skippable;
  
    txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
@@ -1074,6 +1094,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
      }
  
      if (bsize <= BLOCK_16X16) {
+      this_sse = (int64_t)sse_y;
        block_yrd(cpi, x, &this_rdc.rate, &this_rdc.dist, &is_skippable,
                  &this_sse, 0, bsize, mbmi->tx_size);
        x->skip_txfm[0] = is_skippable;
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c

index e8c3f76..166535b 100644 (file)
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -292,6 +292,18 @@ int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
    return error;
  }
  
+// Returns the sum of squared differences between the first block_size
+// entries of coeff and dqcoeff. A block_size of zero or less yields 0.
+int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff,
+                             int block_size) {
+  int i;
+  int64_t error = 0;
+
+  for (i = 0; i < block_size; i++) {
+    // Widen before squaring: the difference of two int16_t values can reach
+    // +/-65535, and its square (4294836225) does not fit in a 32-bit int,
+    // so squaring in int would be signed-overflow undefined behaviour.
+    const int64_t diff = (int64_t)coeff[i] - dqcoeff[i];
+    error += diff * diff;
+  }
+
+  return error;
+}
  
  #if CONFIG_VP9_HIGHBITDEPTH
  int64_t vp9_highbd_block_error_c(const tran_low_t *coeff,
diff --git a/vp9/encoder/x86/vp9_error_sse2.asm b/vp9/encoder/x86/vp9_error_sse2.asm

index 1126fdb..3183797 100644 (file)
--- a/vp9/encoder/x86/vp9_error_sse2.asm
+++ b/vp9/encoder/x86/vp9_error_sse2.asm
@@ -72,3 +72,49 @@ cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
    movd    edx, m5
  %endif
    RET
+
+; Compute the sum of squared differences between two int16_t vectors.
+; Prototype as declared in vp9_rtcd_defs.pl:
+;   int64_t vp9_block_error_fp(const int16_t *coeff, const int16_t *dqcoeff,
+;                              int block_size)
+; The loop body runs before the count is tested and consumes mmsize
+; coefficients (2*mmsize bytes from each input) per iteration, so
+; block_size is expected to be a positive multiple of mmsize.
+; NOTE(review): block_size is declared as int, but the code below indexes
+; with sizeq, which is presumably the full-width register -- confirm the
+; upper bits are valid on 64-bit targets (vp9_block_error uses intptr_t).
+
+INIT_XMM sse2
+cglobal block_error_fp, 3, 3, 8, uqc, dqc, size
+  pxor      m4, m4                 ; sse accumulator
+  pxor      m5, m5                 ; dedicated zero register
+  ; point both inputs at their ends and negate the count, so that a single
+  ; index counting up toward zero serves as offset and loop condition
+  lea     uqcq, [uqcq+sizeq*2]
+  lea     dqcq, [dqcq+sizeq*2]
+  neg    sizeq
+.loop:
+  ; load two vectors from each input
+  mova      m2, [uqcq+sizeq*2]
+  mova      m0, [dqcq+sizeq*2]
+  mova      m3, [uqcq+sizeq*2+mmsize]
+  mova      m1, [dqcq+sizeq*2+mmsize]
+  ; per-coefficient 16-bit differences (dqcoeff - coeff)
+  psubw     m0, m2
+  psubw     m1, m3
+  ; individual errors are max. 15bit+sign, so squares are 30bit, and
+  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
+  pmaddwd   m0, m0
+  pmaddwd   m1, m1
+  ; accumulate in 64bit: interleaving with the zero register widens each
+  ; 32-bit pair sum to 64 bits before it is added to the accumulator
+  punpckldq m7, m0, m5
+  punpckhdq m0, m5
+  paddq     m4, m7
+  punpckldq m7, m1, m5
+  paddq     m4, m0
+  punpckhdq m1, m5
+  paddq     m4, m7
+  paddq     m4, m1
+  ; advance by the mmsize coefficients just processed; loop while negative
+  add    sizeq, mmsize
+  jl .loop
+
+  ; accumulate horizontally and store in return value
+  ; (m5 is reused as scratch here; it is no longer needed as zero)
+  movhlps   m5, m4
+  paddq     m4, m5
+%if ARCH_X86_64
+  movq    rax, m4
+%else
+  ; 32-bit: low dword of the result goes in eax, high dword in edx
+  pshufd   m5, m4, 0x1
+  movd    eax, m4
+  movd    edx, m5
+%endif
+  RET
author	Jingning Han <jingning@google.com>
	Wed, 1 Apr 2015 00:46:41 +0000 (17:46 -0700)
committer	Jingning Han <jingning@google.com>
	Wed, 1 Apr 2015 19:00:43 +0000 (12:00 -0700)
vp9/common/vp9_rtcd_defs.pl		patch \| blob \| history
vp9/encoder/vp9_pickmode.c		patch \| blob \| history
vp9/encoder/vp9_rdopt.c		patch \| blob \| history
vp9/encoder/x86/vp9_error_sse2.asm		patch \| blob \| history