Add joint motion search in comp_inter_inter mode (experiment)
author Yunqing Wang <yunqingwang@google.com>
Tue, 7 May 2013 16:45:28 +0000 (09:45 -0700)
committer Yunqing Wang <yunqingwang@google.com>
Fri, 10 May 2013 17:15:43 +0000 (10:15 -0700)
In the current code, the motion vectors obtained in single prediction mode are
used directly in compound prediction mode. These motion vectors may not give
an accurate prediction, since they were searched independently. Following
Pascal's suggestion, this patch does a joint motion search in compound
prediction mode to find better motion vectors in this situation.
Test results:
Overall PSNR: 0.570% (derf), 0.918% (stdhd);
SSIM: 0.572% (derf), 1.009% (stdhd);

The encoder is a little slower. This can be improved, since some C code is
still used in the motion search.
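
A minimal sketch of the alternating search this patch performs (illustrative
pseudo-C; build_pred() and refine_mv() are hypothetical helpers, not the
encoder's API):

    // Start from the single-prediction motion vectors of the two references.
    mv[0] = single_newmv[ref0];
    mv[1] = single_newmv[ref1];
    for (ite = 0; ite < 2; ++ite) {
      int id = ite % 2;                        // reference being refined
      // Hold the other reference fixed and build its prediction block.
      build_pred(second_pred, ref[!id], mv[!id]);
      // Refine mv[id] so that the averaged prediction
      //   comp = (pred(ref[id], mv[id]) + second_pred + 1) >> 1
      // best matches the source block: a small-range full-pel SAD search,
      // then sub-pel variance refinement.
      mv[id] = refine_mv(src, ref[id], second_pred, mv[id]);
    }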

Change-Id: Ib30c9240f6c56c9b070867b4ca89412a76d9f3c6

configure
vp9/common/vp9_enums.h
vp9/common/vp9_rtcd_defs.sh
vp9/encoder/vp9_mcomp.c
vp9/encoder/vp9_mcomp.h
vp9/encoder/vp9_onyx_if.c
vp9/encoder/vp9_rdopt.c
vp9/encoder/vp9_variance.h
vp9/encoder/vp9_variance_c.c

index 5cbf070..cc8c581 100755 (executable)
--- a/configure
+++ b/configure
@@ -247,6 +247,7 @@ EXPERIMENT_LIST="
     multiple_arf
     non420
     ab4x4
+    comp_inter_joint_search
 "
 CONFIG_LIST="
     external_build
index 1663195..2f67074 100644 (file)
--- a/vp9/common/vp9_enums.h
+++ b/vp9/common/vp9_enums.h
@@ -36,6 +36,7 @@ typedef enum BLOCK_SIZE_TYPE {
   BLOCK_SIZE_SB32X64,
   BLOCK_SIZE_SB64X32,
   BLOCK_SIZE_SB64X64,
+  BLOCK_SIZE_TYPES
 } BLOCK_SIZE_TYPE;
 
 typedef enum PARTITION_TYPE {
index 75e3604..abae952 100644 (file)
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -337,41 +337,74 @@ vp9_variance4x4_mmx=vp9_variance4x4_mmx
 prototype unsigned int vp9_sub_pixel_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance64x64 sse2
 
+prototype unsigned int vp9_sub_pixel_avg_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance64x64
+
 prototype unsigned int vp9_sub_pixel_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance32x64
 
+prototype unsigned int vp9_sub_pixel_avg_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance32x64
+
 prototype unsigned int vp9_sub_pixel_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance64x32
 
+prototype unsigned int vp9_sub_pixel_avg_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance64x32
+
 prototype unsigned int vp9_sub_pixel_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance32x16
 
+prototype unsigned int vp9_sub_pixel_avg_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance32x16
+
 prototype unsigned int vp9_sub_pixel_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance16x32
 
+prototype unsigned int vp9_sub_pixel_avg_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance16x32
+
 prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance32x32 sse2
 
+prototype unsigned int vp9_sub_pixel_avg_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance32x32
+
 prototype unsigned int vp9_sub_pixel_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance16x16 sse2 mmx ssse3
 
+prototype unsigned int vp9_sub_pixel_avg_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance16x16
+
 prototype unsigned int vp9_sub_pixel_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance8x16 sse2 mmx
 vp9_sub_pixel_variance8x16_sse2=vp9_sub_pixel_variance8x16_wmt
 
+prototype unsigned int vp9_sub_pixel_avg_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance8x16
+
 prototype unsigned int vp9_sub_pixel_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance16x8 sse2 mmx ssse3
 vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_ssse3;
 vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_wmt
 
+prototype unsigned int vp9_sub_pixel_avg_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance16x8
+
 prototype unsigned int vp9_sub_pixel_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance8x8 sse2 mmx
 vp9_sub_pixel_variance8x8_sse2=vp9_sub_pixel_variance8x8_wmt
 
+prototype unsigned int vp9_sub_pixel_avg_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance8x8
+
 prototype unsigned int vp9_sub_pixel_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance4x4 sse2 mmx
 vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt
 
+prototype unsigned int vp9_sub_pixel_avg_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance4x4
+
 prototype unsigned int vp9_sad64x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad"
 specialize vp9_sad64x64 sse2
 
index 74caba5..aff5637 100644 (file)
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -413,6 +413,201 @@ int vp9_find_best_sub_pixel_step_iteratively(MACROBLOCK *x,
 
   return besterr;
 }
+
+#if CONFIG_COMP_INTER_JOINT_SEARCH
+#undef DIST
+/* returns sub-pixel variance error of the averaged (compound) prediction */
+#define DIST(r, c) \
+    vfp->svaf(PRE(r, c), y_stride, SP(c), SP(r), \
+              z, src_stride, &sse, second_pred)
+
+int vp9_find_best_sub_pixel_comp(MACROBLOCK *x,
+                                 int_mv *bestmv, int_mv *ref_mv,
+                                 int error_per_bit,
+                                 const vp9_variance_fn_ptr_t *vfp,
+                                 int *mvjcost, int *mvcost[2],
+                                 int *distortion,
+                                 unsigned int *sse1,
+                                 const uint8_t *second_pred, int w, int h) {
+  uint8_t *z = x->plane[0].src.buf;
+  int src_stride = x->plane[0].src.stride;
+  MACROBLOCKD *xd = &x->e_mbd;
+
+  int rr, rc, br, bc, hstep;
+  int tr, tc;
+  unsigned int besterr = INT_MAX;
+  unsigned int left, right, up, down, diag;
+  unsigned int sse;
+  unsigned int whichdir;
+  unsigned int halfiters = 4;
+  unsigned int quarteriters = 4;
+  unsigned int eighthiters = 4;
+  int thismse;
+  int maxc, minc, maxr, minr;
+  int y_stride;
+  int offset;
+  int usehp = xd->allow_high_precision_mv;
+
+  uint8_t *comp_pred = vpx_memalign(16, w * h * sizeof(uint8_t));
+  uint8_t *y = xd->plane[0].pre[0].buf +
+               (bestmv->as_mv.row) * xd->plane[0].pre[0].stride +
+               bestmv->as_mv.col;
+
+  y_stride = xd->plane[0].pre[0].stride;
+
+  rr = ref_mv->as_mv.row;
+  rc = ref_mv->as_mv.col;
+  br = bestmv->as_mv.row << 3;
+  bc = bestmv->as_mv.col << 3;
+  hstep = 4;
+  minc = MAX(x->mv_col_min << 3, (ref_mv->as_mv.col) -
+             ((1 << MV_MAX_BITS) - 1));
+  maxc = MIN(x->mv_col_max << 3, (ref_mv->as_mv.col) +
+             ((1 << MV_MAX_BITS) - 1));
+  minr = MAX(x->mv_row_min << 3, (ref_mv->as_mv.row) -
+             ((1 << MV_MAX_BITS) - 1));
+  maxr = MIN(x->mv_row_max << 3, (ref_mv->as_mv.row) +
+             ((1 << MV_MAX_BITS) - 1));
+
+  tr = br;
+  tc = bc;
+
+
+  offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col;
+
+  // central mv
+  bestmv->as_mv.row <<= 3;
+  bestmv->as_mv.col <<= 3;
+
+  // calculate central point error
+  // TODO(yunqingwang): the central point error was already calculated in the
+  // full-pixel search, and can be passed into this function.
+  comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride);
+  besterr = vfp->vf(comp_pred, w, z, src_stride, sse1);
+  *distortion = besterr;
+  besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost,
+                         error_per_bit, xd->allow_high_precision_mv);
+
+  // Each subsequent iteration checks at least one point in common with the
+  // last iteration (two, if the diagonal was selected).
+  while (--halfiters) {
+    // 1/2 pel
+    CHECK_BETTER(left, tr, tc - hstep);
+    CHECK_BETTER(right, tr, tc + hstep);
+    CHECK_BETTER(up, tr - hstep, tc);
+    CHECK_BETTER(down, tr + hstep, tc);
+
+    whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+    switch (whichdir) {
+      case 0:
+        CHECK_BETTER(diag, tr - hstep, tc - hstep);
+        break;
+      case 1:
+        CHECK_BETTER(diag, tr - hstep, tc + hstep);
+        break;
+      case 2:
+        CHECK_BETTER(diag, tr + hstep, tc - hstep);
+        break;
+      case 3:
+        CHECK_BETTER(diag, tr + hstep, tc + hstep);
+        break;
+    }
+
+    // no reason to check the same one again.
+    if (tr == br && tc == bc)
+      break;
+
+    tr = br;
+    tc = bc;
+  }
+
+  // Each subsequent iteration checks at least one point in common with the
+  // last iteration (two, if the diagonal was selected). 1/4 pel
+  hstep >>= 1;
+  while (--quarteriters) {
+    CHECK_BETTER(left, tr, tc - hstep);
+    CHECK_BETTER(right, tr, tc + hstep);
+    CHECK_BETTER(up, tr - hstep, tc);
+    CHECK_BETTER(down, tr + hstep, tc);
+
+    whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+    switch (whichdir) {
+      case 0:
+        CHECK_BETTER(diag, tr - hstep, tc - hstep);
+        break;
+      case 1:
+        CHECK_BETTER(diag, tr - hstep, tc + hstep);
+        break;
+      case 2:
+        CHECK_BETTER(diag, tr + hstep, tc - hstep);
+        break;
+      case 3:
+        CHECK_BETTER(diag, tr + hstep, tc + hstep);
+        break;
+    }
+
+    // no reason to check the same one again.
+    if (tr == br && tc == bc)
+      break;
+
+    tr = br;
+    tc = bc;
+  }
+
+  if (xd->allow_high_precision_mv) {
+    usehp = vp9_use_nmv_hp(&ref_mv->as_mv);
+  } else {
+    usehp = 0;
+  }
+
+  if (usehp) {
+    hstep >>= 1;
+    while (--eighthiters) {
+      CHECK_BETTER(left, tr, tc - hstep);
+      CHECK_BETTER(right, tr, tc + hstep);
+      CHECK_BETTER(up, tr - hstep, tc);
+      CHECK_BETTER(down, tr + hstep, tc);
+
+      whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+      switch (whichdir) {
+        case 0:
+          CHECK_BETTER(diag, tr - hstep, tc - hstep);
+          break;
+        case 1:
+          CHECK_BETTER(diag, tr - hstep, tc + hstep);
+          break;
+        case 2:
+          CHECK_BETTER(diag, tr + hstep, tc - hstep);
+          break;
+        case 3:
+          CHECK_BETTER(diag, tr + hstep, tc + hstep);
+          break;
+      }
+
+      // no reason to check the same one again.
+      if (tr == br && tc == bc)
+        break;
+
+      tr = br;
+      tc = bc;
+    }
+  }
+  bestmv->as_mv.row = br;
+  bestmv->as_mv.col = bc;
+
+  vpx_free(comp_pred);
+
+  if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL << 3)) ||
+      (abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL << 3)))
+    return INT_MAX;
+
+  return besterr;
+}
+#endif  // CONFIG_COMP_INTER_JOINT_SEARCH
+
 #undef MVC
 #undef PRE
 #undef DIST
@@ -2132,7 +2327,109 @@ int vp9_refining_search_sadx4(MACROBLOCK *x,
     return INT_MAX;
 }
 
+#if CONFIG_COMP_INTER_JOINT_SEARCH
+/* This function is called when we do joint motion search in comp_inter_inter
+ * mode.
+ */
+int vp9_refining_search_8p_c(MACROBLOCK *x,
+                             int_mv *ref_mv, int error_per_bit,
+                             int search_range, vp9_variance_fn_ptr_t *fn_ptr,
+                             int *mvjcost, int *mvcost[2], int_mv *center_mv,
+                             const uint8_t *second_pred, int w, int h) {
+  const MACROBLOCKD* const xd = &x->e_mbd;
+  MV neighbors[8] = {{-1, 0}, {0, -1}, {0, 1}, {1, 0},
+      {-1, -1}, {1, -1}, {-1, 1}, {1, 1}};
+  int i, j;
+  int this_row_offset, this_col_offset;
 
+  int what_stride = x->plane[0].src.stride;
+  int in_what_stride = xd->plane[0].pre[0].stride;
+  uint8_t *what = x->plane[0].src.buf;
+  uint8_t *best_address = xd->plane[0].pre[0].buf +
+                          (ref_mv->as_mv.row * xd->plane[0].pre[0].stride) +
+                          ref_mv->as_mv.col;
+  uint8_t *check_here;
+  unsigned int thissad;
+  int_mv this_mv;
+  unsigned int bestsad = INT_MAX;
+  int_mv fcenter_mv;
+
+  int *mvjsadcost = x->nmvjointsadcost;
+  int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
+
+  /* Compound pred buffer */
+  uint8_t *comp_pred = vpx_memalign(16, w * h * sizeof(uint8_t));
+
+  fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
+  fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+
+  /* Get compound pred by averaging two pred blocks. */
+  comp_avg_pred(comp_pred, second_pred, w, h, best_address, in_what_stride);
+
+  bestsad = fn_ptr->sdf(what, what_stride, comp_pred, w, 0x7fffffff) +
+      mvsad_err_cost(ref_mv, &fcenter_mv, mvjsadcost, mvsadcost, error_per_bit);
+
+  for (i = 0; i < search_range; i++) {
+    int best_site = -1;
+
+    for (j = 0; j < 8; j++) {
+      this_row_offset = ref_mv->as_mv.row + neighbors[j].row;
+      this_col_offset = ref_mv->as_mv.col + neighbors[j].col;
+
+      if ((this_col_offset > x->mv_col_min) &&
+          (this_col_offset < x->mv_col_max) &&
+          (this_row_offset > x->mv_row_min) &&
+          (this_row_offset < x->mv_row_max)) {
+        check_here = (neighbors[j].row) * in_what_stride + neighbors[j].col +
+            best_address;
+
+        /* Get compound block and use it to calculate SAD. */
+        comp_avg_pred(comp_pred, second_pred, w, h, check_here,
+                      in_what_stride);
+        thissad = fn_ptr->sdf(what, what_stride, comp_pred, w, bestsad);
+
+        if (thissad < bestsad) {
+          this_mv.as_mv.row = this_row_offset;
+          this_mv.as_mv.col = this_col_offset;
+          thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvjsadcost,
+                                    mvsadcost, error_per_bit);
+
+          if (thissad < bestsad) {
+            bestsad = thissad;
+            best_site = j;
+          }
+        }
+      }
+    }
+
+    if (best_site == -1) {
+      break;
+    } else {
+      ref_mv->as_mv.row += neighbors[best_site].row;
+      ref_mv->as_mv.col += neighbors[best_site].col;
+      best_address += (neighbors[best_site].row) * in_what_stride +
+          neighbors[best_site].col;
+    }
+  }
+
+  this_mv.as_mv.row = ref_mv->as_mv.row << 3;
+  this_mv.as_mv.col = ref_mv->as_mv.col << 3;
+
+  if (bestsad < INT_MAX) {
+    int besterr;
+    comp_avg_pred(comp_pred, second_pred, w, h, best_address, in_what_stride);
+    besterr = fn_ptr->vf(what, what_stride, comp_pred, w,
+        (unsigned int *)(&thissad)) +
+        mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit,
+                    xd->allow_high_precision_mv);
+    vpx_free(comp_pred);
+    return besterr;
+  } else {
+    vpx_free(comp_pred);
+    return INT_MAX;
+  }
+}
+#endif  // CONFIG_COMP_INTER_JOINT_SEARCH
 
 #ifdef ENTROPY_STATS
 void print_mode_context(VP9_COMMON *pc) {
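
vp9_find_best_sub_pixel_comp() above relies on the CHECK_BETTER/MVC/PRE/SP
macros defined earlier in vp9_mcomp.c (outside this hunk), with DIST
redefined to go through the new svaf hook. For context, CHECK_BETTER behaves
roughly like the sketch below (approximate shape, not the verbatim macro):

    #define CHECK_BETTER(v, r, c)                                        \
      if (c >= minc && c <= maxc && r >= minr && r <= maxr) {            \
        thismse = DIST(r, c);        /* variance of the averaged pred */ \
        if ((v = MVC(r, c) + thismse) < besterr) {                       \
          besterr = v;                                                   \
          br = r;                                                        \
          bc = c;                                                        \
          *distortion = thismse;                                         \
          *sse1 = sse;                                                   \
        }                                                                \
      } else {                                                           \
        v = INT_MAX;                                                     \
      }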
index e1ba7fd..cdbd29a 100644 (file)
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h
@@ -79,5 +79,21 @@ typedef int (*vp9_diamond_search_fn_t)(MACROBLOCK *x,
                                        int *mvjcost, int *mvcost[2],
                                        int_mv *center_mv);
 
+#if CONFIG_COMP_INTER_JOINT_SEARCH
+int vp9_find_best_sub_pixel_comp(MACROBLOCK *x,
+                                 int_mv *bestmv, int_mv *ref_mv,
+                                 int error_per_bit,
+                                 const vp9_variance_fn_ptr_t *vfp,
+                                 int *mvjcost, int *mvcost[2],
+                                 int *distortion, unsigned int *sse1,
+                                 const uint8_t *second_pred,
+                                 int w, int h);
 
+int vp9_refining_search_8p_c(MACROBLOCK *x,
+                             int_mv *ref_mv, int error_per_bit,
+                             int search_range, vp9_variance_fn_ptr_t *fn_ptr,
+                             int *mvjcost, int *mvcost[2],
+                             int_mv *center_mv, const uint8_t *second_pred,
+                             int w, int h);
+#endif  // CONFIG_COMP_INTER_JOINT_SEARCH
 #endif  // VP9_ENCODER_VP9_MCOMP_H_
index 782816f..05105d7 100644 (file)
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -1516,10 +1516,11 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
   for (i = 0; i < MAX_MODES; i++)
     cpi->rd_thresh_mult[i] = 128;
 
-#define BFP(BT, SDF, VF, SVF, SVFHH, SVFHV, SVFHHV, SDX3F, SDX8F, SDX4DF) \
+#define BFP(BT, SDF, VF, SVF, SVAF, SVFHH, SVFHV, SVFHHV, SDX3F, SDX8F, SDX4DF)\
     cpi->fn_ptr[BT].sdf            = SDF; \
     cpi->fn_ptr[BT].vf             = VF; \
     cpi->fn_ptr[BT].svf            = SVF; \
+    cpi->fn_ptr[BT].svaf           = SVAF; \
     cpi->fn_ptr[BT].svf_halfpix_h  = SVFHH; \
     cpi->fn_ptr[BT].svf_halfpix_v  = SVFHV; \
     cpi->fn_ptr[BT].svf_halfpix_hv = SVFHHV; \
@@ -1528,57 +1529,64 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
     cpi->fn_ptr[BT].sdx4df         = SDX4DF;
 
   BFP(BLOCK_32X16, vp9_sad32x16, vp9_variance32x16, vp9_sub_pixel_variance32x16,
-      NULL, NULL,
+      vp9_sub_pixel_avg_variance32x16, NULL, NULL,
       NULL, NULL, NULL,
       vp9_sad32x16x4d)
 
   BFP(BLOCK_16X32, vp9_sad16x32, vp9_variance16x32, vp9_sub_pixel_variance16x32,
-      NULL, NULL,
+      vp9_sub_pixel_avg_variance16x32, NULL, NULL,
       NULL, NULL, NULL,
       vp9_sad16x32x4d)
 
   BFP(BLOCK_64X32, vp9_sad64x32, vp9_variance64x32, vp9_sub_pixel_variance64x32,
-      NULL, NULL,
+      vp9_sub_pixel_avg_variance64x32, NULL, NULL,
       NULL, NULL, NULL,
       vp9_sad64x32x4d)
 
   BFP(BLOCK_32X64, vp9_sad32x64, vp9_variance32x64, vp9_sub_pixel_variance32x64,
-      NULL, NULL,
+      vp9_sub_pixel_avg_variance32x64, NULL, NULL,
       NULL, NULL, NULL,
       vp9_sad32x64x4d)
 
   BFP(BLOCK_32X32, vp9_sad32x32, vp9_variance32x32, vp9_sub_pixel_variance32x32,
-      vp9_variance_halfpixvar32x32_h, vp9_variance_halfpixvar32x32_v,
+      vp9_sub_pixel_avg_variance32x32, vp9_variance_halfpixvar32x32_h,
+      vp9_variance_halfpixvar32x32_v,
       vp9_variance_halfpixvar32x32_hv, vp9_sad32x32x3, vp9_sad32x32x8,
       vp9_sad32x32x4d)
 
   BFP(BLOCK_64X64, vp9_sad64x64, vp9_variance64x64, vp9_sub_pixel_variance64x64,
-      vp9_variance_halfpixvar64x64_h, vp9_variance_halfpixvar64x64_v,
+      vp9_sub_pixel_avg_variance64x64, vp9_variance_halfpixvar64x64_h,
+      vp9_variance_halfpixvar64x64_v,
       vp9_variance_halfpixvar64x64_hv, vp9_sad64x64x3, vp9_sad64x64x8,
       vp9_sad64x64x4d)
 
   BFP(BLOCK_16X16, vp9_sad16x16, vp9_variance16x16, vp9_sub_pixel_variance16x16,
-       vp9_variance_halfpixvar16x16_h, vp9_variance_halfpixvar16x16_v,
-       vp9_variance_halfpixvar16x16_hv, vp9_sad16x16x3, vp9_sad16x16x8,
-       vp9_sad16x16x4d)
+      vp9_sub_pixel_avg_variance16x16, vp9_variance_halfpixvar16x16_h,
+      vp9_variance_halfpixvar16x16_v,
+      vp9_variance_halfpixvar16x16_hv, vp9_sad16x16x3, vp9_sad16x16x8,
+      vp9_sad16x16x4d)
 
   BFP(BLOCK_16X8, vp9_sad16x8, vp9_variance16x8, vp9_sub_pixel_variance16x8,
-      NULL, NULL, NULL, vp9_sad16x8x3, vp9_sad16x8x8, vp9_sad16x8x4d)
+      vp9_sub_pixel_avg_variance16x8, NULL, NULL, NULL,
+      vp9_sad16x8x3, vp9_sad16x8x8, vp9_sad16x8x4d)
 
   BFP(BLOCK_8X16, vp9_sad8x16, vp9_variance8x16, vp9_sub_pixel_variance8x16,
-      NULL, NULL, NULL, vp9_sad8x16x3, vp9_sad8x16x8, vp9_sad8x16x4d)
+      vp9_sub_pixel_avg_variance8x16, NULL, NULL, NULL,
+      vp9_sad8x16x3, vp9_sad8x16x8, vp9_sad8x16x4d)
 
   BFP(BLOCK_8X8, vp9_sad8x8, vp9_variance8x8, vp9_sub_pixel_variance8x8,
-      NULL, NULL, NULL, vp9_sad8x8x3, vp9_sad8x8x8, vp9_sad8x8x4d)
+      vp9_sub_pixel_avg_variance8x8, NULL, NULL, NULL,
+      vp9_sad8x8x3, vp9_sad8x8x8, vp9_sad8x8x4d)
 
   BFP(BLOCK_4X8, NULL, vp9_variance4x8, NULL,
-      NULL, NULL, NULL, NULL, NULL, NULL)
+      NULL, NULL, NULL, NULL, NULL, NULL, NULL)
 
   BFP(BLOCK_8X4, NULL, vp9_variance8x4, NULL,
-      NULL, NULL, NULL, NULL, NULL, NULL)
+      NULL, NULL, NULL, NULL, NULL, NULL, NULL)
 
   BFP(BLOCK_4X4, vp9_sad4x4, vp9_variance4x4, vp9_sub_pixel_variance4x4,
-      NULL, NULL, NULL, vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d)
+      vp9_sub_pixel_avg_variance4x4, NULL, NULL, NULL,
+      vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d)
 
   cpi->full_search_sad = vp9_full_search_sad;
   cpi->diamond_search_sad = vp9_diamond_search_sad;
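
With the table above filled in, the joint search reaches the per-block-size
code only through these function pointers. An illustrative fragment (not
taken from the patch; assumes pred, second_pred and the sub-pel offsets were
set up by the caller, as the DIST macro in vp9_mcomp.c does):

    const vp9_variance_fn_ptr_t *vfp = &cpi->fn_ptr[BLOCK_16X16];
    unsigned int sse;
    // Error of the candidate: average the sub-pel prediction with the
    // second predictor, then take the variance against the source block.
    unsigned int err = vfp->svaf(pred, pred_stride, xoffset, yoffset,
                                 src, src_stride, &sse, second_pred);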
index 1b143f5..24b81c0 100644 (file)
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -1807,8 +1807,9 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                  INTERPOLATIONFILTERTYPE *best_filter,
                                  int_mv frame_mv[MB_MODE_COUNT]
                                                 [MAX_REF_FRAMES],
-                                 YV12_BUFFER_CONFIG *scaled_ref_frame,
-                                 int mi_row, int mi_col) {
+                                 YV12_BUFFER_CONFIG **scaled_ref_frame,
+                                 int mi_row, int mi_col,
+                                 int_mv single_newmv[MAX_REF_FRAMES]) {
   const int bw = 1 << mi_width_log2(bsize), bh = 1 << mi_height_log2(bsize);
 
   VP9_COMMON *cm = &cpi->common;
@@ -1838,6 +1839,152 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
       ref_mv[1] = mbmi->ref_mvs[refs[1]][0];
 
       if (is_comp_pred) {
+#if CONFIG_COMP_INTER_JOINT_SEARCH
+        const int b_sz[BLOCK_SIZE_TYPES][2] = {
+            {4, 4},
+            {8, 8},
+            {8, 16},
+            {16, 8},
+            {16, 16},
+            {16, 32},
+            {32, 16},
+            {32, 32},
+            {32, 64},
+            {64, 32},
+            {64, 64}
+        };
+
+        int ite;
+        // Prediction buffer from second frame.
+        uint8_t *second_pred = vpx_memalign(16, b_sz[bsize][0] *
+                                            b_sz[bsize][1] * sizeof(uint8_t));
+
+        // Do joint motion search in compound mode to get more accurate mv.
+        struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};
+        struct buf_2d backup_second_yv12[MAX_MB_PLANE] = {{0}};
+        struct buf_2d scaled_first_yv12;
+
+        if (scaled_ref_frame[0]) {
+          int i;
+
+          // Swap out the reference frame for a version that's been scaled to
+          // match the resolution of the current frame, allowing the existing
+          // motion search code to be used without additional modifications.
+          for (i = 0; i < MAX_MB_PLANE; i++)
+            backup_yv12[i] = xd->plane[i].pre[0];
+
+          setup_pre_planes(xd, scaled_ref_frame[0], NULL, mi_row, mi_col,
+                           NULL, NULL);
+        }
+
+        if (scaled_ref_frame[1]) {
+          int i;
+
+          for (i = 0; i < MAX_MB_PLANE; i++)
+            backup_second_yv12[i] = xd->plane[i].pre[1];
+
+          setup_pre_planes(xd, scaled_ref_frame[1], NULL, mi_row, mi_col,
+                           NULL, NULL);
+        }
+        xd->scale_factor[0].set_scaled_offsets(&xd->scale_factor[0],
+                                                mi_row, mi_col);
+        xd->scale_factor[1].set_scaled_offsets(&xd->scale_factor[1],
+                                                mi_row, mi_col);
+
+        scaled_first_yv12 = xd->plane[0].pre[0];
+
+        // Initialize mv using single prediction mode result.
+        frame_mv[NEWMV][refs[0]].as_int = single_newmv[refs[0]].as_int;
+        frame_mv[NEWMV][refs[1]].as_int = single_newmv[refs[1]].as_int;
+
+        // Iteration: joint search is done once for each ref frame.
+        // Tried allowing search multiple times iteratively, and break out if
+        // it couldn't find better mv. But tests didn't show noticeable
+        // improvement.
+        for (ite = 0; ite < 2; ite++) {
+          struct buf_2d ref_yv12[2] = {xd->plane[0].pre[0],
+                                       xd->plane[0].pre[1]};
+          int bestsme = INT_MAX;
+          int sadpb = x->sadperbit16;
+          int_mv tmp_mv;
+          int search_range = 3;
+
+          int tmp_col_min = x->mv_col_min;
+          int tmp_col_max = x->mv_col_max;
+          int tmp_row_min = x->mv_row_min;
+          int tmp_row_max = x->mv_row_max;
+          int id = ite % 2;
+
+          // Get pred block from second frame.
+          vp9_build_inter_predictor(ref_yv12[!id].buf,
+                                    ref_yv12[!id].stride,
+                                    second_pred, b_sz[bsize][0],
+                                    &frame_mv[NEWMV][refs[!id]],
+                                    &xd->scale_factor[!id],
+                                    b_sz[bsize][0], b_sz[bsize][1], 0,
+                                    &xd->subpix);
+
+          // Compound motion search on first ref frame.
+          if (id)
+            xd->plane[0].pre[0] = ref_yv12[id];
+          vp9_clamp_mv_min_max(x, &ref_mv[id]);
+
+          // Use mv result from single mode as mvp.
+          tmp_mv.as_int = frame_mv[NEWMV][refs[id]].as_int;
+
+          tmp_mv.as_mv.col >>= 3;
+          tmp_mv.as_mv.row >>= 3;
+
+          // Small-range full-pixel motion search
+          bestsme = vp9_refining_search_8p_c(x, &tmp_mv, sadpb,
+                                             search_range,
+                                             &cpi->fn_ptr[block_size],
+                                             x->nmvjointcost, x->mvcost,
+                                             &ref_mv[id], second_pred,
+                                             b_sz[bsize][0], b_sz[bsize][1]);
+
+          x->mv_col_min = tmp_col_min;
+          x->mv_col_max = tmp_col_max;
+          x->mv_row_min = tmp_row_min;
+          x->mv_row_max = tmp_row_max;
+
+          if (bestsme < INT_MAX) {
+            int dis; /* TODO: use dis in distortion calculation later. */
+            unsigned int sse;
+
+            vp9_find_best_sub_pixel_comp(x, &tmp_mv,
+                                         &ref_mv[id],
+                                         x->errorperbit,
+                                         &cpi->fn_ptr[block_size],
+                                         x->nmvjointcost, x->mvcost,
+                                         &dis, &sse, second_pred,
+                                         b_sz[bsize][0], b_sz[bsize][1]);
+          }
+
+          frame_mv[NEWMV][refs[id]].as_int =
+              xd->mode_info_context->bmi[0].as_mv[1].as_int = tmp_mv.as_int;
+          if (id)
+            xd->plane[0].pre[0] = scaled_first_yv12;
+        }
+
+        // restore the predictor
+        if (scaled_ref_frame[0]) {
+          int i;
+
+          for (i = 0; i < MAX_MB_PLANE; i++)
+            xd->plane[i].pre[0] = backup_yv12[i];
+        }
+
+        if (scaled_ref_frame[1]) {
+          int i;
+
+          for (i = 0; i < MAX_MB_PLANE; i++)
+            xd->plane[i].pre[1] = backup_second_yv12[i];
+        }
+
+        vpx_free(second_pred);
+#endif  // CONFIG_COMP_INTER_JOINT_SEARCH
+
         if (frame_mv[NEWMV][refs[0]].as_int == INVALID_MV ||
             frame_mv[NEWMV][refs[1]].as_int == INVALID_MV)
           return INT64_MAX;
@@ -1862,7 +2009,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
         int tmp_row_min = x->mv_row_min;
         int tmp_row_max = x->mv_row_max;
 
-        if (scaled_ref_frame) {
+        if (scaled_ref_frame[0]) {
           int i;
 
           // Swap out the reference frame for a version that's been scaled to
@@ -1871,7 +2018,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
           for (i = 0; i < MAX_MB_PLANE; i++)
             backup_yv12[i] = xd->plane[i].pre[0];
 
-          setup_pre_planes(xd, scaled_ref_frame, NULL, mi_row, mi_col,
+          setup_pre_planes(xd, scaled_ref_frame[0], NULL, mi_row, mi_col,
                            NULL, NULL);
         }
 
@@ -1914,6 +2061,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
         }
         frame_mv[NEWMV][refs[0]].as_int =
           xd->mode_info_context->bmi[0].as_mv[0].as_int = tmp_mv.as_int;
+        single_newmv[refs[0]].as_int = tmp_mv.as_int;
 
         // Add the new motion vector cost to our rolling cost variable
         *rate2 += vp9_mv_bit_cost(&tmp_mv, &ref_mv[0],
@@ -1921,7 +2069,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                   96, xd->allow_high_precision_mv);
 
         // restore the predictor, if required
-        if (scaled_ref_frame) {
+        if (scaled_ref_frame[0]) {
           int i;
 
           for (i = 0; i < MAX_MB_PLANE; i++)
@@ -2205,6 +2353,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
   int frame_mdcounts[4][4];
   struct buf_2d yv12_mb[4][MAX_MB_PLANE];
+  int_mv single_newmv[MAX_REF_FRAMES];
   static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
                                     VP9_ALT_FLAG };
   int idx_list[4] = {0,
@@ -2251,6 +2400,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   xd->mode_info_context->mbmi.segment_id = segment_id;
   estimate_ref_frame_costs(cpi, segment_id, ref_costs);
   vpx_memset(&best_mbmode, 0, sizeof(best_mbmode));
+  vpx_memset(&single_newmv, 0, sizeof(single_newmv));
 
   for (i = 0; i < NB_PREDICTION_TYPES; ++i)
     best_pred_rd[i] = INT64_MAX;
@@ -2608,7 +2758,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
           vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_COMP), is_comp_pred);
       mbmi->mode = this_mode;
     } else {
-      YV12_BUFFER_CONFIG *scaled_ref_frame = NULL;
+      YV12_BUFFER_CONFIG *scaled_ref_frame[2] = {NULL, NULL};
       int fb;
 
       if (mbmi->ref_frame == LAST_FRAME) {
@@ -2620,7 +2770,20 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
       }
 
       if (cpi->scaled_ref_idx[fb] != cm->ref_frame_map[fb])
-        scaled_ref_frame = &cm->yv12_fb[cpi->scaled_ref_idx[fb]];
+        scaled_ref_frame[0] = &cm->yv12_fb[cpi->scaled_ref_idx[fb]];
+
+      if (comp_pred) {
+        if (mbmi->second_ref_frame == LAST_FRAME) {
+          fb = cpi->lst_fb_idx;
+        } else if (mbmi->second_ref_frame == GOLDEN_FRAME) {
+          fb = cpi->gld_fb_idx;
+        } else {
+          fb = cpi->alt_fb_idx;
+        }
+
+        if (cpi->scaled_ref_idx[fb] != cm->ref_frame_map[fb])
+          scaled_ref_frame[1] = &cm->yv12_fb[cpi->scaled_ref_idx[fb]];
+      }
 
       this_rd = handle_inter_mode(cpi, x, bsize,
                                   mdcounts, txfm_cache,
@@ -2630,7 +2793,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                                   &rate_uv, &distortion_uv,
                                   &mode_excluded, &disable_skip,
                                   mode_index, &tmp_best_filter, frame_mv,
-                                  scaled_ref_frame, mi_row, mi_col);
+                                  scaled_ref_frame, mi_row, mi_col,
+                                  single_newmv);
       if (this_rd == INT64_MAX)
         continue;
     }
index 13dabbd..306476b 100644 (file)
--- a/vp9/encoder/vp9_variance.h
+++ b/vp9/encoder/vp9_variance.h
@@ -12,6 +12,7 @@
 #define VP9_ENCODER_VP9_VARIANCE_H_
 
 #include "vpx/vpx_integer.h"
+// #include "./vpx_config.h"
 
 typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr,
                                     int source_stride,
@@ -50,6 +51,15 @@ typedef unsigned int (*vp9_subpixvariance_fn_t)(const uint8_t *src_ptr,
                                                 int Refstride,
                                                 unsigned int *sse);
 
+typedef unsigned int (*vp9_subp_avg_variance_fn_t)(const uint8_t *src_ptr,
+                                                   int source_stride,
+                                                   int xoffset,
+                                                   int yoffset,
+                                                   const uint8_t *ref_ptr,
+                                                   int Refstride,
+                                                   unsigned int *sse,
+                                                   const uint8_t *second_pred);
+
 typedef void (*vp9_ssimpf_fn_t)(uint8_t *s, int sp, uint8_t *r,
                                 int rp, unsigned long *sum_s,
                                 unsigned long *sum_r, unsigned long *sum_sq_s,
@@ -64,15 +74,33 @@ typedef unsigned int (*vp9_get16x16prederror_fn_t)(const uint8_t *src_ptr,
                                                    int  ref_stride);
 
 typedef struct vp9_variance_vtable {
-    vp9_sad_fn_t            sdf;
-    vp9_variance_fn_t       vf;
-    vp9_subpixvariance_fn_t svf;
-    vp9_variance_fn_t       svf_halfpix_h;
-    vp9_variance_fn_t       svf_halfpix_v;
-    vp9_variance_fn_t       svf_halfpix_hv;
-    vp9_sad_multi_fn_t      sdx3f;
-    vp9_sad_multi1_fn_t     sdx8f;
-    vp9_sad_multi_d_fn_t    sdx4df;
+    vp9_sad_fn_t               sdf;
+    vp9_variance_fn_t          vf;
+    vp9_subpixvariance_fn_t    svf;
+    vp9_subp_avg_variance_fn_t svaf;
+    vp9_variance_fn_t          svf_halfpix_h;
+    vp9_variance_fn_t          svf_halfpix_v;
+    vp9_variance_fn_t          svf_halfpix_hv;
+    vp9_sad_multi_fn_t         sdx3f;
+    vp9_sad_multi1_fn_t        sdx8f;
+    vp9_sad_multi_d_fn_t       sdx4df;
 } vp9_variance_fn_ptr_t;
 
+// #if CONFIG_COMP_INTER_JOINT_SEARCH
+static void comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int weight,
+                          int height, uint8_t *ref, int ref_stride) {
+  int i, j;
+
+  for (i = 0; i < height; i++) {
+    for (j = 0; j < weight; j++) {
+      int tmp;
+      tmp = pred[j] + ref[j];
+      comp_pred[j] = (tmp + 1) >> 1;
+    }
+    comp_pred += weight;
+    pred += weight;
+    ref += ref_stride;
+  }
+}
+// #endif  // CONFIG_COMP_INTER_JOINT_SEARCH
 #endif  // VP9_ENCODER_VP9_VARIANCE_H_
index c2a6004..fa53abd 100644 (file)
--- a/vp9/encoder/vp9_variance_c.c
+++ b/vp9/encoder/vp9_variance_c.c
@@ -13,6 +13,7 @@
 #include "vp9/common/vp9_filter.h"
 #include "vp9/common/vp9_subpelvar.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
 
 unsigned int vp9_get_mb_ss_c(const int16_t *src_ptr) {
   unsigned int i, sum = 0;
@@ -58,6 +59,29 @@ unsigned int vp9_sub_pixel_variance64x32_c(const uint8_t *src_ptr,
   return vp9_variance64x32_c(temp2, 64, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr,
+                                               int  src_pixels_per_line,
+                                               int  xoffset,
+                                               int  yoffset,
+                                               const uint8_t *dst_ptr,
+                                               int dst_pixels_per_line,
+                                               unsigned int *sse,
+                                               const uint8_t *second_pred) {
+  uint16_t fdata3[65 * 64];  // Temp data buffer used in filtering
+  uint8_t temp2[68 * 64];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 64 * 64);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 33, 64, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 64, 64, 32, 64, vfilter);
+  comp_avg_pred(temp3, second_pred, 64, 32, temp2, 64);
+  return vp9_variance64x32_c(temp3, 64, dst_ptr, dst_pixels_per_line, sse);
+}
+
 unsigned int vp9_variance32x64_c(const uint8_t *src_ptr,
                                  int  source_stride,
                                  const uint8_t *ref_ptr,
@@ -92,6 +116,29 @@ unsigned int vp9_sub_pixel_variance32x64_c(const uint8_t *src_ptr,
   return vp9_variance32x64_c(temp2, 32, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr,
+                                               int  src_pixels_per_line,
+                                               int  xoffset,
+                                               int  yoffset,
+                                               const uint8_t *dst_ptr,
+                                               int dst_pixels_per_line,
+                                               unsigned int *sse,
+                                               const uint8_t *second_pred) {
+  uint16_t fdata3[65 * 64];  // Temp data buffer used in filtering
+  uint8_t temp2[68 * 64];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 32 * 64);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 65, 32, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 64, 32, vfilter);
+  comp_avg_pred(temp3, second_pred, 32, 64, temp2, 32);
+  return vp9_variance32x64_c(temp3, 32, dst_ptr, dst_pixels_per_line, sse);
+}
+
 unsigned int vp9_variance32x16_c(const uint8_t *src_ptr,
                                  int  source_stride,
                                  const uint8_t *ref_ptr,
@@ -126,6 +173,29 @@ unsigned int vp9_sub_pixel_variance32x16_c(const uint8_t *src_ptr,
   return vp9_variance32x16_c(temp2, 32, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr,
+                                               int  src_pixels_per_line,
+                                               int  xoffset,
+                                               int  yoffset,
+                                               const uint8_t *dst_ptr,
+                                               int dst_pixels_per_line,
+                                               unsigned int *sse,
+                                               const uint8_t *second_pred) {
+  uint16_t fdata3[33 * 32];  // Temp data buffer used in filtering
+  uint8_t temp2[36 * 32];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 32 * 16);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 17, 32, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 16, 32, vfilter);
+  comp_avg_pred(temp3, second_pred, 32, 16, temp2, 32);
+  return vp9_variance32x16_c(temp3, 32, dst_ptr, dst_pixels_per_line, sse);
+}
+
 unsigned int vp9_variance16x32_c(const uint8_t *src_ptr,
                                  int  source_stride,
                                  const uint8_t *ref_ptr,
@@ -160,6 +230,29 @@ unsigned int vp9_sub_pixel_variance16x32_c(const uint8_t *src_ptr,
   return vp9_variance16x32_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr,
+                                               int  src_pixels_per_line,
+                                               int  xoffset,
+                                               int  yoffset,
+                                               const uint8_t *dst_ptr,
+                                               int dst_pixels_per_line,
+                                               unsigned int *sse,
+                                               const uint8_t *second_pred) {
+  uint16_t fdata3[33 * 32];  // Temp data buffer used in filtering
+  uint8_t temp2[36 * 32];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 16 * 32);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 33, 16, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 32, 16, vfilter);
+  comp_avg_pred(temp3, second_pred, 16, 32, temp2, 16);
+  return vp9_variance16x32_c(temp3, 16, dst_ptr, dst_pixels_per_line, sse);
+}
+
 unsigned int vp9_variance64x64_c(const uint8_t *src_ptr,
                                  int  source_stride,
                                  const uint8_t *ref_ptr,
@@ -317,6 +410,31 @@ unsigned int vp9_sub_pixel_variance4x4_c(const uint8_t *src_ptr,
   return vp9_variance4x4_c(temp2, 4, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr,
+                                             int  src_pixels_per_line,
+                                             int  xoffset,
+                                             int  yoffset,
+                                             const uint8_t *dst_ptr,
+                                             int dst_pixels_per_line,
+                                             unsigned int *sse,
+                                             const uint8_t *second_pred) {
+  uint8_t temp2[20 * 16];
+  const int16_t *hfilter, *vfilter;
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 4 * 4);  // compound pred buffer
+  uint16_t fdata3[5 * 4];  // Temp data buffer used in filtering
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  // First filter 1d Horizontal
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 5, 4, hfilter);
+
+  // Now filter vertically
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 4,  4,  4,  4, vfilter);
+  comp_avg_pred(temp3, second_pred, 4, 4, temp2, 4);
+  return vp9_variance4x4_c(temp3, 4, dst_ptr, dst_pixels_per_line, sse);
+}
 
 unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr,
                                          int  src_pixels_per_line,
@@ -339,6 +457,29 @@ unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr,
   return vp9_variance8x8_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr,
+                                             int  src_pixels_per_line,
+                                             int  xoffset,
+                                             int  yoffset,
+                                             const uint8_t *dst_ptr,
+                                             int dst_pixels_per_line,
+                                             unsigned int *sse,
+                                             const uint8_t *second_pred) {
+  uint16_t fdata3[9 * 8];  // Temp data buffer used in filtering
+  uint8_t temp2[20 * 16];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 8 * 8);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 9, 8, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 8, 8, vfilter);
+  comp_avg_pred(temp3, second_pred, 8, 8, temp2, 8);
+  return vp9_variance8x8_c(temp3, 8, dst_ptr, dst_pixels_per_line, sse);
+}
+
 unsigned int vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr,
                                            int  src_pixels_per_line,
                                            int  xoffset,
@@ -360,6 +501,30 @@ unsigned int vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr,
   return vp9_variance16x16_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr,
+                                               int  src_pixels_per_line,
+                                               int  xoffset,
+                                               int  yoffset,
+                                               const uint8_t *dst_ptr,
+                                               int dst_pixels_per_line,
+                                               unsigned int *sse,
+                                               const uint8_t *second_pred) {
+  uint16_t fdata3[17 * 16];
+  uint8_t temp2[20 * 16];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 16 * 16);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 17, 16, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 16, 16, vfilter);
+
+  comp_avg_pred(temp3, second_pred, 16, 16, temp2, 16);
+  return vp9_variance16x16_c(temp3, 16, dst_ptr, dst_pixels_per_line, sse);
+}
+
 unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr,
                                            int  src_pixels_per_line,
                                            int  xoffset,
@@ -381,6 +546,29 @@ unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr,
   return vp9_variance64x64_c(temp2, 64, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr,
+                                               int  src_pixels_per_line,
+                                               int  xoffset,
+                                               int  yoffset,
+                                               const uint8_t *dst_ptr,
+                                               int dst_pixels_per_line,
+                                               unsigned int *sse,
+                                               const uint8_t *second_pred) {
+  uint16_t fdata3[65 * 64];  // Temp data buffer used in filtering
+  uint8_t temp2[68 * 64];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 64 * 64);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 65, 64, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 64, 64, 64, 64, vfilter);
+  comp_avg_pred(temp3, second_pred, 64, 64, temp2, 64);
+  return vp9_variance64x64_c(temp3, 64, dst_ptr, dst_pixels_per_line, sse);
+}
+
 unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr,
                                            int  src_pixels_per_line,
                                            int  xoffset,
@@ -402,6 +590,29 @@ unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr,
   return vp9_variance32x32_c(temp2, 32, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr,
+                                               int  src_pixels_per_line,
+                                               int  xoffset,
+                                               int  yoffset,
+                                               const uint8_t *dst_ptr,
+                                               int dst_pixels_per_line,
+                                               unsigned int *sse,
+                                               const uint8_t *second_pred) {
+  uint16_t fdata3[33 * 32];  // Temp data buffer used in filtering
+  uint8_t temp2[36 * 32];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 32 * 32);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 33, 32, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 32, 32, vfilter);
+  comp_avg_pred(temp3, second_pred, 32, 32, temp2, 32);
+  return vp9_variance32x32_c(temp3, 32, dst_ptr, dst_pixels_per_line, sse);
+}
+
 unsigned int vp9_variance_halfpixvar16x16_h_c(const uint8_t *src_ptr,
                                               int  source_stride,
                                               const uint8_t *ref_ptr,
@@ -543,6 +754,29 @@ unsigned int vp9_sub_pixel_variance16x8_c(const uint8_t *src_ptr,
   return vp9_variance16x8_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr,
+                                              int  src_pixels_per_line,
+                                              int  xoffset,
+                                              int  yoffset,
+                                              const uint8_t *dst_ptr,
+                                              int dst_pixels_per_line,
+                                              unsigned int *sse,
+                                              const uint8_t *second_pred) {
+  uint16_t fdata3[16 * 9];  // Temp data buffer used in filtering
+  uint8_t temp2[20 * 16];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 16 * 8);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 9, 16, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 8, 16, vfilter);
+  comp_avg_pred(temp3, second_pred, 16, 8, temp2, 16);
+  return vp9_variance16x8_c(temp3, 16, dst_ptr, dst_pixels_per_line, sse);
+}
+
 unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr,
                                           int  src_pixels_per_line,
                                           int  xoffset,
@@ -564,3 +798,25 @@ unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr,
   return vp9_variance8x16_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr,
+                                              int  src_pixels_per_line,
+                                              int  xoffset,
+                                              int  yoffset,
+                                              const uint8_t *dst_ptr,
+                                              int dst_pixels_per_line,
+                                              unsigned int *sse,
+                                              const uint8_t *second_pred) {
+  uint16_t fdata3[9 * 16];  // Temp data buffer used in filtering
+  uint8_t temp2[20 * 16];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 8 * 16);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 17, 8, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 16, 8, vfilter);
+  comp_avg_pred(temp3, second_pred, 8, 16, temp2, 8);
+  return vp9_variance8x16_c(temp3, 8, dst_ptr, dst_pixels_per_line, sse);
+}