From 8e3d0e4d7db867caa110e96fa0fd1ff9ba37cb9f Mon Sep 17 00:00:00 2001
From: Jingning Han
Date: Wed, 15 May 2013 12:19:59 -0700
Subject: [PATCH] Add building blocks for 4x8/8x4 rd search

These building blocks enable rate-distortion optimization search over
block sizes of 8x4 and 4x8. They still need to be converted into
mmx/sse form.

Change-Id: I570ea2d22d14ceec3fe3575128d7dfa172a577de
---
 vp9/common/vp9_blockd.h      |  2 +-
 vp9/common/vp9_rtcd_defs.sh  | 27 ++++++++++++
 vp9/encoder/vp9_onyx_if.c    | 12 ++++--
 vp9/encoder/vp9_rdopt.c      | 97 ++++++++++++++++++++++++--------------------
 vp9/encoder/vp9_sad_c.c      | 45 ++++++++++++++++++++
 vp9/encoder/vp9_variance_c.c | 88 ++++++++++++++++++++++++++++++++++++++++
 6 files changed, 221 insertions(+), 50 deletions(-)

diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index ad38730..b148b18 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -272,7 +272,7 @@ typedef struct {
 
 typedef struct {
   MB_MODE_INFO mbmi;
-  union b_mode_info bmi[4];
+  union b_mode_info bmi[16];
 } MODE_INFO;
 
 struct scale_factors {
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 48ce7db..b55cc74 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -367,6 +367,19 @@ vp9_sub_pixel_variance8x8_sse2=vp9_sub_pixel_variance8x8_wmt
 prototype unsigned int vp9_sub_pixel_avg_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
 specialize vp9_sub_pixel_avg_variance8x8
 
+# TODO(jingning): need to convert 8x4/4x8 functions into mmx/sse form
+prototype unsigned int vp9_sub_pixel_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_sub_pixel_variance8x4
+
+prototype unsigned int vp9_sub_pixel_avg_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance8x4
+
+prototype unsigned int vp9_sub_pixel_variance4x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_sub_pixel_variance4x8
+
+prototype unsigned int vp9_sub_pixel_avg_variance4x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance4x8
+
 prototype unsigned int vp9_sub_pixel_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance4x4 sse2 mmx
 vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt
@@ -404,6 +417,13 @@ specialize vp9_sad8x16 mmx sse2
 prototype unsigned int vp9_sad8x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
 specialize vp9_sad8x8 mmx sse2
 
+# TODO(jingning): need to convert these functions into mmx/sse2 form
+prototype unsigned int vp9_sad8x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
+specialize vp9_sad8x4
+
+prototype unsigned int vp9_sad4x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
+specialize vp9_sad4x8
+
 prototype unsigned int vp9_sad4x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
 specialize vp9_sad4x4 mmx sse
 
@@ -509,6 +529,13 @@ specialize vp9_sad8x16x4d sse2
 prototype void vp9_sad8x8x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"
 specialize vp9_sad8x8x4d sse2
 
+# TODO(jingning): need to convert these 4x8/8x4 functions into sse2 form
+prototype void vp9_sad8x4x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"
+specialize vp9_sad8x4x4d
+
+prototype void vp9_sad4x8x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"
+specialize vp9_sad4x8x4d
+
 prototype void vp9_sad4x4x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"
 specialize vp9_sad4x4x4d sse
 prototype unsigned int vp9_sub_pixel_mse16x16 "const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse"
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index 67d1b67..2d3fea9 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -1597,11 +1597,15 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
       vp9_sub_pixel_avg_variance8x8, NULL, NULL, NULL,
       vp9_sad8x8x3, vp9_sad8x8x8, vp9_sad8x8x4d)
 
-  BFP(BLOCK_4X8, NULL, vp9_variance4x8, NULL,
-      NULL, NULL, NULL, NULL, NULL, NULL, NULL)
+  BFP(BLOCK_8X4, vp9_sad8x4, vp9_variance8x4, vp9_sub_pixel_variance8x4,
+      vp9_sub_pixel_avg_variance8x4, NULL, NULL,
+      NULL, NULL, NULL,
+      vp9_sad8x4x4d)
 
-  BFP(BLOCK_8X4, NULL, vp9_variance8x4, NULL,
-      NULL, NULL, NULL, NULL, NULL, NULL, NULL)
+  BFP(BLOCK_4X8, vp9_sad4x8, vp9_variance4x8, vp9_sub_pixel_variance4x8,
+      vp9_sub_pixel_avg_variance4x8, NULL, NULL,
+      NULL, NULL, NULL,
+      vp9_sad4x8x4d)
 
   BFP(BLOCK_4X4, vp9_sad4x4, vp9_variance4x4, vp9_sub_pixel_variance4x4,
       vp9_sub_pixel_avg_variance4x4, NULL, NULL, NULL,
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 5097664..f928e7a 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -1096,6 +1096,50 @@ static INLINE int mv_check_bounds(MACROBLOCK *x, int_mv *mv) {
   return r;
 }
 
+static enum BlockSize get_block_size(int bw, int bh) {
+  if (bw == 4 && bh == 4)
+    return BLOCK_4X4;
+
+  if (bw == 4 && bh == 8)
+    return BLOCK_4X8;
+
+  if (bw == 8 && bh == 4)
+    return BLOCK_8X4;
+
+  if (bw == 8 && bh == 8)
+    return BLOCK_8X8;
+
+  if (bw == 8 && bh == 16)
+    return BLOCK_8X16;
+
+  if (bw == 16 && bh == 8)
+    return BLOCK_16X8;
+
+  if (bw == 16 && bh == 16)
+    return BLOCK_16X16;
+
+  if (bw == 32 && bh == 32)
+    return BLOCK_32X32;
+
+  if (bw == 32 && bh == 16)
+    return BLOCK_32X16;
+
+  if (bw == 16 && bh == 32)
+    return BLOCK_16X32;
+
+  if (bw == 64 && bh == 32)
+    return BLOCK_64X32;
+
+  if (bw == 32 && bh == 64)
+    return BLOCK_32X64;
+
+  if (bw == 64 && bh == 64)
+    return BLOCK_64X64;
+
+  assert(0);
+  return -1;
+}
+
 static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
                                     BEST_SEG_INFO *bsi,
                                     int_mv seg_mvs[4][MAX_REF_FRAMES - 1]) {
@@ -1111,6 +1155,10 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
   int sbr = 0, sbd = 0;
   int segmentyrate = 0;
   int best_eobs[4] = { 0 };
+#if CONFIG_AB4X4
+  BLOCK_SIZE_TYPE bsize = mbmi->sb_type;
+  int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
+#endif
 
   vp9_variance_fn_ptr_t *v_fn_ptr;
 
@@ -1120,7 +1168,11 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
   vpx_memcpy(t_above, x->e_mbd.plane[0].above_context, sizeof(t_above));
   vpx_memcpy(t_left, x->e_mbd.plane[0].left_context, sizeof(t_left));
 
+#if CONFIG_AB4X4
+  v_fn_ptr = &cpi->fn_ptr[get_block_size(4 << bwl, 4 << bhl)];
+#else
   v_fn_ptr = &cpi->fn_ptr[BLOCK_4X4];
+#endif
 
   // 64 makes this threshold really big effectively
   // making it so that we very rarely check mvs on
@@ -1670,51 +1722,6 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
                       frame_type, block_size);
 }
-
-static enum BlockSize get_block_size(int bw, int bh) {
-  if (bw == 4 && bh == 4)
-    return BLOCK_4X4;
-
-  if (bw == 4 && bh == 8)
-    return BLOCK_4X8;
-
-  if (bw == 8 && bh == 4)
-    return BLOCK_8X4;
-
-  if (bw == 8 && bh == 8)
-    return BLOCK_8X8;
-
-  if (bw == 8 && bh == 16)
-    return BLOCK_8X16;
-
-  if (bw == 16 && bh == 8)
-    return BLOCK_16X8;
-
-  if (bw == 16 && bh == 16)
-    return BLOCK_16X16;
-
-  if (bw == 32 && bh == 32)
-    return BLOCK_32X32;
-
-  if (bw == 32 && bh == 16)
-    return BLOCK_32X16;
-
-  if (bw == 16 && bh == 32)
-    return BLOCK_16X32;
-
-  if (bw == 64 && bh == 32)
-    return BLOCK_64X32;
-
-  if (bw == 32 && bh == 64)
-    return BLOCK_32X64;
-
-  if (bw == 64 && bh == 64)
-    return BLOCK_64X64;
-
-  assert(0);
-  return -1;
-}
-
 static void model_rd_from_var_lapndz(int var, int n, int qstep, int *rate,
                                      int *dist) {
   // This function models the rate and distortion for a Laplacian
diff --git a/vp9/encoder/vp9_sad_c.c b/vp9/encoder/vp9_sad_c.c
index b4cd193..994828f 100644
--- a/vp9/encoder/vp9_sad_c.c
+++ b/vp9/encoder/vp9_sad_c.c
@@ -156,6 +156,21 @@ unsigned int vp9_sad8x16_c(const uint8_t *src_ptr,
   return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 16);
 }
 
+unsigned int vp9_sad8x4_c(const uint8_t *src_ptr,
+                          int src_stride,
+                          const uint8_t *ref_ptr,
+                          int ref_stride,
+                          unsigned int max_sad) {
+  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 4);
+}
+
+unsigned int vp9_sad4x8_c(const uint8_t *src_ptr,
+                          int src_stride,
+                          const uint8_t *ref_ptr,
+                          int ref_stride,
+                          unsigned int max_sad) {
+  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 4, 8);
+}
 
 unsigned int vp9_sad4x4_c(const uint8_t *src_ptr,
                           int src_stride,
@@ -563,6 +578,36 @@ void vp9_sad8x16x4d_c(const uint8_t *src_ptr,
                        ref_ptr[3], ref_stride, 0x7fffffff);
 }
 
+void vp9_sad8x4x4d_c(const uint8_t *src_ptr,
+                     int src_stride,
+                     const uint8_t* const ref_ptr[],
+                     int ref_stride,
+                     unsigned int *sad_array) {
+  sad_array[0] = vp9_sad8x4(src_ptr, src_stride,
+                            ref_ptr[0], ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad8x4(src_ptr, src_stride,
+                            ref_ptr[1], ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad8x4(src_ptr, src_stride,
+                            ref_ptr[2], ref_stride, 0x7fffffff);
+  sad_array[3] = vp9_sad8x4(src_ptr, src_stride,
+                            ref_ptr[3], ref_stride, 0x7fffffff);
+}
+
+void vp9_sad4x8x4d_c(const uint8_t *src_ptr,
+                     int src_stride,
+                     const uint8_t* const ref_ptr[],
+                     int ref_stride,
+                     unsigned int *sad_array) {
+  sad_array[0] = vp9_sad4x8(src_ptr, src_stride,
+                            ref_ptr[0], ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad4x8(src_ptr, src_stride,
+                            ref_ptr[1], ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad4x8(src_ptr, src_stride,
+                            ref_ptr[2], ref_stride, 0x7fffffff);
+  sad_array[3] = vp9_sad4x8(src_ptr, src_stride,
+                            ref_ptr[3], ref_stride, 0x7fffffff);
+}
+
 void vp9_sad4x4x4d_c(const uint8_t *src_ptr,
                      int src_stride,
                      const uint8_t* const ref_ptr[],
diff --git a/vp9/encoder/vp9_variance_c.c b/vp9/encoder/vp9_variance_c.c
index fa53abd..e24a46b 100644
--- a/vp9/encoder/vp9_variance_c.c
+++ b/vp9/encoder/vp9_variance_c.c
@@ -820,3 +820,91 @@ unsigned int vp9_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr,
   comp_avg_pred(temp3, second_pred, 8, 16, temp2, 8);
   return vp9_variance8x16_c(temp3, 8, dst_ptr, dst_pixels_per_line, sse);
 }
+
+unsigned int vp9_sub_pixel_variance8x4_c(const uint8_t *src_ptr,
+                                         int src_pixels_per_line,
+                                         int xoffset,
+                                         int yoffset,
+                                         const uint8_t *dst_ptr,
+                                         int dst_pixels_per_line,
+                                         unsigned int *sse) {
+  uint16_t fdata3[8 * 5];  // Temp data buffer used in filtering
+  uint8_t temp2[20 * 16];
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 5, 8, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 4, 8, vfilter);
+
+  return vp9_variance8x4_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
+}
+
+unsigned int vp9_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr,
+                                             int src_pixels_per_line,
+                                             int xoffset,
+                                             int yoffset,
+                                             const uint8_t *dst_ptr,
+                                             int dst_pixels_per_line,
+                                             unsigned int *sse,
+                                             const uint8_t *second_pred) {
+  uint16_t fdata3[8 * 5];  // Temp data buffer used in filtering
+  uint8_t temp2[20 * 16];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 8 * 4);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 5, 8, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 4, 8, vfilter);
+  comp_avg_pred(temp3, second_pred, 8, 4, temp2, 8);
+  return vp9_variance8x4_c(temp3, 8, dst_ptr, dst_pixels_per_line, sse);
+}
+
+unsigned int vp9_sub_pixel_variance4x8_c(const uint8_t *src_ptr,
+                                         int src_pixels_per_line,
+                                         int xoffset,
+                                         int yoffset,
+                                         const uint8_t *dst_ptr,
+                                         int dst_pixels_per_line,
+                                         unsigned int *sse) {
+  uint16_t fdata3[5 * 8];  // Temp data buffer used in filtering
+  uint8_t temp2[20 * 16];
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 9, 4, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 8, 4, vfilter);
+
+  return vp9_variance4x8_c(temp2, 4, dst_ptr, dst_pixels_per_line, sse);
+}
+
+unsigned int vp9_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr,
+                                             int src_pixels_per_line,
+                                             int xoffset,
+                                             int yoffset,
+                                             const uint8_t *dst_ptr,
+                                             int dst_pixels_per_line,
+                                             unsigned int *sse,
+                                             const uint8_t *second_pred) {
+  uint16_t fdata3[5 * 8];  // Temp data buffer used in filtering
+  uint8_t temp2[20 * 16];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 4 * 8);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 9, 4, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 8, 4, vfilter);
+  comp_avg_pred(temp3, second_pred, 4, 8, temp2, 4);
+  return vp9_variance4x8_c(temp3, 4, dst_ptr, dst_pixels_per_line, sse);
+}
-- 
2.7.4
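
Note (not part of the patch): the new 8x4/4x8 entry points above are C-only, and the TODOs in vp9_rtcd_defs.sh call for converting them to mmx/sse2. Below is a minimal, untested sketch of how vp9_sad8x4 might look in SSE2 using psadbw; the function name vp9_sad8x4_sse2 follows the rtcd naming convention but is an assumption here, not the eventual libvpx implementation.

#include <emmintrin.h>  /* SSE2 intrinsics */
#include <stdint.h>

/* Hypothetical SSE2 counterpart of the vp9_sad8x4 prototype added above.
 * max_sad is accepted but ignored, matching the behavior of vp9_sad8x4_c. */
unsigned int vp9_sad8x4_sse2(const uint8_t *src_ptr, int src_stride,
                             const uint8_t *ref_ptr, int ref_stride,
                             unsigned int max_sad) {
  __m128i sum = _mm_setzero_si128();
  int row;
  (void)max_sad;

  for (row = 0; row < 4; row += 2) {
    /* Pack two 8-byte rows of source and reference into one register each. */
    const __m128i s = _mm_unpacklo_epi64(
        _mm_loadl_epi64((const __m128i *)src_ptr),
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride)));
    const __m128i r = _mm_unpacklo_epi64(
        _mm_loadl_epi64((const __m128i *)ref_ptr),
        _mm_loadl_epi64((const __m128i *)(ref_ptr + ref_stride)));
    /* psadbw produces one partial SAD per 64-bit half; accumulate them. */
    sum = _mm_add_epi32(sum, _mm_sad_epu8(s, r));
    src_ptr += 2 * src_stride;
    ref_ptr += 2 * ref_stride;
  }
  /* Fold the upper partial sum into the lower one and extract the total. */
  sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
  return (unsigned int)_mm_cvtsi128_si32(sum);
}

An actual conversion would also register the optimized version through vp9_rtcd_defs.sh (e.g. "specialize vp9_sad8x4 sse2") so the run-time dispatcher selects it, as the existing 8x8 and 4x4 entries do.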