Make RD superblock mode search size-agnostic.
author Ronald S. Bultje <rbultje@google.com>
Wed, 10 Apr 2013 22:55:59 +0000 (15:55 -0700)
committer Ronald S. Bultje <rbultje@google.com>
Wed, 10 Apr 2013 23:50:30 +0000 (16:50 -0700)
Merge various super_block_yrd and super_block_uvrd versions into one
common function that works for all sizes. Make transform size selection
size-agnostic also. This fixes a slight bug in the intra UV superblock
code where it used the wrong transform size for txsz > 8x8, and stores
the txsz selection for superblocks properly (instead of forgetting it).
Lastly, it removes the trellis search that was done for 16x16 intra
predictors, since trellis is relatively expensive and should thus only
be done after RD mode selection.

Gives basically identical results on derf (+0.009%).

Change-Id: If4485c6f0a0fe4038b3172f7a238477c35a6f8d3

vp9/common/vp9_rtcd_defs.sh
vp9/encoder/vp9_encodeframe.c
vp9/encoder/vp9_rdopt.c
vp9/encoder/vp9_rdopt.h
vp9/encoder/x86/vp9_encodeopt.asm
vp9/encoder/x86/vp9_x86_csystemdependent.c

diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index d98b947..a6a2af0 100644
@@ -583,9 +583,6 @@ specialize vp9_sub_pixel_mse32x32
 prototype unsigned int vp9_get_mb_ss "const int16_t *"
 specialize vp9_get_mb_ss mmx sse2
 # ENCODEMB INVOKE
-prototype int vp9_mbblock_error "struct macroblock *mb"
-specialize vp9_mbblock_error mmx sse2
-vp9_mbblock_error_sse2=vp9_mbblock_error_xmm
 
 prototype int vp9_block_error "int16_t *coeff, int16_t *dqcoeff, int block_size"
 specialize vp9_block_error mmx sse2
@@ -594,9 +591,6 @@ vp9_block_error_sse2=vp9_block_error_xmm
 prototype void vp9_subtract_b "struct block *be, struct blockd *bd, int pitch"
 specialize vp9_subtract_b mmx sse2
 
-prototype int vp9_mbuverror "struct macroblock *mb"
-specialize vp9_mbuverror
-
 prototype void vp9_subtract_b "struct block *be, struct blockd *bd, int pitch"
 specialize vp9_subtract_b mmx sse2
 
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 19bc168..c64b514 100644
@@ -840,15 +840,15 @@ static void pick_sb_modes(VP9_COMP *cpi,
   /* Find best coding mode & reconstruct the MB so it is available
    * as a predictor for MBs that follow in the SB */
   if (cm->frame_type == KEY_FRAME) {
-    vp9_rd_pick_intra_mode_sb32(cpi, x,
-                                totalrate,
-                                totaldist);
+    vp9_rd_pick_intra_mode_sb(cpi, x, totalrate, totaldist,
+                              BLOCK_SIZE_SB32X32);
 
     /* Save the coding context */
     vpx_memcpy(&x->sb32_context[xd->sb_index].mic, xd->mode_info_context,
                sizeof(MODE_INFO));
   } else {
-    vp9_rd_pick_inter_mode_sb32(cpi, x, mb_row, mb_col, totalrate, totaldist);
+    vp9_rd_pick_inter_mode_sb(cpi, x, mb_row, mb_col, totalrate, totaldist,
+                              BLOCK_SIZE_SB32X32);
   }
 }
 
@@ -870,12 +870,14 @@ static void pick_sb64_modes(VP9_COMP *cpi,
   /* Find best coding mode & reconstruct the MB so it is available
    * as a predictor for MBs that follow in the SB */
   if (cm->frame_type == KEY_FRAME) {
-    vp9_rd_pick_intra_mode_sb64(cpi, x, totalrate, totaldist);
+    vp9_rd_pick_intra_mode_sb(cpi, x, totalrate, totaldist,
+                              BLOCK_SIZE_SB64X64);
 
     /* Save the coding context */
     vpx_memcpy(&x->sb64_context.mic, xd->mode_info_context, sizeof(MODE_INFO));
   } else {
-    vp9_rd_pick_inter_mode_sb64(cpi, x, mb_row, mb_col, totalrate, totaldist);
+    vp9_rd_pick_inter_mode_sb(cpi, x, mb_row, mb_col, totalrate, totaldist,
+                              BLOCK_SIZE_SB64X64);
   }
 }
 
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 4df1170..82592f3 100644
@@ -347,42 +347,6 @@ int vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff, int block_size) {
   return error;
 }
 
-int vp9_mbblock_error_c(MACROBLOCK *mb) {
-  MACROBLOCKD * const xd = &mb->e_mbd;
-  BLOCK  *be;
-  int i;
-  int error = 0;
-
-  for (i = 0; i < 16; i++) {
-    be = &mb->block[i];
-    error += vp9_block_error(be->coeff,
-                             BLOCK_OFFSET(xd->plane[0].dqcoeff, i, 16), 16);
-  }
-  return error;
-}
-
-int vp9_mbuverror_c(MACROBLOCK *mb) {
-  MACROBLOCKD * const xd = &mb->e_mbd;
-  BLOCK  *be;
-
-  int i, error = 0;
-
-  for (i = 16; i < 20; i++) {
-    be = &mb->block[i];
-    error += vp9_block_error(be->coeff,
-                             BLOCK_OFFSET(xd->plane[1].dqcoeff, i - 16, 16),
-                             16);
-  }
-  for (i = 20; i < 24; i++) {
-    be = &mb->block[i];
-    error += vp9_block_error(be->coeff,
-                             BLOCK_OFFSET(xd->plane[2].dqcoeff, i - 20, 16),
-                             16);
-  }
-
-  return error;
-}
-
 int vp9_uvsse(MACROBLOCK *x) {
   uint8_t *uptr, *vptr;
   uint8_t *upred_ptr = (*(x->block[16].base_src) + x->block[16].src);
@@ -635,109 +599,6 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
   return cost;
 }
 
-static int rdcost_mby_4x4(VP9_COMMON *const cm, MACROBLOCK *mb) {
-  int cost = 0;
-  int b;
-  MACROBLOCKD *xd = &mb->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *)&t_above;
-  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *)&t_left;
-
-  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));
-  vpx_memcpy(&t_left, xd->left_context, sizeof(t_left));
-
-  for (b = 0; b < 16; b++)
-    cost += cost_coeffs(cm, mb, b, PLANE_TYPE_Y_WITH_DC,
-                        ta + vp9_block2above[TX_4X4][b],
-                        tl + vp9_block2left[TX_4X4][b],
-                        TX_4X4, 16);
-
-  return cost;
-}
-
-static void macro_block_yrd_4x4(VP9_COMMON *const cm,
-                                MACROBLOCK *mb,
-                                int *rate,
-                                int *distortion,
-                                int *skippable) {
-  MACROBLOCKD *const xd = &mb->e_mbd;
-
-  xd->mode_info_context->mbmi.txfm_size = TX_4X4;
-  vp9_transform_mby_4x4(mb);
-  vp9_quantize_mby_4x4(mb);
-
-  *distortion = vp9_mbblock_error(mb) >> 2;
-  *rate = rdcost_mby_4x4(cm, mb);
-  *skippable = vp9_mby_is_skippable_4x4(xd);
-}
-
-static int rdcost_mby_8x8(VP9_COMMON *const cm, MACROBLOCK *mb) {
-  int cost = 0;
-  int b;
-  MACROBLOCKD *xd = &mb->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *)&t_above;
-  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *)&t_left;
-
-  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));
-  vpx_memcpy(&t_left,  xd->left_context, sizeof(t_left));
-
-  for (b = 0; b < 16; b += 4)
-    cost += cost_coeffs(cm, mb, b, PLANE_TYPE_Y_WITH_DC,
-                        ta + vp9_block2above[TX_8X8][b],
-                        tl + vp9_block2left[TX_8X8][b],
-                        TX_8X8, 16);
-
-  return cost;
-}
-
-static void macro_block_yrd_8x8(VP9_COMMON *const cm,
-                                MACROBLOCK *mb,
-                                int *rate,
-                                int *distortion,
-                                int *skippable) {
-  MACROBLOCKD *const xd = &mb->e_mbd;
-
-  xd->mode_info_context->mbmi.txfm_size = TX_8X8;
-  vp9_transform_mby_8x8(mb);
-  vp9_quantize_mby_8x8(mb);
-
-  *distortion = vp9_mbblock_error(mb) >> 2;
-  *rate = rdcost_mby_8x8(cm, mb);
-  *skippable = vp9_mby_is_skippable_8x8(xd);
-}
-
-static int rdcost_mby_16x16(VP9_COMMON *const cm, MACROBLOCK *mb) {
-  MACROBLOCKD *const xd = &mb->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *)&t_above;
-  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *)&t_left;
-
-  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));
-  vpx_memcpy(&t_left, xd->left_context, sizeof(t_left));
-
-  return cost_coeffs(cm, mb, 0, PLANE_TYPE_Y_WITH_DC, ta, tl, TX_16X16, 16);
-}
-
-static void macro_block_yrd_16x16(VP9_COMMON *const cm, MACROBLOCK *mb,
-                                  int *rate, int *distortion, int *skippable) {
-  MACROBLOCKD *const xd = &mb->e_mbd;
-
-  xd->mode_info_context->mbmi.txfm_size = TX_16X16;
-  vp9_transform_mby_16x16(mb);
-  vp9_quantize_mby_16x16(mb);
-  // TODO(jingning) is it possible to quickly determine whether to force
-  //                trailing coefficients to be zero, instead of running trellis
-  //                optimization in the rate-distortion optimization loop?
-  if (mb->optimize &&
-      xd->mode_info_context->mbmi.mode < I8X8_PRED)
-    vp9_optimize_mby_16x16(cm, mb);
-
-  *distortion = vp9_mbblock_error(mb) >> 2;
-  *rate = rdcost_mby_16x16(cm, mb);
-  *skippable = vp9_mby_is_skippable_16x16(xd);
-}
-
 static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
                                      int (*r)[2], int *rate,
                                      int *d, int *distortion,
@@ -823,24 +684,6 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
                                  rd[TX_4X4][1] : rd[TX_8X8][1];
 }
 
-static void macro_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
-                            int *distortion, int *skippable,
-                            int64_t txfm_cache[NB_TXFM_MODES]) {
-  VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int r[TX_SIZE_MAX_MB][2], d[TX_SIZE_MAX_MB], s[TX_SIZE_MAX_MB];
-
-  vp9_subtract_mby(x->src_diff, *(x->block[0].base_src), xd->predictor,
-                   x->block[0].src_stride);
-
-  macro_block_yrd_16x16(cm, x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16]);
-  macro_block_yrd_8x8(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8]);
-  macro_block_yrd_4x4(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4]);
-
-  choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, skippable,
-                           txfm_cache, TX_16X16);
-}
-
 static void copy_predictor(uint8_t *dst, const uint8_t *predictor) {
   const unsigned int *p = (const unsigned int *)predictor;
   unsigned int *d = (unsigned int *)dst;
@@ -884,290 +727,191 @@ static int vp9_sb_uv_block_error_c(int16_t *coeff,
   return error > INT_MAX ? INT_MAX : (int)error;
 }
 
-static int rdcost_sby_4x4(VP9_COMMON *const cm, MACROBLOCK *x) {
+static int rdcost_sby_4x4(VP9_COMMON *const cm, MACROBLOCK *x,
+                          BLOCK_SIZE_TYPE bsize) {
+  const int bwl = mb_width_log2(bsize) + 2, bw = 1 << bwl;
+  const int bh = 1 << (mb_height_log2(bsize) + 2);
   int cost = 0, b;
   MACROBLOCKD *const xd = &x->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
-  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above;
-  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) &t_left;
+  ENTROPY_CONTEXT_PLANES t_above[4], t_left[4];
 
-  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));
-  vpx_memcpy(&t_left,  xd->left_context,  sizeof(t_left));
+  vpx_memcpy(&t_above, xd->above_context,
+             (sizeof(ENTROPY_CONTEXT_PLANES) * bw) >> 2);
+  vpx_memcpy(&t_left,  xd->left_context,
+             (sizeof(ENTROPY_CONTEXT_PLANES) * bh) >> 2);
 
-  for (b = 0; b < 64; b++)
+  for (b = 0; b < bw * bh; b++) {
+    const int x_idx = b & (bw - 1), y_idx = b >> bwl;
     cost += cost_coeffs(cm, x, b, PLANE_TYPE_Y_WITH_DC,
-                        ta + vp9_block2above_sb[TX_4X4][b],
-                        tl + vp9_block2left_sb[TX_4X4][b], TX_4X4, 64);
+                ((ENTROPY_CONTEXT *) &t_above[x_idx >> 2]) + (x_idx & 3),
+                ((ENTROPY_CONTEXT *) &t_left[y_idx >> 2]) + (y_idx & 3),
+                TX_4X4, bw * bh);
+  }
 
   return cost;
 }
 
 static void super_block_yrd_4x4(VP9_COMMON *const cm, MACROBLOCK *x,
-                                int *rate, int *distortion, int *skippable) {
+                                int *rate, int *distortion, int *skippable,
+                                BLOCK_SIZE_TYPE bsize) {
+  const int bwl = mb_width_log2(bsize) + 2, bhl = mb_height_log2(bsize) + 2;
   MACROBLOCKD *const xd = &x->e_mbd;
 
   xd->mode_info_context->mbmi.txfm_size = TX_4X4;
-  vp9_transform_sby_4x4(x, BLOCK_SIZE_SB32X32);
-  vp9_quantize_sby_4x4(x, BLOCK_SIZE_SB32X32);
+  vp9_transform_sby_4x4(x, bsize);
+  vp9_quantize_sby_4x4(x, bsize);
 
-  *distortion = vp9_sb_block_error_c(x->coeff, xd->plane[0].dqcoeff, 1024, 2);
-  *rate       = rdcost_sby_4x4(cm, x);
-  *skippable  = vp9_sby_is_skippable(xd, BLOCK_SIZE_SB32X32, TX_4X4);
+  *distortion = vp9_sb_block_error_c(x->coeff, xd->plane[0].dqcoeff,
+                                     16 << (bwl + bhl), 2);
+  *rate       = rdcost_sby_4x4(cm, x, bsize);
+  *skippable  = vp9_sby_is_skippable(xd, bsize, TX_4X4);
 }
 
-static int rdcost_sby_8x8(VP9_COMMON *const cm, MACROBLOCK *x) {
+static int rdcost_sby_8x8(VP9_COMMON *const cm, MACROBLOCK *x,
+                          BLOCK_SIZE_TYPE bsize) {
+  const int bwl = mb_width_log2(bsize) + 1, bw = 1 << bwl;
+  const int bh = 1 << (mb_height_log2(bsize) + 1);
   int cost = 0, b;
   MACROBLOCKD *const xd = &x->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
-  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above;
-  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) &t_left;
-
-  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));
-  vpx_memcpy(&t_left,  xd->left_context,  sizeof(t_left));
+  ENTROPY_CONTEXT_PLANES t_above[4], t_left[4];
 
-  for (b = 0; b < 64; b += 4)
-    cost += cost_coeffs(cm, x, b, PLANE_TYPE_Y_WITH_DC,
-                        ta + vp9_block2above_sb[TX_8X8][b],
-                        tl + vp9_block2left_sb[TX_8X8][b], TX_8X8, 64);
+  vpx_memcpy(&t_above, xd->above_context,
+             (sizeof(ENTROPY_CONTEXT_PLANES) * bw) >> 1);
+  vpx_memcpy(&t_left,  xd->left_context,
+             (sizeof(ENTROPY_CONTEXT_PLANES) * bh) >> 1);
+
+  for (b = 0; b < bw * bh; b++) {
+    const int x_idx = b & (bw - 1), y_idx = b >> bwl;
+    cost += cost_coeffs(cm, x, b * 4, PLANE_TYPE_Y_WITH_DC,
+                ((ENTROPY_CONTEXT *) &t_above[x_idx >> 1]) + ((x_idx & 1) << 1),
+                ((ENTROPY_CONTEXT *) &t_left[y_idx >> 1]) + ((y_idx & 1) << 1),
+                TX_8X8, 4 * bw * bh);
+  }
 
   return cost;
 }
 
 static void super_block_yrd_8x8(VP9_COMMON *const cm, MACROBLOCK *x,
-                                int *rate, int *distortion, int *skippable) {
+                                int *rate, int *distortion, int *skippable,
+                                BLOCK_SIZE_TYPE bsize) {
+  const int bwl = mb_width_log2(bsize) + 1, bhl = mb_height_log2(bsize) + 1;
   MACROBLOCKD *const xd = &x->e_mbd;
 
   xd->mode_info_context->mbmi.txfm_size = TX_8X8;
-  vp9_transform_sby_8x8(x, BLOCK_SIZE_SB32X32);
-  vp9_quantize_sby_8x8(x, BLOCK_SIZE_SB32X32);
+  vp9_transform_sby_8x8(x, bsize);
+  vp9_quantize_sby_8x8(x, bsize);
 
-  *distortion = vp9_sb_block_error_c(x->coeff, xd->plane[0].dqcoeff, 1024, 2);
-  *rate       = rdcost_sby_8x8(cm, x);
-  *skippable  = vp9_sby_is_skippable(xd, BLOCK_SIZE_SB32X32, TX_8X8);
+  *distortion = vp9_sb_block_error_c(x->coeff, xd->plane[0].dqcoeff,
+                                     64 << (bhl + bwl), 2);
+  *rate       = rdcost_sby_8x8(cm, x, bsize);
+  *skippable  = vp9_sby_is_skippable(xd, bsize, TX_8X8);
 }
 
-static int rdcost_sby_16x16(VP9_COMMON *const cm, MACROBLOCK *x) {
+static int rdcost_sby_16x16(VP9_COMMON *const cm, MACROBLOCK *x,
+                            BLOCK_SIZE_TYPE bsize) {
+  const int bwl = mb_width_log2(bsize), bw = 1 << bwl;
+  const int bh = 1 << mb_height_log2(bsize);
   int cost = 0, b;
   MACROBLOCKD *const xd = &x->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
-  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above;
-  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) &t_left;
+  ENTROPY_CONTEXT_PLANES t_above[4], t_left[4];
 
-  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));
-  vpx_memcpy(&t_left,  xd->left_context,  sizeof(t_left));
+  vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES) * bw);
+  vpx_memcpy(&t_left,  xd->left_context,  sizeof(ENTROPY_CONTEXT_PLANES) * bh);
 
-  for (b = 0; b < 64; b += 16)
-    cost += cost_coeffs(cm, x, b, PLANE_TYPE_Y_WITH_DC,
-                        ta + vp9_block2above_sb[TX_16X16][b],
-                        tl + vp9_block2left_sb[TX_16X16][b], TX_16X16, 64);
+  for (b = 0; b < bw * bh; b++) {
+    const int x_idx = b & (bw - 1), y_idx = b >> bwl;
+    cost += cost_coeffs(cm, x, b * 16, PLANE_TYPE_Y_WITH_DC,
+                        (ENTROPY_CONTEXT *) &t_above[x_idx],
+                        (ENTROPY_CONTEXT *) &t_left[y_idx],
+                        TX_16X16, bw * bh * 16);
+  }
 
   return cost;
 }
 
 static void super_block_yrd_16x16(VP9_COMMON *const cm, MACROBLOCK *x,
-                                  int *rate, int *distortion, int *skippable) {
+                                  int *rate, int *distortion, int *skippable,
+                                  BLOCK_SIZE_TYPE bsize) {
+  const int bwl = mb_width_log2(bsize), bhl = mb_height_log2(bsize);
   MACROBLOCKD *const xd = &x->e_mbd;
 
   xd->mode_info_context->mbmi.txfm_size = TX_16X16;
-  vp9_transform_sby_16x16(x, BLOCK_SIZE_SB32X32);
-  vp9_quantize_sby_16x16(x, BLOCK_SIZE_SB32X32);
+  vp9_transform_sby_16x16(x, bsize);
+  vp9_quantize_sby_16x16(x, bsize);
 
-  *distortion = vp9_sb_block_error_c(x->coeff, xd->plane[0].dqcoeff, 1024, 2);
-  *rate       = rdcost_sby_16x16(cm, x);
-  *skippable  = vp9_sby_is_skippable(xd, BLOCK_SIZE_SB32X32, TX_16X16);
+  *distortion = vp9_sb_block_error_c(x->coeff, xd->plane[0].dqcoeff,
+                                     256 << (bwl + bhl), 2);
+  *rate       = rdcost_sby_16x16(cm, x, bsize);
+  *skippable  = vp9_sby_is_skippable(xd, bsize, TX_16X16);
 }
 
-static int rdcost_sby_32x32(VP9_COMMON *const cm, MACROBLOCK *x) {
+static int rdcost_sby_32x32(VP9_COMMON *const cm, MACROBLOCK *x,
+                            BLOCK_SIZE_TYPE bsize) {
+  const int bwl = mb_width_log2(bsize) - 1, bw = 1 << bwl;
+  const int bh = 1 << (mb_height_log2(bsize) - 1);
+  int cost = 0, b;
   MACROBLOCKD * const xd = &x->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
-  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above;
-  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) &t_left;
+  ENTROPY_CONTEXT_PLANES t_above[4], t_left[4];
 
-  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));
-  vpx_memcpy(&t_left,  xd->left_context,  sizeof(t_left));
+  vpx_memcpy(&t_above, xd->above_context,
+             sizeof(ENTROPY_CONTEXT_PLANES) * bw * 2);
+  vpx_memcpy(&t_left,  xd->left_context,
+             sizeof(ENTROPY_CONTEXT_PLANES) * bh * 2);
+
+  for (b = 0; b < bw * bh; b++) {
+    const int x_idx = b & (bw - 1), y_idx = b >> bwl;
+    cost += cost_coeffs(cm, x, b * 64, PLANE_TYPE_Y_WITH_DC,
+                        (ENTROPY_CONTEXT *) &t_above[x_idx * 2],
+                        (ENTROPY_CONTEXT *) &t_left[y_idx * 2],
+                        TX_32X32, bw * bh * 64);
+  }
 
-  return cost_coeffs(cm, x, 0, PLANE_TYPE_Y_WITH_DC, ta, tl, TX_32X32, 64);
+  return cost;
 }
 
 static void super_block_yrd_32x32(VP9_COMMON *const cm, MACROBLOCK *x,
-                                  int *rate, int *distortion, int *skippable) {
+                                  int *rate, int *distortion, int *skippable,
+                                  BLOCK_SIZE_TYPE bsize) {
+  const int bwl = mb_width_log2(bsize) - 1, bhl = mb_height_log2(bsize) - 1;
   MACROBLOCKD *const xd = &x->e_mbd;
 
   xd->mode_info_context->mbmi.txfm_size = TX_32X32;
-  vp9_transform_sby_32x32(x, BLOCK_SIZE_SB32X32);
-  vp9_quantize_sby_32x32(x, BLOCK_SIZE_SB32X32);
+  vp9_transform_sby_32x32(x, bsize);
+  vp9_quantize_sby_32x32(x, bsize);
 
-  *distortion = vp9_sb_block_error_c(x->coeff, xd->plane[0].dqcoeff, 1024, 0);
-  *rate       = rdcost_sby_32x32(cm, x);
-  *skippable  = vp9_sby_is_skippable(xd, BLOCK_SIZE_SB32X32, TX_32X32);
+  *distortion = vp9_sb_block_error_c(x->coeff, xd->plane[0].dqcoeff,
+                                     1024 << (bwl + bhl), 0);
+  *rate       = rdcost_sby_32x32(cm, x, bsize);
+  *skippable  = vp9_sby_is_skippable(xd, bsize, TX_32X32);
 }
 
 static void super_block_yrd(VP9_COMP *cpi,
                             MACROBLOCK *x, int *rate, int *distortion,
-                            int *skip,
+                            int *skip, BLOCK_SIZE_TYPE bs,
                             int64_t txfm_cache[NB_TXFM_MODES]) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   int r[TX_SIZE_MAX_SB][2], d[TX_SIZE_MAX_SB], s[TX_SIZE_MAX_SB];
-  const uint8_t *src = x->src.y_buffer, *dst = xd->dst.y_buffer;
+  uint8_t *src = x->src.y_buffer, *dst = xd->dst.y_buffer;
   int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
 
-  vp9_subtract_sby_s_c(x->src_diff, src, src_y_stride, dst, dst_y_stride,
-                       BLOCK_SIZE_SB32X32);
-  super_block_yrd_32x32(cm, x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32]);
-  super_block_yrd_16x16(cm, x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16]);
-  super_block_yrd_8x8(cm, x,   &r[TX_8X8][0],   &d[TX_8X8],   &s[TX_8X8]);
-  super_block_yrd_4x4(cm, x,   &r[TX_4X4][0],   &d[TX_4X4],   &s[TX_4X4]);
-
-  choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, skip, txfm_cache,
-                           TX_SIZE_MAX_SB - 1);
-}
-
-static int rdcost_sb64y_4x4(VP9_COMMON *const cm, MACROBLOCK *x) {
-  int cost = 0, b;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above[4], t_left[4];
-  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above;
-  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) &t_left;
-
-  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));
-  vpx_memcpy(&t_left,  xd->left_context,  sizeof(t_left));
-
-  for (b = 0; b < 256; b++)
-    cost += cost_coeffs(cm, x, b, PLANE_TYPE_Y_WITH_DC,
-                        ta + vp9_block2above_sb64[TX_4X4][b],
-                        tl + vp9_block2left_sb64[TX_4X4][b], TX_4X4, 256);
-
-  return cost;
-}
-
-static void super_block64_yrd_4x4(VP9_COMMON *const cm, MACROBLOCK *x,
-                                  int *rate, int *distortion, int *skippable) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  xd->mode_info_context->mbmi.txfm_size = TX_4X4;
-  vp9_transform_sby_4x4(x, BLOCK_SIZE_SB64X64);
-  vp9_quantize_sby_4x4(x, BLOCK_SIZE_SB64X64);
-
-  *distortion = vp9_sb_block_error_c(x->coeff, xd->plane[0].dqcoeff, 4096, 2);
-  *rate       = rdcost_sb64y_4x4(cm, x);
-  *skippable  = vp9_sby_is_skippable(xd, BLOCK_SIZE_SB64X64, TX_4X4);
-}
-
-static int rdcost_sb64y_8x8(VP9_COMMON *const cm, MACROBLOCK *x) {
-  int cost = 0, b;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above[4], t_left[4];
-  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above;
-  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) &t_left;
-
-  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));
-  vpx_memcpy(&t_left,  xd->left_context,  sizeof(t_left));
-
-  for (b = 0; b < 256; b += 4)
-    cost += cost_coeffs(cm, x, b, PLANE_TYPE_Y_WITH_DC,
-                        ta + vp9_block2above_sb64[TX_8X8][b],
-                        tl + vp9_block2left_sb64[TX_8X8][b], TX_8X8, 256);
-
-  return cost;
-}
-
-static void super_block64_yrd_8x8(VP9_COMMON *const cm, MACROBLOCK *x,
-                                  int *rate, int *distortion, int *skippable) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  xd->mode_info_context->mbmi.txfm_size = TX_8X8;
-  vp9_transform_sby_8x8(x, BLOCK_SIZE_SB64X64);
-  vp9_quantize_sby_8x8(x, BLOCK_SIZE_SB64X64);
-
-  *distortion = vp9_sb_block_error_c(x->coeff, xd->plane[0].dqcoeff, 4096, 2);
-  *rate       = rdcost_sb64y_8x8(cm, x);
-  *skippable  = vp9_sby_is_skippable(xd, BLOCK_SIZE_SB64X64, TX_8X8);
-}
-
-static int rdcost_sb64y_16x16(VP9_COMMON *const cm, MACROBLOCK *x) {
-  int cost = 0, b;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above[4], t_left[4];
-  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above;
-  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) &t_left;
-
-  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));
-  vpx_memcpy(&t_left,  xd->left_context,  sizeof(t_left));
-
-  for (b = 0; b < 256; b += 16)
-    cost += cost_coeffs(cm, x, b, PLANE_TYPE_Y_WITH_DC,
-                        ta + vp9_block2above_sb64[TX_16X16][b],
-                        tl + vp9_block2left_sb64[TX_16X16][b], TX_16X16, 256);
-
-  return cost;
-}
-
-static void super_block64_yrd_16x16(VP9_COMMON *const cm, MACROBLOCK *x,
-                                    int *rate, int *distortion,
-                                    int *skippable) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  xd->mode_info_context->mbmi.txfm_size = TX_16X16;
-  vp9_transform_sby_16x16(x, BLOCK_SIZE_SB64X64);
-  vp9_quantize_sby_16x16(x, BLOCK_SIZE_SB64X64);
-
-  *distortion = vp9_sb_block_error_c(x->coeff, xd->plane[0].dqcoeff, 4096, 2);
-  *rate       = rdcost_sb64y_16x16(cm, x);
-  *skippable  = vp9_sby_is_skippable(xd, BLOCK_SIZE_SB64X64, TX_16X16);
-}
-
-static int rdcost_sb64y_32x32(VP9_COMMON *const cm, MACROBLOCK *x) {
-  int cost = 0, b;
-  MACROBLOCKD * const xd = &x->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above[4], t_left[4];
-  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above;
-  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) &t_left;
-
-  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));
-  vpx_memcpy(&t_left,  xd->left_context,  sizeof(t_left));
-
-  for (b = 0; b < 256; b += 64)
-    cost += cost_coeffs(cm, x, b, PLANE_TYPE_Y_WITH_DC,
-                        ta + vp9_block2above_sb64[TX_32X32][b],
-                        tl + vp9_block2left_sb64[TX_32X32][b], TX_32X32, 256);
-
-  return cost;
-}
-
-static void super_block64_yrd_32x32(VP9_COMMON *const cm, MACROBLOCK *x,
-                                    int *rate, int *distortion,
-                                    int *skippable) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  xd->mode_info_context->mbmi.txfm_size = TX_32X32;
-  vp9_transform_sby_32x32(x, BLOCK_SIZE_SB64X64);
-  vp9_quantize_sby_32x32(x, BLOCK_SIZE_SB64X64);
-
-  *distortion = vp9_sb_block_error_c(x->coeff, xd->plane[0].dqcoeff, 4096, 0);
-  *rate       = rdcost_sb64y_32x32(cm, x);
-  *skippable  = vp9_sby_is_skippable(xd, BLOCK_SIZE_SB64X64, TX_32X32);
-}
-
-static void super_block_64_yrd(VP9_COMP *cpi,
-                               MACROBLOCK *x, int *rate, int *distortion,
-                               int *skip,
-                               int64_t txfm_cache[NB_TXFM_MODES]) {
-  VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int r[TX_SIZE_MAX_SB][2], d[TX_SIZE_MAX_SB], s[TX_SIZE_MAX_SB];
-  const uint8_t *src = x->src.y_buffer, *dst = xd->dst.y_buffer;
-  int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
+  // FIXME(rbultje): mb code still predicts into xd->predictor
+  if (bs == BLOCK_SIZE_MB16X16) {
+    vp9_subtract_mby(x->src_diff, src, xd->predictor, src_y_stride);
+  } else {
+    vp9_subtract_sby_s_c(x->src_diff, src, src_y_stride, dst, dst_y_stride,
+                         bs);
+  }
 
-  vp9_subtract_sby_s_c(x->src_diff, src, src_y_stride, dst, dst_y_stride,
-                       BLOCK_SIZE_SB64X64);
-  super_block64_yrd_32x32(cm, x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32]);
-  super_block64_yrd_16x16(cm, x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16]);
-  super_block64_yrd_8x8(cm, x,   &r[TX_8X8][0],   &d[TX_8X8],   &s[TX_8X8]);
-  super_block64_yrd_4x4(cm, x,   &r[TX_4X4][0],   &d[TX_4X4],   &s[TX_4X4]);
+  if (bs >= BLOCK_SIZE_SB32X32)
+    super_block_yrd_32x32(cm, x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32],
+                          bs);
+  super_block_yrd_16x16(cm, x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16], bs);
+  super_block_yrd_8x8(cm, x,   &r[TX_8X8][0],   &d[TX_8X8],   &s[TX_8X8],   bs);
+  super_block_yrd_4x4(cm, x,   &r[TX_4X4][0],   &d[TX_4X4],   &s[TX_4X4],   bs);
 
   choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, skip, txfm_cache,
-                           TX_SIZE_MAX_SB - 1);
+                           TX_32X32 - (bs < BLOCK_SIZE_SB32X32));
 }
 
 static void copy_predictor_8x8(uint8_t *dst, const uint8_t *predictor) {
@@ -1365,149 +1109,66 @@ static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
   return RDCOST(mb->rdmult, mb->rddiv, cost, distortion);
 }
 
-static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi,
-                                      MACROBLOCK *x,
-                                      int *rate,
-                                      int *rate_tokenonly,
-                                      int *distortion,
-                                      int *skippable,
+static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
+                                      int *rate, int *rate_tokenonly,
+                                      int *distortion, int *skippable,
+                                      BLOCK_SIZE_TYPE bsize,
                                       int64_t txfm_cache[NB_TXFM_MODES]) {
   MB_PREDICTION_MODE mode;
   MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
   int this_rate, this_rate_tokenonly;
   int this_distortion, s;
   int64_t best_rd = INT64_MAX, this_rd;
+  TX_SIZE UNINITIALIZED_IS_SAFE(best_tx);
+  int i;
 
-  /* Y Search for 32x32 intra prediction mode */
-  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
-    x->e_mbd.mode_info_context->mbmi.mode = mode;
-    vp9_build_intra_predictors_sby_s(&x->e_mbd);
-
-    super_block_yrd(cpi, x, &this_rate_tokenonly,
-                    &this_distortion, &s, txfm_cache);
-    this_rate = this_rate_tokenonly +
-                x->mbmode_cost[x->e_mbd.frame_type]
-                              [x->e_mbd.mode_info_context->mbmi.mode];
-    this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
-
-    if (this_rd < best_rd) {
-      mode_selected   = mode;
-      best_rd         = this_rd;
-      *rate           = this_rate;
-      *rate_tokenonly = this_rate_tokenonly;
-      *distortion     = this_distortion;
-      *skippable      = s;
-    }
-  }
-
-  x->e_mbd.mode_info_context->mbmi.mode = mode_selected;
-
-  return best_rd;
-}
-
-static int64_t rd_pick_intra_sb64y_mode(VP9_COMP *cpi,
-                                        MACROBLOCK *x,
-                                        int *rate,
-                                        int *rate_tokenonly,
-                                        int *distortion,
-                                        int *skippable,
-                                        int64_t txfm_cache[NB_TXFM_MODES]) {
-  MB_PREDICTION_MODE mode;
-  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
-  int this_rate, this_rate_tokenonly;
-  int this_distortion, s;
-  int64_t best_rd = INT64_MAX, this_rd;
+  for (i = 0; i < NB_TXFM_MODES; i++)
+    txfm_cache[i] = INT64_MAX;
 
   /* Y Search for 32x32 intra prediction mode */
   for (mode = DC_PRED; mode <= TM_PRED; mode++) {
+    int64_t local_txfm_cache[NB_TXFM_MODES];
+
     x->e_mbd.mode_info_context->mbmi.mode = mode;
-    vp9_build_intra_predictors_sb64y_s(&x->e_mbd);
+    if (bsize == BLOCK_SIZE_MB16X16) {
+      vp9_build_intra_predictors_mby(&x->e_mbd);
+    } else if (bsize == BLOCK_SIZE_SB32X32) {
+      vp9_build_intra_predictors_sby_s(&x->e_mbd);
+    } else {
+      assert(bsize == BLOCK_SIZE_SB64X64);
+      vp9_build_intra_predictors_sb64y_s(&x->e_mbd);
+    }
 
-    super_block_64_yrd(cpi, x, &this_rate_tokenonly,
-                       &this_distortion, &s, txfm_cache);
-    this_rate = this_rate_tokenonly +
-                x->mbmode_cost[x->e_mbd.frame_type]
-                              [x->e_mbd.mode_info_context->mbmi.mode];
+    super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s,
+                    bsize, local_txfm_cache);
+    this_rate = this_rate_tokenonly + x->mbmode_cost[x->e_mbd.frame_type][mode];
     this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
 
     if (this_rd < best_rd) {
       mode_selected   = mode;
       best_rd         = this_rd;
+      best_tx         = x->e_mbd.mode_info_context->mbmi.txfm_size;
       *rate           = this_rate;
       *rate_tokenonly = this_rate_tokenonly;
       *distortion     = this_distortion;
       *skippable      = s;
     }
-  }
-
-  x->e_mbd.mode_info_context->mbmi.mode = mode_selected;
-
-  return best_rd;
-}
-
-static int64_t rd_pick_intra16x16mby_mode(VP9_COMP *cpi,
-                                          MACROBLOCK *x,
-                                          int *Rate,
-                                          int *rate_y,
-                                          int *Distortion,
-                                          int *skippable,
-                                          int64_t txfm_cache[NB_TXFM_MODES]) {
-  MB_PREDICTION_MODE mode;
-  TX_SIZE txfm_size = 0;
-  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
-  int rate, ratey;
-  int distortion, skip;
-  int64_t best_rd = INT64_MAX;
-  int64_t this_rd;
-
-  int i;
-  for (i = 0; i < NB_TXFM_MODES; i++)
-    txfm_cache[i] = INT64_MAX;
-
-  // Y Search for 16x16 intra prediction mode
-  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
-    int64_t local_txfm_cache[NB_TXFM_MODES];
-
-    mbmi->mode = mode;
-
-    vp9_build_intra_predictors_mby(xd);
-
-    macro_block_yrd(cpi, x, &ratey, &distortion, &skip, local_txfm_cache);
-
-    // FIXME add compoundmode cost
-    // FIXME add rate for mode2
-    rate = ratey + x->mbmode_cost[xd->frame_type][mbmi->mode];
-
-    this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
-
-    if (this_rd < best_rd) {
-      mode_selected = mode;
-      txfm_size = mbmi->txfm_size;
-      best_rd = this_rd;
-      *Rate = rate;
-      *rate_y = ratey;
-      *Distortion = distortion;
-      *skippable = skip;
-    }
 
     for (i = 0; i < NB_TXFM_MODES; i++) {
       int64_t adj_rd = this_rd + local_txfm_cache[i] -
-                        local_txfm_cache[cpi->common.txfm_mode];
+                       local_txfm_cache[cpi->common.txfm_mode];
       if (adj_rd < txfm_cache[i]) {
         txfm_cache[i] = adj_rd;
       }
     }
   }
 
-  mbmi->txfm_size = txfm_size;
-  mbmi->mode = mode_selected;
+  x->e_mbd.mode_info_context->mbmi.mode = mode_selected;
+  x->e_mbd.mode_info_context->mbmi.txfm_size = best_tx;
 
   return best_rd;
 }
 
-
 static int64_t rd_pick_intra8x8block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
                                      B_PREDICTION_MODE *best_mode,
                                      int *mode_costs,
@@ -1774,497 +1435,222 @@ static int64_t rd_pick_intra8x8mby_modes_and_txsz(VP9_COMP *cpi, MACROBLOCK *x,
   return tmp_rd;
 }
 
-static int rd_cost_mbuv_4x4(VP9_COMMON *const cm, MACROBLOCK *mb, int backup) {
-  int b;
-  int cost = 0;
-  MACROBLOCKD *xd = &mb->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta, *tl;
-
-  if (backup) {
-    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-    vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-
-    ta = (ENTROPY_CONTEXT *)&t_above;
-    tl = (ENTROPY_CONTEXT *)&t_left;
-  } else {
-    ta = (ENTROPY_CONTEXT *)xd->above_context;
-    tl = (ENTROPY_CONTEXT *)xd->left_context;
-  }
-
-  for (b = 16; b < 24; b++)
-    cost += cost_coeffs(cm, mb, b, PLANE_TYPE_UV,
-                        ta + vp9_block2above[TX_4X4][b],
-                        tl + vp9_block2left[TX_4X4][b],
-                        TX_4X4, 16);
-
-  return cost;
-}
-
-
-static int64_t rd_inter16x16_uv_4x4(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
-                                    int *distortion, int *skip,
-                                    int do_ctx_backup) {
-  vp9_transform_mbuv_4x4(x);
-  vp9_quantize_mbuv_4x4(x);
-
-  *rate       = rd_cost_mbuv_4x4(&cpi->common, x, do_ctx_backup);
-  *distortion = vp9_mbuverror(x) / 4;
-  *skip       = vp9_mbuv_is_skippable_4x4(&x->e_mbd);
-
-  return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
-}
-
-static int rd_cost_mbuv_8x8(VP9_COMMON *const cm, MACROBLOCK *mb, int backup) {
-  int b;
-  int cost = 0;
-  MACROBLOCKD *xd = &mb->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta, *tl;
-
-  if (backup) {
-    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-    vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+#define UVCTX(c, p) ((p) ? (c).v : (c).u)
+static int rd_cost_sbuv_4x4(VP9_COMMON *const cm, MACROBLOCK *x,
+                            BLOCK_SIZE_TYPE bsize) {
+  const int bwl = mb_width_log2(bsize) + 1, bw = 1 << bwl;
+  const int bh = 1 << (mb_height_log2(bsize) + 1);
+  int yoff = 4 * bw * bh;
+  int p, b, cost = 0;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  ENTROPY_CONTEXT_PLANES t_above[4], t_left[4];
 
-    ta = (ENTROPY_CONTEXT *)&t_above;
-    tl = (ENTROPY_CONTEXT *)&t_left;
-  } else {
-    ta = (ENTROPY_CONTEXT *)mb->e_mbd.above_context;
-    tl = (ENTROPY_CONTEXT *)mb->e_mbd.left_context;
+  vpx_memcpy(&t_above, xd->above_context,
+             (sizeof(ENTROPY_CONTEXT_PLANES) * bw) >> 1);
+  vpx_memcpy(&t_left, xd->left_context,
+             (sizeof(ENTROPY_CONTEXT_PLANES) * bh) >> 1);
+
+  for (p = 0; p < 2; p++) {
+    for (b = 0; b < bw * bh; b++) {
+      const int x_idx = b & (bw - 1), y_idx = b >> bwl;
+      cost += cost_coeffs(cm, x, yoff + b, PLANE_TYPE_UV,
+                          UVCTX(t_above[x_idx >> 1], p) + (x_idx & 1),
+                          UVCTX(t_left[y_idx >> 1], p) + (y_idx & 1),
+                          TX_4X4, bw * bh * 4);
+    }
+    yoff = (yoff * 5) >> 2;  // u -> v
   }
 
-  for (b = 16; b < 24; b += 4)
-    cost += cost_coeffs(cm, mb, b, PLANE_TYPE_UV,
-                        ta + vp9_block2above[TX_8X8][b],
-                        tl + vp9_block2left[TX_8X8][b], TX_8X8, 16);
-
   return cost;
 }
 
-static int64_t rd_inter16x16_uv_8x8(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
-                                    int *distortion, int *skip,
-                                    int do_ctx_backup) {
-  vp9_transform_mbuv_8x8(x);
-  vp9_quantize_mbuv_8x8(x);
+static void super_block_uvrd_4x4(VP9_COMMON *const cm, MACROBLOCK *x,
+                                 int *rate, int *distortion, int *skip,
+                                 BLOCK_SIZE_TYPE bsize) {
+  const int bwl = mb_width_log2(bsize) + 2, bhl = mb_height_log2(bsize) + 2;
+  MACROBLOCKD *const xd = &x->e_mbd;
 
-  *rate       = rd_cost_mbuv_8x8(&cpi->common, x, do_ctx_backup);
-  *distortion = vp9_mbuverror(x) / 4;
-  *skip       = vp9_mbuv_is_skippable_8x8(&x->e_mbd);
+  vp9_transform_sbuv_4x4(x, bsize);
+  vp9_quantize_sbuv_4x4(x, bsize);
 
-  return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
+  *rate       = rd_cost_sbuv_4x4(cm, x, bsize);
+  *distortion = vp9_sb_uv_block_error_c(x->coeff + (16 << (bwl + bhl)),
+                                        xd->plane[1].dqcoeff,
+                                        xd->plane[2].dqcoeff,
+                                        32 << (bwl + bhl - 2), 2);
+  *skip       = vp9_sbuv_is_skippable(xd, bsize, TX_4X4);
 }
 
-static int rd_cost_sbuv_16x16(VP9_COMMON *const cm, MACROBLOCK *x, int backup) {
-  int b;
-  int cost = 0;
+static int rd_cost_sbuv_8x8(VP9_COMMON *const cm, MACROBLOCK *x,
+                            BLOCK_SIZE_TYPE bsize) {
+  const int bwl = mb_width_log2(bsize), bw = 1 << bwl;
+  const int bh = 1 << mb_height_log2(bsize);
+  int yoff = 16 * bw * bh;
+  int p, b, cost = 0;
   MACROBLOCKD *const xd = &x->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
-  ENTROPY_CONTEXT *ta, *tl;
-
-  if (backup) {
-    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES) * 2);
-    vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES) * 2);
+  ENTROPY_CONTEXT_PLANES t_above[4], t_left[4];
 
-    ta = (ENTROPY_CONTEXT *) &t_above;
-    tl = (ENTROPY_CONTEXT *) &t_left;
-  } else {
-    ta = (ENTROPY_CONTEXT *)xd->above_context;
-    tl = (ENTROPY_CONTEXT *)xd->left_context;
+  vpx_memcpy(&t_above, xd->above_context,
+             sizeof(ENTROPY_CONTEXT_PLANES) * bw);
+  vpx_memcpy(&t_left, xd->left_context,
+             sizeof(ENTROPY_CONTEXT_PLANES) * bh);
+
+  for (p = 0; p < 2; p++) {
+    for (b = 0; b < bw * bh; b++) {
+      const int x_idx = b & (bw - 1), y_idx = b >> bwl;
+      cost += cost_coeffs(cm, x, yoff + b * 4, PLANE_TYPE_UV,
+                          UVCTX(t_above[x_idx], p),
+                          UVCTX(t_left[y_idx], p),
+                          TX_8X8, bw * bh * 16);
+    }
+    yoff = (yoff * 5) >> 2;  // u -> v
   }
 
-  for (b = 16; b < 24; b += 4)
-    cost += cost_coeffs(cm, x, b * 4, PLANE_TYPE_UV,
-                        ta + vp9_block2above[TX_8X8][b],
-                        tl + vp9_block2left[TX_8X8][b], TX_16X16, 64);
-
   return cost;
 }
 
-static void rd_inter32x32_uv_16x16(VP9_COMMON *const cm, MACROBLOCK *x,
-                                   int *rate, int *distortion, int *skip,
-                                   int backup) {
+static void super_block_uvrd_8x8(VP9_COMMON *const cm, MACROBLOCK *x,
+                                 int *rate, int *distortion, int *skip,
+                                 BLOCK_SIZE_TYPE bsize) {
+  const int bwl = mb_width_log2(bsize) + 1, bhl = mb_height_log2(bsize) + 1;
   MACROBLOCKD *const xd = &x->e_mbd;
 
-  vp9_transform_sbuv_16x16(x, BLOCK_SIZE_SB32X32);
-  vp9_quantize_sbuv_16x16(x, BLOCK_SIZE_SB32X32);
+  vp9_transform_sbuv_8x8(x, bsize);
+  vp9_quantize_sbuv_8x8(x, bsize);
 
-  *rate       = rd_cost_sbuv_16x16(cm, x, backup);
-  *distortion = vp9_sb_uv_block_error_c(x->coeff + 1024,
+  *rate       = rd_cost_sbuv_8x8(cm, x, bsize);
+  *distortion = vp9_sb_uv_block_error_c(x->coeff + (64 << (bwl + bhl)),
                                         xd->plane[1].dqcoeff,
-                                        xd->plane[2].dqcoeff, 512, 2);
-  *skip       = vp9_sbuv_is_skippable(xd, BLOCK_SIZE_SB32X32, TX_16X16);
-}
-
-static int64_t rd_inter32x32_uv(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
-                                int *distortion, int *skip) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
-  const uint8_t *usrc = x->src.u_buffer, *udst = xd->dst.u_buffer;
-  const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer;
-  int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
-
-  if (mbmi->txfm_size >= TX_16X16) {
-    vp9_subtract_sbuv_s_c(x->src_diff,
-                          usrc, vsrc, src_uv_stride,
-                          udst, vdst, dst_uv_stride,
-                          BLOCK_SIZE_SB32X32);
-    rd_inter32x32_uv_16x16(&cpi->common, x, rate, distortion, skip, 1);
-  } else {
-    int n, r = 0, d = 0;
-    int skippable = 1;
-    ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
-    ENTROPY_CONTEXT_PLANES *ta = xd->above_context;
-    ENTROPY_CONTEXT_PLANES *tl = xd->left_context;
-
-    memcpy(t_above, xd->above_context, sizeof(t_above));
-    memcpy(t_left, xd->left_context, sizeof(t_left));
-
-    for (n = 0; n < 4; n++) {
-      int x_idx = n & 1, y_idx = n >> 1;
-      int d_tmp, s_tmp, r_tmp;
-
-      xd->above_context = ta + x_idx;
-      xd->left_context = tl + y_idx;
-      vp9_subtract_mbuv_s_c(x->src_diff,
-                            usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
-                            vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
-                            src_uv_stride,
-                            udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
-                            vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
-                            dst_uv_stride);
-
-      if (mbmi->txfm_size == TX_4X4) {
-        rd_inter16x16_uv_4x4(cpi, x, &r_tmp, &d_tmp, &s_tmp, 0);
-      } else {
-        rd_inter16x16_uv_8x8(cpi, x, &r_tmp, &d_tmp, &s_tmp, 0);
-      }
-
-      r += r_tmp;
-      d += d_tmp;
-      skippable = skippable && s_tmp;
-    }
-
-    *rate = r;
-    *distortion = d;
-    *skip = skippable;
-    xd->left_context = tl;
-    xd->above_context = ta;
-    memcpy(xd->above_context, t_above, sizeof(t_above));
-    memcpy(xd->left_context, t_left, sizeof(t_left));
-  }
-
-  return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
-}
-
-static void super_block_64_uvrd(VP9_COMMON *const cm, MACROBLOCK *x, int *rate,
-                                int *distortion, int *skip);
-static int64_t rd_inter64x64_uv(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
-                                int *distortion, int *skip) {
-  super_block_64_uvrd(&cpi->common, x, rate, distortion, skip);
-  return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
+                                        xd->plane[2].dqcoeff,
+                                        128 << (bwl + bhl - 2), 2);
+  *skip       = vp9_sbuv_is_skippable(xd, bsize, TX_8X8);
 }
 
-static void rd_pick_intra_mbuv_mode(VP9_COMP *cpi,
-                                    MACROBLOCK *x,
-                                    int *rate,
-                                    int *rate_tokenonly,
-                                    int *distortion,
-                                    int *skippable) {
-  MB_PREDICTION_MODE mode;
-  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
-  int64_t best_rd = INT64_MAX;
-  int UNINITIALIZED_IS_SAFE(d), UNINITIALIZED_IS_SAFE(r);
-  int rate_to, UNINITIALIZED_IS_SAFE(skip);
-
-  xd->mode_info_context->mbmi.txfm_size = TX_4X4;
-  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
-    int rate;
-    int distortion;
-    int64_t this_rd;
-
-    mbmi->uv_mode = mode;
-    vp9_build_intra_predictors_mbuv(&x->e_mbd);
-
-    vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
-                      x->e_mbd.predictor, x->src.uv_stride);
-    vp9_transform_mbuv_4x4(x);
-    vp9_quantize_mbuv_4x4(x);
-
-    rate_to = rd_cost_mbuv_4x4(&cpi->common, x, 1);
-    rate = rate_to
-           + x->intra_uv_mode_cost[x->e_mbd.frame_type][mbmi->uv_mode];
-
-    distortion = vp9_mbuverror(x) / 4;
-
-    this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+static int rd_cost_sbuv_16x16(VP9_COMMON *const cm, MACROBLOCK *x,
+                              BLOCK_SIZE_TYPE bsize) {
+  const int bwl = mb_width_log2(bsize) - 1, bw = 1 << bwl;
+  const int bh = 1 << (mb_height_log2(bsize) - 1);
+  int yoff = 64 * bw * bh;
+  int p, b, cost = 0;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  ENTROPY_CONTEXT_PLANES t_above[4], t_left[4];
 
-    if (this_rd < best_rd) {
-      skip = vp9_mbuv_is_skippable_4x4(xd);
-      best_rd = this_rd;
-      d = distortion;
-      r = rate;
-      *rate_tokenonly = rate_to;
-      mode_selected = mode;
+  vpx_memcpy(&t_above, xd->above_context,
+             sizeof(ENTROPY_CONTEXT_PLANES) * 2 * bw);
+  vpx_memcpy(&t_left, xd->left_context,
+             sizeof(ENTROPY_CONTEXT_PLANES) * 2 * bh);
+
+  for (p = 0; p < 2; p++) {
+    for (b = 0; b < bw * bh; b++) {
+      const int x_idx = b & (bw - 1), y_idx = b >> bwl;
+      cost += cost_coeffs(cm, x, yoff + b * 16, PLANE_TYPE_UV,
+                          UVCTX(t_above[x_idx * 2], p),
+                          UVCTX(t_left[y_idx * 2], p),
+                          TX_16X16, bw * bh * 64);
     }
+    yoff = (yoff * 5) >> 2;  // u -> v
   }
 
-  *rate = r;
-  *distortion = d;
-  *skippable = skip;
-
-  mbmi->uv_mode = mode_selected;
-}
-
-static void rd_pick_intra_mbuv_mode_8x8(VP9_COMP *cpi,
-                                        MACROBLOCK *x,
-                                        int *rate,
-                                        int *rate_tokenonly,
-                                        int *distortion,
-                                        int *skippable) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_PREDICTION_MODE mode;
-  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
-  MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
-  int64_t best_rd = INT64_MAX;
-  int UNINITIALIZED_IS_SAFE(d), UNINITIALIZED_IS_SAFE(r);
-  int rate_to, UNINITIALIZED_IS_SAFE(skip);
-
-  xd->mode_info_context->mbmi.txfm_size = TX_8X8;
-  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
-    int rate;
-    int distortion;
-    int64_t this_rd;
-
-    mbmi->uv_mode = mode;
-    vp9_build_intra_predictors_mbuv(&x->e_mbd);
-    vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
-                      x->e_mbd.predictor, x->src.uv_stride);
-    vp9_transform_mbuv_8x8(x);
-
-    vp9_quantize_mbuv_8x8(x);
-
-    rate_to = rd_cost_mbuv_8x8(&cpi->common, x, 1);
-    rate = rate_to + x->intra_uv_mode_cost[x->e_mbd.frame_type][mbmi->uv_mode];
-
-    distortion = vp9_mbuverror(x) / 4;
-    this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
-
-    if (this_rd < best_rd) {
-      skip = vp9_mbuv_is_skippable_8x8(xd);
-      best_rd = this_rd;
-      d = distortion;
-      r = rate;
-      *rate_tokenonly = rate_to;
-      mode_selected = mode;
-    }
-  }
-  *rate = r;
-  *distortion = d;
-  *skippable = skip;
-  mbmi->uv_mode = mode_selected;
+  return cost;
 }
 
-// TODO(rbultje) very similar to rd_inter32x32_uv(), merge?
-static void super_block_uvrd(VP9_COMMON *const cm,
-                             MACROBLOCK *x,
-                             int *rate,
-                             int *distortion,
-                             int *skippable) {
+static void super_block_uvrd_16x16(VP9_COMMON *const cm, MACROBLOCK *x,
+                                   int *rate, int *distortion, int *skip,
+                                   BLOCK_SIZE_TYPE bsize) {
+  const int bwl = mb_width_log2(bsize), bhl = mb_height_log2(bsize);
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
-  const uint8_t *usrc = x->src.u_buffer, *udst = xd->dst.u_buffer;
-  const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer;
-  int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
-
-  if (mbmi->txfm_size >= TX_16X16) {
-    vp9_subtract_sbuv_s_c(x->src_diff,
-                          usrc, vsrc, src_uv_stride,
-                          udst, vdst, dst_uv_stride,
-                          BLOCK_SIZE_SB32X32);
-    rd_inter32x32_uv_16x16(cm, x, rate, distortion, skippable, 1);
-  } else {
-    int d = 0, r = 0, n, s = 1;
-    ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
-    ENTROPY_CONTEXT_PLANES *ta_orig = xd->above_context;
-    ENTROPY_CONTEXT_PLANES *tl_orig = xd->left_context;
-
-    memcpy(t_above, xd->above_context, sizeof(t_above));
-    memcpy(t_left,  xd->left_context,  sizeof(t_left));
-
-    for (n = 0; n < 4; n++) {
-      int x_idx = n & 1, y_idx = n >> 1;
-
-      vp9_subtract_mbuv_s_c(x->src_diff,
-                            usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
-                            vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
-                            src_uv_stride,
-                            udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
-                            vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
-                            dst_uv_stride);
-      if (mbmi->txfm_size == TX_4X4) {
-        vp9_transform_mbuv_4x4(x);
-        vp9_quantize_mbuv_4x4(x);
-        s &= vp9_mbuv_is_skippable_4x4(xd);
-      } else {
-        vp9_transform_mbuv_8x8(x);
-        vp9_quantize_mbuv_8x8(x);
-        s &= vp9_mbuv_is_skippable_8x8(xd);
-      }
-
-      d += vp9_mbuverror(x) >> 2;
-      xd->above_context = t_above + x_idx;
-      xd->left_context = t_left + y_idx;
-      if (mbmi->txfm_size == TX_4X4) {
-        r += rd_cost_mbuv_4x4(cm, x, 0);
-      } else {
-        r += rd_cost_mbuv_8x8(cm, x, 0);
-      }
-    }
 
-    xd->above_context = ta_orig;
-    xd->left_context = tl_orig;
+  vp9_transform_sbuv_16x16(x, bsize);
+  vp9_quantize_sbuv_16x16(x, bsize);
 
-    *distortion = d;
-    *rate       = r;
-    *skippable  = s;
-  }
+  *rate       = rd_cost_sbuv_16x16(cm, x, bsize);
+  *distortion = vp9_sb_uv_block_error_c(x->coeff + (256 << (bwl + bhl)),
+                                        xd->plane[1].dqcoeff,
+                                        xd->plane[2].dqcoeff,
+                                        512 << (bwl + bhl - 2), 2);
+  *skip       = vp9_sbuv_is_skippable(xd, bsize, TX_16X16);
 }
 
-static int rd_cost_sb64uv_32x32(VP9_COMMON *const cm, MACROBLOCK *x,
-                                int backup) {
-  int b;
-  int cost = 0;
+static int rd_cost_sbuv_32x32(VP9_COMMON *const cm, MACROBLOCK *x,
+                              BLOCK_SIZE_TYPE bsize) {
+  const int bwl = mb_width_log2(bsize) - 2, bw = 1 << bwl;
+  const int bh = 1 << (mb_height_log2(bsize) - 2);
+  int yoff = 256 * bh * bw;
+  int p, b, cost = 0;
   MACROBLOCKD *const xd = &x->e_mbd;
   ENTROPY_CONTEXT_PLANES t_above[4], t_left[4];
-  ENTROPY_CONTEXT *ta, *tl;
-
-  if (backup) {
-    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES) * 4);
-    vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES) * 4);
 
-    ta = (ENTROPY_CONTEXT *) &t_above;
-    tl = (ENTROPY_CONTEXT *) &t_left;
-  } else {
-    ta = (ENTROPY_CONTEXT *)xd->above_context;
-    tl = (ENTROPY_CONTEXT *)xd->left_context;
+  vpx_memcpy(&t_above, xd->above_context,
+             sizeof(ENTROPY_CONTEXT_PLANES) * 4 * bw);
+  vpx_memcpy(&t_left, xd->left_context,
+             sizeof(ENTROPY_CONTEXT_PLANES) * 4 * bh);
+
+  for (p = 0; p < 2; p++) {
+    for (b = 0; b < bw * bh; b++) {
+      const int x_idx = b & (bw - 1), y_idx = b >> bwl;  // raster col, row
+      cost += cost_coeffs(cm, x, yoff + b * 64, PLANE_TYPE_UV,
+                          UVCTX(t_above[x_idx * 4], p),
+                          UVCTX(t_left[y_idx * 4], p),
+                          TX_32X32, 256 * bh * bw);
+    }
+    yoff = (yoff * 5) >> 2;  // u -> v
   }
 
-  for (b = 16; b < 24; b += 4)
-    cost += cost_coeffs(cm, x, b * 16, PLANE_TYPE_UV,
-                        ta + vp9_block2above[TX_8X8][b],
-                        tl + vp9_block2left[TX_8X8][b], TX_32X32, 256);
-
   return cost;
 }
+#undef UVCTX
 
-static void rd_inter64x64_uv_32x32(VP9_COMMON *const cm, MACROBLOCK *x,
+static void super_block_uvrd_32x32(VP9_COMMON *const cm, MACROBLOCK *x,
                                    int *rate, int *distortion, int *skip,
-                                   int backup) {
+                                   BLOCK_SIZE_TYPE bsize) {
+  const int bwl = mb_width_log2(bsize) - 1, bhl = mb_height_log2(bsize) - 1;
   MACROBLOCKD *const xd = &x->e_mbd;
 
-  vp9_transform_sbuv_32x32(x, BLOCK_SIZE_SB64X64);
-  vp9_quantize_sbuv_32x32(x, BLOCK_SIZE_SB64X64);
+  vp9_transform_sbuv_32x32(x, bsize);
+  vp9_quantize_sbuv_32x32(x, bsize);
 
-  *rate       = rd_cost_sb64uv_32x32(cm, x, backup);
-  *distortion = vp9_sb_uv_block_error_c(x->coeff + 4096,
+  *rate       = rd_cost_sbuv_32x32(cm, x, bsize);
+  *distortion = vp9_sb_uv_block_error_c(x->coeff + (1024 << (bwl + bhl)),
                                         xd->plane[1].dqcoeff,
-                                        xd->plane[2].dqcoeff, 2048, 0);
-  *skip       = vp9_sbuv_is_skippable(xd, BLOCK_SIZE_SB64X64, TX_32X32);
+                                        xd->plane[2].dqcoeff,
+                                        2048 << (bwl + bhl - 2), 0);
+  *skip       = vp9_sbuv_is_skippable(xd, bsize, TX_32X32);
 }
 
-static void super_block_64_uvrd(VP9_COMMON *const cm, MACROBLOCK *x,
-                                int *rate,
-                                int *distortion,
-                                int *skippable) {
+static void super_block_uvrd(VP9_COMMON *const cm, MACROBLOCK *x,
+                             int *rate, int *distortion, int *skippable,
+                             BLOCK_SIZE_TYPE bsize) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
-  const uint8_t *usrc = x->src.u_buffer, *udst = xd->dst.u_buffer;
-  const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer;
+  uint8_t *usrc = x->src.u_buffer, *udst = xd->dst.u_buffer;
+  uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer;
   int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
-  ENTROPY_CONTEXT_PLANES t_above[4], t_left[4];
-  ENTROPY_CONTEXT_PLANES *ta_orig = xd->above_context;
-  ENTROPY_CONTEXT_PLANES *tl_orig = xd->left_context;
-  int d = 0, r = 0, n, s = 1;
-
-  // FIXME not needed if tx=32x32
-  memcpy(t_above, xd->above_context, sizeof(t_above));
-  memcpy(t_left,  xd->left_context,  sizeof(t_left));
 
-  if (mbmi->txfm_size == TX_32X32) {
-    vp9_subtract_sbuv_s_c(x->src_diff, usrc, vsrc, src_uv_stride,
-                          udst, vdst, dst_uv_stride,
-                          BLOCK_SIZE_SB64X64);
-    rd_inter64x64_uv_32x32(cm, x, &r, &d, &s, 1);
-  } else if (mbmi->txfm_size == TX_16X16) {
-    int n;
-
-    *rate = 0;
-    for (n = 0; n < 4; n++) {
-      int x_idx = n & 1, y_idx = n >> 1;
-      int r_tmp, d_tmp, s_tmp;
-
-      vp9_subtract_sbuv_s_c(x->src_diff,
-                            usrc + x_idx * 16 + y_idx * 16 * src_uv_stride,
-                            vsrc + x_idx * 16 + y_idx * 16 * src_uv_stride,
-                            src_uv_stride,
-                            udst + x_idx * 16 + y_idx * 16 * dst_uv_stride,
-                            vdst + x_idx * 16 + y_idx * 16 * dst_uv_stride,
-                            dst_uv_stride, BLOCK_SIZE_SB32X32);
-      xd->above_context = t_above + x_idx * 2;
-      xd->left_context = t_left + y_idx * 2;
-      rd_inter32x32_uv_16x16(cm, x, &r_tmp, &d_tmp, &s_tmp, 0);
-      r += r_tmp;
-      d += d_tmp;
-      s = s && s_tmp;
-    }
+  // FIXME(rbultje): mb code still predicts into xd->predictor
+  if (bsize == BLOCK_SIZE_MB16X16) {
+    vp9_subtract_mbuv(x->src_diff, usrc, vsrc, xd->predictor,
+                      x->src.uv_stride);
   } else {
-    for (n = 0; n < 16; n++) {
-      int x_idx = n & 3, y_idx = n >> 2;
-
-      vp9_subtract_mbuv_s_c(x->src_diff,
-                            usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
-                            vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
-                            src_uv_stride,
-                            udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
-                            vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
-                            dst_uv_stride);
-      if (mbmi->txfm_size == TX_4X4) {
-        vp9_transform_mbuv_4x4(x);
-        vp9_quantize_mbuv_4x4(x);
-        s &= vp9_mbuv_is_skippable_4x4(xd);
-      } else {
-        vp9_transform_mbuv_8x8(x);
-        vp9_quantize_mbuv_8x8(x);
-        s &= vp9_mbuv_is_skippable_8x8(xd);
-      }
-
-      xd->above_context = t_above + x_idx;
-      xd->left_context = t_left + y_idx;
-      d += vp9_mbuverror(x) >> 2;
-      if (mbmi->txfm_size == TX_4X4) {
-        r += rd_cost_mbuv_4x4(cm, x, 0);
-      } else {
-        r += rd_cost_mbuv_8x8(cm, x, 0);
-      }
-    }
+    vp9_subtract_sbuv_s_c(x->src_diff, usrc, vsrc, src_uv_stride,
+                          udst, vdst, dst_uv_stride, bsize);
   }
 
-  *distortion = d;
-  *rate       = r;
-  *skippable  = s;
-
-  xd->left_context = tl_orig;
-  xd->above_context = ta_orig;
+  if (mbmi->txfm_size >= TX_32X32 && bsize >= BLOCK_SIZE_SB64X64) {
+    super_block_uvrd_32x32(cm, x, rate, distortion, skippable, bsize);
+  } else if (mbmi->txfm_size >= TX_16X16 && bsize >= BLOCK_SIZE_SB32X32) {
+    super_block_uvrd_16x16(cm, x, rate, distortion, skippable, bsize);
+  } else if (mbmi->txfm_size >= TX_8X8) {
+    super_block_uvrd_8x8(cm, x, rate, distortion, skippable, bsize);
+  } else {
+    assert(mbmi->txfm_size == TX_4X4);
+    super_block_uvrd_4x4(cm, x, rate, distortion, skippable, bsize);
+  }
 }
 
-static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi,
-                                       MACROBLOCK *x,
-                                       int *rate,
-                                       int *rate_tokenonly,
-                                       int *distortion,
-                                       int *skippable) {
+static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
+                                       int *rate, int *rate_tokenonly,
+                                       int *distortion, int *skippable,
+                                       BLOCK_SIZE_TYPE bsize) {
   MB_PREDICTION_MODE mode;
   MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
   int64_t best_rd = INT64_MAX, this_rd;
@@ -2273,10 +1659,17 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi,
 
   for (mode = DC_PRED; mode <= TM_PRED; mode++) {
     x->e_mbd.mode_info_context->mbmi.uv_mode = mode;
-    vp9_build_intra_predictors_sbuv_s(&x->e_mbd);
+    if (bsize == BLOCK_SIZE_MB16X16) {
+      vp9_build_intra_predictors_mbuv(&x->e_mbd);
+    } else if (bsize == BLOCK_SIZE_SB32X32) {
+      vp9_build_intra_predictors_sbuv_s(&x->e_mbd);
+    } else {
+      assert(bsize == BLOCK_SIZE_SB64X64);
+      vp9_build_intra_predictors_sb64uv_s(&x->e_mbd);
+    }
 
     super_block_uvrd(&cpi->common, x, &this_rate_tokenonly,
-                     &this_distortion, &s);
+                     &this_distortion, &s, bsize);
     this_rate = this_rate_tokenonly +
                 x->intra_uv_mode_cost[x->e_mbd.frame_type][mode];
     this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
@@ -2296,43 +1689,6 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi,
   return best_rd;
 }
 
-static int64_t rd_pick_intra_sb64uv_mode(VP9_COMP *cpi,
-                                         MACROBLOCK *x,
-                                         int *rate,
-                                         int *rate_tokenonly,
-                                         int *distortion,
-                                         int *skippable) {
-  MB_PREDICTION_MODE mode;
-  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
-  int64_t best_rd = INT64_MAX, this_rd;
-  int this_rate_tokenonly, this_rate;
-  int this_distortion, s;
-
-  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
-    x->e_mbd.mode_info_context->mbmi.uv_mode = mode;
-    vp9_build_intra_predictors_sb64uv_s(&x->e_mbd);
-
-    super_block_64_uvrd(&cpi->common, x, &this_rate_tokenonly,
-                        &this_distortion, &s);
-    this_rate = this_rate_tokenonly +
-    x->intra_uv_mode_cost[x->e_mbd.frame_type][mode];
-    this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
-
-    if (this_rd < best_rd) {
-      mode_selected   = mode;
-      best_rd         = this_rd;
-      *rate           = this_rate;
-      *rate_tokenonly = this_rate_tokenonly;
-      *distortion     = this_distortion;
-      *skippable      = s;
-    }
-  }
-
-  x->e_mbd.mode_info_context->mbmi.uv_mode = mode_selected;
-
-  return best_rd;
-}
-
 int vp9_cost_mv_ref(VP9_COMP *cpi,
                     MB_PREDICTION_MODE m,
                     const int mode_context) {
@@ -3436,35 +2792,6 @@ static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
   memcpy(ctx->txfm_rd_diff, txfm_size_diff, sizeof(ctx->txfm_rd_diff));
 }
 
-static void inter_mode_cost(VP9_COMP *cpi, MACROBLOCK *x,
-                            int *rate2, int *distortion2, int *rate_y,
-                            int *distortion, int* rate_uv, int *distortion_uv,
-                            int *skippable, int64_t txfm_cache[NB_TXFM_MODES]) {
-  int y_skippable, uv_skippable;
-
-  // Y cost and distortion
-  macro_block_yrd(cpi, x, rate_y, distortion, &y_skippable, txfm_cache);
-
-  *rate2 += *rate_y;
-  *distortion2 += *distortion;
-
-  // UV cost and distortion
-  vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
-                    x->e_mbd.predictor, x->src.uv_stride);
-  if (x->e_mbd.mode_info_context->mbmi.txfm_size != TX_4X4 &&
-      x->e_mbd.mode_info_context->mbmi.mode != I8X8_PRED &&
-      x->e_mbd.mode_info_context->mbmi.mode != SPLITMV)
-    rd_inter16x16_uv_8x8(cpi, x, rate_uv, distortion_uv,
-                         &uv_skippable, 1);
-  else
-    rd_inter16x16_uv_4x4(cpi, x, rate_uv, distortion_uv,
-                         &uv_skippable, 1);
-
-  *rate2 += *rate_uv;
-  *distortion2 += *distortion_uv;
-  *skippable = y_skippable && uv_skippable;
-}
-
 static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
                                int idx, MV_REFERENCE_FRAME frame_type,
                                int block_size,
@@ -3569,7 +2896,7 @@ static void model_rd_from_var_lapndz(int var, int n, int qstep,
 }
 
 static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
-                                 enum BlockSize block_size,
+                                 BLOCK_SIZE_TYPE bsize,
                                  int *saddone, int near_sadidx[],
                                  int mdcounts[4], int64_t txfm_cache[],
                                  int *rate2, int *distortion, int *skippable,
@@ -3586,6 +2913,9 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                                 [MAX_REF_FRAMES],
                                  YV12_BUFFER_CONFIG *scaled_ref_frame,
                                  int mb_row, int mb_col) {
+  const enum BlockSize block_size =
+      (bsize == BLOCK_SIZE_MB16X16) ? BLOCK_16X16 :
+      (bsize == BLOCK_SIZE_SB32X32) ? BLOCK_32X32 : BLOCK_64X64;
   VP9_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
@@ -3755,7 +3085,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                  (mbmi->mv[1].as_mv.col & 15) == 0;
   // Search for best switchable filter by checking the variance of
   // pred error irrespective of whether the filter will be used
-  if (block_size == BLOCK_64X64) {
+  if (bsize == BLOCK_SIZE_SB64X64) {
     int switchable_filter_index, newbest;
     int tmp_rate_y_i = 0, tmp_rate_u_i = 0, tmp_rate_v_i = 0;
     int tmp_dist_y_i = 0, tmp_dist_u_i = 0, tmp_dist_v_i = 0;
@@ -3835,7 +3165,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
       interpolating_intpel_seen |=
         intpel_mv && vp9_is_interpolating_filter[mbmi->interp_filter];
     }
-  } else if (block_size == BLOCK_32X32) {
+  } else if (bsize == BLOCK_SIZE_SB32X32) {
     int switchable_filter_index, newbest;
     int tmp_rate_y_i = 0, tmp_rate_u_i = 0, tmp_rate_v_i = 0;
     int tmp_dist_y_i = 0, tmp_dist_u_i = 0, tmp_dist_v_i = 0;
@@ -3918,7 +3248,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     int switchable_filter_index, newbest;
     int tmp_rate_y_i = 0, tmp_rate_u_i = 0, tmp_rate_v_i = 0;
     int tmp_dist_y_i = 0, tmp_dist_u_i = 0, tmp_dist_v_i = 0;
-    assert(block_size == BLOCK_16X16);
+    assert(bsize == BLOCK_SIZE_MB16X16);
     for (switchable_filter_index = 0;
        switchable_filter_index < VP9_SWITCHABLE_FILTERS;
        ++switchable_filter_index) {
@@ -3997,7 +3327,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
 
   if (pred_exists) {
-    if (block_size == BLOCK_64X64) {
+    if (bsize == BLOCK_SIZE_SB64X64) {
       for (i = 0; i < 64; ++i)
         vpx_memcpy(xd->dst.y_buffer + i * xd->dst.y_stride,  tmp_ybuf + i * 64,
                    sizeof(unsigned char) * 64);
@@ -4007,7 +3337,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
       for (i = 0; i < 32; ++i)
         vpx_memcpy(xd->dst.v_buffer + i * xd->dst.uv_stride, tmp_vbuf + i * 32,
                    sizeof(unsigned char) * 32);
-    } else if (block_size == BLOCK_32X32) {
+    } else if (bsize == BLOCK_SIZE_SB32X32) {
       for (i = 0; i < 32; ++i)
         vpx_memcpy(xd->dst.y_buffer + i * xd->dst.y_stride,  tmp_ybuf + i * 64,
                    sizeof(unsigned char) * 32);
@@ -4025,9 +3355,9 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   } else {
     // Handles the special case when a filter that is not in the
     // switchable list (ex. bilinear, 6-tap) is indicated at the frame level
-    if (block_size == BLOCK_64X64) {
+    if (bsize == BLOCK_SIZE_SB64X64) {
       vp9_build_inter64x64_predictors_sb(xd, mb_row, mb_col);
-    } else if (block_size == BLOCK_32X32) {
+    } else if (bsize == BLOCK_SIZE_SB32X32) {
       vp9_build_inter32x32_predictors_sb(xd, mb_row, mb_col);
     } else {
       vp9_build_inter16x16_predictors_mb(xd, xd->predictor,
@@ -4053,14 +3383,14 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     if (threshold < x->encode_breakout)
       threshold = x->encode_breakout;
 
-    if (block_size == BLOCK_64X64) {
+    if (bsize == BLOCK_SIZE_SB64X64) {
       var = vp9_variance64x64(*(b->base_src), b->src_stride,
                               xd->dst.y_buffer, xd->dst.y_stride, &sse);
-    } else if (block_size == BLOCK_32X32) {
+    } else if (bsize == BLOCK_SIZE_SB32X32) {
       var = vp9_variance32x32(*(b->base_src), b->src_stride,
                               xd->dst.y_buffer, xd->dst.y_stride, &sse);
     } else {
-      assert(block_size == BLOCK_16X16);
+      assert(bsize == BLOCK_SIZE_MB16X16);
       var = vp9_variance16x16(*(b->base_src), b->src_stride,
                               xd->predictor, 16, &sse);
     }
@@ -4074,14 +3404,14 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
         // Check u and v to make sure skip is ok
         int sse2;
 
-        if (block_size == BLOCK_64X64) {
+        if (bsize == BLOCK_SIZE_SB64X64) {
           unsigned int sse2u, sse2v;
           var = vp9_variance32x32(x->src.u_buffer, x->src.uv_stride,
                                   xd->dst.u_buffer, xd->dst.uv_stride, &sse2u);
           var = vp9_variance32x32(x->src.v_buffer, x->src.uv_stride,
                                   xd->dst.v_buffer, xd->dst.uv_stride, &sse2v);
           sse2 = sse2u + sse2v;
-        } else if (block_size == BLOCK_32X32) {
+        } else if (bsize == BLOCK_SIZE_SB32X32) {
           unsigned int sse2u, sse2v;
           var = vp9_variance16x16(x->src.u_buffer, x->src.uv_stride,
                                   xd->dst.u_buffer, xd->dst.uv_stride, &sse2u);
@@ -4089,7 +3419,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                   xd->dst.v_buffer, xd->dst.uv_stride, &sse2v);
           sse2 = sse2u + sse2v;
         } else {
-          assert(block_size == BLOCK_16X16);
+          assert(bsize == BLOCK_SIZE_MB16X16);
           sse2 = vp9_uvsse(x);
         }
 
@@ -4110,42 +3440,20 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   }
 
   if (!x->skip) {
-    if (block_size == BLOCK_64X64) {
-      int skippable_y, skippable_uv;
-
-      // Y cost and distortion
-      super_block_64_yrd(cpi, x, rate_y, distortion_y,
-                         &skippable_y, txfm_cache);
-      *rate2 += *rate_y;
-      *distortion += *distortion_y;
-
-      rd_inter64x64_uv(cpi, x, rate_uv, distortion_uv,
-                       &skippable_uv);
-
-      *rate2 += *rate_uv;
-      *distortion += *distortion_uv;
-      *skippable = skippable_y && skippable_uv;
-    } else if (block_size == BLOCK_32X32) {
-      int skippable_y, skippable_uv;
-
-      // Y cost and distortion
-      super_block_yrd(cpi, x, rate_y, distortion_y,
-                      &skippable_y, txfm_cache);
-      *rate2 += *rate_y;
-      *distortion += *distortion_y;
-
-      rd_inter32x32_uv(cpi, x, rate_uv, distortion_uv,
-                       &skippable_uv);
-
-      *rate2 += *rate_uv;
-      *distortion += *distortion_uv;
-      *skippable = skippable_y && skippable_uv;
-    } else {
-      assert(block_size == BLOCK_16X16);
-      inter_mode_cost(cpi, x, rate2, distortion,
-                      rate_y, distortion_y, rate_uv, distortion_uv,
-                      skippable, txfm_cache);
-    }
+    int skippable_y, skippable_uv;
+
+    // Y cost and distortion
+    super_block_yrd(cpi, x, rate_y, distortion_y, &skippable_y,
+                    bsize, txfm_cache);
+    *rate2 += *rate_y;
+    *distortion += *distortion_y;
+
+    super_block_uvrd(cm, x, rate_uv, distortion_uv,
+                     &skippable_uv, bsize);
+
+    *rate2 += *rate_uv;
+    *distortion += *distortion_uv;
+    *skippable = skippable_y && skippable_uv;
   }
 
   if (!(*mode_excluded)) {
@@ -4201,17 +3509,13 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   int64_t best_overall_rd = INT64_MAX;
   INTERPOLATIONFILTERTYPE best_filter = SWITCHABLE;
   INTERPOLATIONFILTERTYPE tmp_best_filter = SWITCHABLE;
-  int uv_intra_rate, uv_intra_distortion, uv_intra_rate_tokenonly;
-  int uv_intra_skippable = 0;
-  int uv_intra_rate_8x8 = 0, uv_intra_distortion_8x8 = 0, uv_intra_rate_tokenonly_8x8 = 0;
-  int uv_intra_skippable_8x8 = 0;
+  int uv_intra_rate[2], uv_intra_distortion[2], uv_intra_rate_tokenonly[2];
+  int uv_intra_skippable[2];
+  MB_PREDICTION_MODE uv_intra_mode[2];
   int rate_y, UNINITIALIZED_IS_SAFE(rate_uv);
   int distortion_uv = INT_MAX;
   int64_t best_yrd = INT64_MAX;
 
-  MB_PREDICTION_MODE uv_intra_mode;
-  MB_PREDICTION_MODE uv_intra_mode_8x8 = 0;
-
   int near_sadidx[8] = {0, 1, 2, 3, 4, 5, 6, 7};
   int saddone = 0;
 
@@ -4280,18 +3584,14 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
 
   xd->mode_info_context->mbmi.mode = DC_PRED;
 
-  rd_pick_intra_mbuv_mode(cpi, x, &uv_intra_rate,
-                          &uv_intra_rate_tokenonly, &uv_intra_distortion,
-                          &uv_intra_skippable);
-  uv_intra_mode = mbmi->uv_mode;
-
-  /* rough estimate for now */
-  if (cpi->common.txfm_mode != ONLY_4X4) {
-    rd_pick_intra_mbuv_mode_8x8(cpi, x, &uv_intra_rate_8x8,
-                                &uv_intra_rate_tokenonly_8x8,
-                                &uv_intra_distortion_8x8,
-                                &uv_intra_skippable_8x8);
-    uv_intra_mode_8x8 = mbmi->uv_mode;
+  for (i = 0; i <= TX_8X8; i++) {
+    mbmi->txfm_size = i;
+    rd_pick_intra_sbuv_mode(cpi, x, &uv_intra_rate[i],
+                            &uv_intra_rate_tokenonly[i],
+                            &uv_intra_distortion[i],
+                            &uv_intra_skippable[i],
+                            BLOCK_SIZE_MB16X16);
+    uv_intra_mode[i] = mbmi->uv_mode;
   }
 
   // Get estimates of reference frame costs for each reference frame
@@ -4454,23 +3754,18 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
           mbmi->ref_frame = INTRA_FRAME;
           // FIXME compound intra prediction
           vp9_build_intra_predictors_mby(&x->e_mbd);
-          macro_block_yrd(cpi, x, &rate_y, &distortion, &skippable, txfm_cache);
+          super_block_yrd(cpi, x, &rate_y, &distortion, &skippable,
+                          BLOCK_SIZE_MB16X16, txfm_cache);
           rate2 += rate_y;
           distortion2 += distortion;
           rate2 += x->mbmode_cost[xd->frame_type][mbmi->mode];
-          if (mbmi->txfm_size != TX_4X4) {
-            rate2 += uv_intra_rate_8x8;
-            rate_uv = uv_intra_rate_tokenonly_8x8;
-            distortion2 += uv_intra_distortion_8x8;
-            distortion_uv = uv_intra_distortion_8x8;
-            skippable = skippable && uv_intra_skippable_8x8;
-          } else {
-            rate2 += uv_intra_rate;
-            rate_uv = uv_intra_rate_tokenonly;
-            distortion2 += uv_intra_distortion;
-            distortion_uv = uv_intra_distortion;
-            skippable = skippable && uv_intra_skippable;
-          }
+
+          rate2 += uv_intra_rate[mbmi->txfm_size != TX_4X4];
+          rate_uv = uv_intra_rate_tokenonly[mbmi->txfm_size != TX_4X4];
+          distortion2 += uv_intra_distortion[mbmi->txfm_size != TX_4X4];
+          distortion_uv = uv_intra_distortion[mbmi->txfm_size != TX_4X4];
+          skippable = skippable &&
+                      uv_intra_skippable[mbmi->txfm_size != TX_4X4];
           break;
         case B_PRED: {
           int64_t tmp_rd;
@@ -4485,10 +3780,10 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
           distortion2 += distortion;
 
           if (tmp_rd < best_yrd) {
-            rate2 += uv_intra_rate;
-            rate_uv = uv_intra_rate_tokenonly;
-            distortion2 += uv_intra_distortion;
-            distortion_uv = uv_intra_distortion;
+            rate2 += uv_intra_rate[TX_4X4];
+            rate_uv = uv_intra_rate_tokenonly[TX_4X4];
+            distortion2 += uv_intra_distortion[TX_4X4];
+            distortion_uv = uv_intra_distortion[TX_4X4];
           } else {
             this_rd = INT64_MAX;
             disable_skip = 1;
@@ -4508,10 +3803,10 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
           /* TODO: uv rate maybe over-estimated here since there is UV intra
                    mode coded in I8X8_PRED prediction */
           if (tmp_rd < best_yrd) {
-            rate2 += uv_intra_rate;
-            rate_uv = uv_intra_rate_tokenonly;
-            distortion2 += uv_intra_distortion;
-            distortion_uv = uv_intra_distortion;
+            rate2 += uv_intra_rate[TX_4X4];
+            rate_uv = uv_intra_rate_tokenonly[TX_4X4];
+            distortion2 += uv_intra_distortion[TX_4X4];
+            distortion_uv = uv_intra_distortion[TX_4X4];
           } else {
             this_rd = INT64_MAX;
             disable_skip = 1;
@@ -4636,8 +3931,8 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
         vp9_build_inter4x4_predictors_mbuv(&x->e_mbd, mb_row, mb_col);
         vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
                           x->e_mbd.predictor, x->src.uv_stride);
-        rd_inter16x16_uv_4x4(cpi, x, &rate_uv, &distortion_uv,
-                             &uv_skippable, 1);
+        super_block_uvrd_4x4(cm, x, &rate_uv, &distortion_uv,
+                             &uv_skippable, BLOCK_SIZE_MB16X16);
         rate2 += rate_uv;
         distortion2 += distortion_uv;
         skippable = skippable && uv_skippable;
@@ -4669,7 +3964,7 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
 #endif
       }
 #endif
-      this_rd = handle_inter_mode(cpi, x, BLOCK_16X16,
+      this_rd = handle_inter_mode(cpi, x, BLOCK_SIZE_MB16X16,
                                   &saddone, near_sadidx, mdcounts, txfm_cache,
                                   &rate2, &distortion2, &skippable,
                                   &compmode_cost,
@@ -4759,8 +4054,7 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
       best_intra16_rd = this_rd;
       best_intra16_mode = this_mode;
 #if SEPARATE_INTERINTRA_UV
-      best_intra16_uv_mode = (mbmi->txfm_size != TX_4X4 ?
-                              uv_intra_mode_8x8 : uv_intra_mode);
+      best_intra16_uv_mode = uv_intra_mode[mbmi->txfm_size != TX_4X4];
 #endif
     }
 #endif
@@ -4793,9 +4087,9 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
           if (mbmi->txfm_size != TX_4X4
               && this_mode != B_PRED
               && this_mode != I8X8_PRED)
-            mbmi->uv_mode = uv_intra_mode_8x8;
+            mbmi->uv_mode = uv_intra_mode[TX_8X8];
           else
-            mbmi->uv_mode = uv_intra_mode;
+            mbmi->uv_mode = uv_intra_mode[TX_4X4];
           /* required for left and above block mv */
           mbmi->mv[0].as_int = 0;
         }
@@ -4997,9 +4291,9 @@ end:
                        best_pred_diff, best_txfm_diff);
 }
 
-void vp9_rd_pick_intra_mode_sb32(VP9_COMP *cpi, MACROBLOCK *x,
-                                 int *returnrate,
-                                 int *returndist) {
+void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
+                               int *returnrate, int *returndist,
+                               BLOCK_SIZE_TYPE bsize) {
   VP9_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
   int rate_y = 0, rate_uv;
@@ -5011,58 +4305,32 @@ void vp9_rd_pick_intra_mode_sb32(VP9_COMP *cpi, MACROBLOCK *x,
 
   xd->mode_info_context->mbmi.mode = DC_PRED;
   err = rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,
-                               &dist_y, &y_skip, txfm_cache);
+                               &dist_y, &y_skip, bsize, txfm_cache);
   rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
-                          &dist_uv, &uv_skip);
+                          &dist_uv, &uv_skip, bsize);
 
   if (cpi->common.mb_no_coeff_skip && y_skip && uv_skip) {
     *returnrate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly +
                   vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 1);
     *returndist = dist_y + (dist_uv >> 2);
-    memset(x->sb32_context[xd->sb_index].txfm_rd_diff, 0,
-           sizeof(x->sb32_context[xd->sb_index].txfm_rd_diff));
-  } else {
-    *returnrate = rate_y + rate_uv;
-    if (cpi->common.mb_no_coeff_skip)
-      *returnrate += vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
-    *returndist = dist_y + (dist_uv >> 2);
-    for (i = 0; i < NB_TXFM_MODES; i++) {
-      x->sb32_context[xd->sb_index].txfm_rd_diff[i] = err - txfm_cache[i];
+    if (bsize == BLOCK_SIZE_SB32X32) {
+      memset(x->sb32_context[xd->sb_index].txfm_rd_diff, 0,
+             sizeof(x->sb32_context[xd->sb_index].txfm_rd_diff));
+    } else {
+      memset(x->sb64_context.txfm_rd_diff, 0,
+             sizeof(x->sb64_context.txfm_rd_diff));
     }
-  }
-}
-
-void vp9_rd_pick_intra_mode_sb64(VP9_COMP *cpi, MACROBLOCK *x,
-                                 int *returnrate,
-                                 int *returndist) {
-  VP9_COMMON *cm = &cpi->common;
-  MACROBLOCKD *xd = &x->e_mbd;
-  int rate_y = 0, rate_uv;
-  int rate_y_tokenonly = 0, rate_uv_tokenonly;
-  int dist_y = 0, dist_uv;
-  int y_skip = 0, uv_skip;
-  int64_t txfm_cache[NB_TXFM_MODES], err;
-  int i;
-
-  xd->mode_info_context->mbmi.mode = DC_PRED;
-  err = rd_pick_intra_sb64y_mode(cpi, x, &rate_y, &rate_y_tokenonly,
-                                 &dist_y, &y_skip, txfm_cache);
-  rd_pick_intra_sb64uv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
-                            &dist_uv, &uv_skip);
-
-  if (cpi->common.mb_no_coeff_skip && y_skip && uv_skip) {
-    *returnrate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly +
-    vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 1);
-    *returndist = dist_y + (dist_uv >> 2);
-    memset(x->sb64_context.txfm_rd_diff, 0,
-           sizeof(x->sb64_context.txfm_rd_diff));
   } else {
     *returnrate = rate_y + rate_uv;
-    if (cm->mb_no_coeff_skip)
+    if (cpi->common.mb_no_coeff_skip)
       *returnrate += vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
     *returndist = dist_y + (dist_uv >> 2);
     for (i = 0; i < NB_TXFM_MODES; i++) {
-      x->sb64_context.txfm_rd_diff[i] = err - txfm_cache[i];
+      if (bsize == BLOCK_SIZE_SB32X32) {
+        x->sb32_context[xd->sb_index].txfm_rd_diff[i] = err - txfm_cache[i];
+      } else {
+        x->sb64_context.txfm_rd_diff[i] = err - txfm_cache[i];
+      }
     }
   }
 }
@@ -5073,19 +4341,19 @@ void vp9_rd_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x,
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
   int64_t error4x4, error16x16;
-  int rate4x4, rate16x16 = 0, rateuv, rateuv8x8;
-  int dist4x4 = 0, dist16x16 = 0, distuv = 0, distuv8x8 = 0;
+  int rate4x4, rate16x16 = 0, rateuv[2];
+  int dist4x4 = 0, dist16x16 = 0, distuv[2];
   int rate;
   int rate4x4_tokenonly = 0;
   int rate16x16_tokenonly = 0;
-  int rateuv_tokenonly = 0, rateuv8x8_tokenonly = 0;
+  int rateuv_tokenonly[2];
   int64_t error8x8;
   int rate8x8_tokenonly=0;
   int rate8x8, dist8x8;
   int mode16x16;
   int mode8x8[4];
   int dist;
-  int modeuv, modeuv8x8, uv_intra_skippable, uv_intra_skippable_8x8;
+  int modeuv[2], uv_intra_skippable[2];
   int y_intra16x16_skippable = 0;
   int64_t txfm_cache[2][NB_TXFM_MODES];
   TX_SIZE txfm_size_16x16, txfm_size_8x8;
@@ -5093,31 +4361,24 @@ void vp9_rd_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x,
 
   mbmi->ref_frame = INTRA_FRAME;
   mbmi->mode = DC_PRED;
-  rd_pick_intra_mbuv_mode(cpi, x, &rateuv, &rateuv_tokenonly, &distuv,
-                          &uv_intra_skippable);
-  modeuv = mbmi->uv_mode;
-  if (cpi->common.txfm_mode != ONLY_4X4) {
-    rd_pick_intra_mbuv_mode_8x8(cpi, x, &rateuv8x8, &rateuv8x8_tokenonly,
-                                &distuv8x8, &uv_intra_skippable_8x8);
-    modeuv8x8 = mbmi->uv_mode;
-  } else {
-    uv_intra_skippable_8x8 = uv_intra_skippable;
-    rateuv8x8 = rateuv;
-    distuv8x8 = distuv;
-    rateuv8x8_tokenonly = rateuv_tokenonly;
-    modeuv8x8 = modeuv;
+  for (i = 0; i <= TX_8X8; i++) {
+    mbmi->txfm_size = i;
+    rd_pick_intra_sbuv_mode(cpi, x, &rateuv[i], &rateuv_tokenonly[i],
+                            &distuv[i], &uv_intra_skippable[i],
+                            BLOCK_SIZE_MB16X16);
+    modeuv[i] = mbmi->uv_mode;
   }
 
   // current macroblock under rate-distortion optimization test loop
-  error16x16 = rd_pick_intra16x16mby_mode(cpi, x, &rate16x16,
-                                          &rate16x16_tokenonly, &dist16x16,
-                                          &y_intra16x16_skippable,
-                                          txfm_cache[1]);
+  error16x16 = rd_pick_intra_sby_mode(cpi, x, &rate16x16,
+                                      &rate16x16_tokenonly, &dist16x16,
+                                      &y_intra16x16_skippable,
+                                      BLOCK_SIZE_MB16X16, txfm_cache[1]);
   mode16x16 = mbmi->mode;
   txfm_size_16x16 = mbmi->txfm_size;
   if (cpi->common.mb_no_coeff_skip && y_intra16x16_skippable &&
-      ((cm->txfm_mode == ONLY_4X4 && uv_intra_skippable) ||
-       (cm->txfm_mode != ONLY_4X4 && uv_intra_skippable_8x8))) {
+      ((cm->txfm_mode == ONLY_4X4 && uv_intra_skippable[TX_4X4]) ||
+       (cm->txfm_mode != ONLY_4X4 && uv_intra_skippable[TX_8X8]))) {
     error16x16 -= RDCOST(x->rdmult, x->rddiv, rate16x16_tokenonly, 0);
     rate16x16 -= rate16x16_tokenonly;
   }
@@ -5148,48 +4409,46 @@ void vp9_rd_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x,
 
   mbmi->mb_skip_coeff = 0;
   if (cpi->common.mb_no_coeff_skip && y_intra16x16_skippable &&
-      ((cm->txfm_mode == ONLY_4X4 && uv_intra_skippable) ||
-       (cm->txfm_mode != ONLY_4X4 && uv_intra_skippable_8x8))) {
+      ((cm->txfm_mode == ONLY_4X4 && uv_intra_skippable[TX_4X4]) ||
+       (cm->txfm_mode != ONLY_4X4 && uv_intra_skippable[TX_8X8]))) {
     mbmi->mb_skip_coeff = 1;
     mbmi->mode = mode16x16;
-    mbmi->uv_mode = (cm->txfm_mode == ONLY_4X4) ? modeuv : modeuv8x8;
+    mbmi->uv_mode = modeuv[cm->txfm_mode != ONLY_4X4];
     rate = rate16x16 + vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 1);
     dist = dist16x16;
-    if (cm->txfm_mode == ONLY_4X4) {
-      rate += rateuv - rateuv_tokenonly;
-      dist += (distuv >> 2);
-    } else {
-      rate += rateuv8x8 - rateuv8x8_tokenonly;
-      dist += (distuv8x8 >> 2);
-    }
-
+    rate += rateuv[cm->txfm_mode != ONLY_4X4] -
+            rateuv_tokenonly[cm->txfm_mode != ONLY_4X4];
+    dist += (distuv[cm->txfm_mode != ONLY_4X4] >> 2);
     mbmi->txfm_size = txfm_size_16x16;
   } else if (error8x8 > error16x16) {
     if (error4x4 < error16x16) {
-      rate = rateuv + rate4x4;
+      rate = rateuv[TX_4X4] + rate4x4;
       mbmi->mode = B_PRED;
       mbmi->txfm_size = TX_4X4;
-      dist = dist4x4 + (distuv >> 2);
+      dist = dist4x4 + (distuv[TX_4X4] >> 2);
+      mbmi->uv_mode = modeuv[TX_4X4];
     } else {
       mbmi->txfm_size = txfm_size_16x16;
       mbmi->mode = mode16x16;
-      rate = rate16x16 + rateuv8x8;
-      dist = dist16x16 + (distuv8x8 >> 2);
+      rate = rate16x16 + rateuv[mbmi->txfm_size != TX_4X4];
+      dist = dist16x16 + (distuv[mbmi->txfm_size != TX_4X4] >> 2);
+      mbmi->uv_mode = modeuv[mbmi->txfm_size != TX_4X4];
     }
     if (cpi->common.mb_no_coeff_skip)
       rate += vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
   } else {
     if (error4x4 < error8x8) {
-      rate = rateuv + rate4x4;
+      rate = rateuv[TX_4X4] + rate4x4;
       mbmi->mode = B_PRED;
       mbmi->txfm_size = TX_4X4;
-      dist = dist4x4 + (distuv >> 2);
+      dist = dist4x4 + (distuv[TX_4X4] >> 2);
+      mbmi->uv_mode = modeuv[TX_4X4];
     } else {
       mbmi->mode = I8X8_PRED;
       mbmi->txfm_size = txfm_size_8x8;
       set_i8x8_block_modes(x, mode8x8);
-      rate = rate8x8 + rateuv;
-      dist = dist8x8 + (distuv >> 2);
+      rate = rate8x8 + rateuv[TX_4X4];
+      dist = dist8x8 + (distuv[TX_4X4] >> 2);
     }
     if (cpi->common.mb_no_coeff_skip)
       rate += vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
@@ -5204,11 +4463,13 @@ void vp9_rd_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x,
   *returndist = dist;
 }
 
-static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
-                                         int mb_row, int mb_col,
-                                         int *returnrate,
-                                         int *returndistortion,
-                                         int block_size) {
+int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
+                                  int mb_row, int mb_col,
+                                  int *returnrate,
+                                  int *returndistortion,
+                                  BLOCK_SIZE_TYPE bsize) {
+  const int block_size = (bsize == BLOCK_SIZE_SB64X64) ?
+                          BLOCK_64X64 : BLOCK_32X32;
   VP9_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
@@ -5248,13 +4509,9 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   int64_t best_overall_rd = INT64_MAX;
   INTERPOLATIONFILTERTYPE best_filter = SWITCHABLE;
   INTERPOLATIONFILTERTYPE tmp_best_filter = SWITCHABLE;
-  int rate_uv_4x4 = 0, rate_uv_8x8 = 0, rate_uv_tokenonly_4x4 = 0,
-      rate_uv_tokenonly_8x8 = 0;
-  int dist_uv_4x4 = 0, dist_uv_8x8 = 0, uv_skip_4x4 = 0, uv_skip_8x8 = 0;
-  MB_PREDICTION_MODE mode_uv_4x4 = NEARESTMV, mode_uv_8x8 = NEARESTMV;
-  int rate_uv_16x16 = 0, rate_uv_tokenonly_16x16 = 0;
-  int dist_uv_16x16 = 0, uv_skip_16x16 = 0;
-  MB_PREDICTION_MODE mode_uv_16x16 = NEARESTMV;
+  int rate_uv_intra[TX_SIZE_MAX_SB], rate_uv_tokenonly[TX_SIZE_MAX_SB];
+  int dist_uv[TX_SIZE_MAX_SB], skip_uv[TX_SIZE_MAX_SB];
+  MB_PREDICTION_MODE mode_uv[TX_SIZE_MAX_SB];
   struct scale_factors scale_factor[4];
 
   xd->mode_info_context->mbmi.segment_id = segment_id;
@@ -5277,48 +4534,12 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     frame_mv[ZEROMV][ref_frame].as_int = 0;
   }
 
-  if (block_size == BLOCK_64X64) {
-    mbmi->mode = DC_PRED;
-    if (cm->txfm_mode == ONLY_4X4 || cm->txfm_mode == TX_MODE_SELECT) {
-      mbmi->txfm_size = TX_4X4;
-      rd_pick_intra_sb64uv_mode(cpi, x, &rate_uv_4x4, &rate_uv_tokenonly_4x4,
-                                &dist_uv_4x4, &uv_skip_4x4);
-      mode_uv_4x4 = mbmi->uv_mode;
-    }
-    if (cm->txfm_mode != ONLY_4X4) {
-      mbmi->txfm_size = TX_8X8;
-      rd_pick_intra_sb64uv_mode(cpi, x, &rate_uv_8x8, &rate_uv_tokenonly_8x8,
-                                &dist_uv_8x8, &uv_skip_8x8);
-      mode_uv_8x8 = mbmi->uv_mode;
-    }
-    if (cm->txfm_mode >= ALLOW_32X32) {
-      mbmi->txfm_size = TX_32X32;
-      rd_pick_intra_sb64uv_mode(cpi, x, &rate_uv_16x16,
-                                &rate_uv_tokenonly_16x16,
-                                &dist_uv_16x16, &uv_skip_16x16);
-      mode_uv_16x16 = mbmi->uv_mode;
-    }
-  } else {
-    assert(block_size == BLOCK_32X32);
-    mbmi->mode = DC_PRED;
-    if (cm->txfm_mode == ONLY_4X4 || cm->txfm_mode == TX_MODE_SELECT) {
-      mbmi->txfm_size = TX_4X4;
-      rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_4x4, &rate_uv_tokenonly_4x4,
-                              &dist_uv_4x4, &uv_skip_4x4);
-      mode_uv_4x4 = mbmi->uv_mode;
-    }
-    if (cm->txfm_mode != ONLY_4X4) {
-      mbmi->txfm_size = TX_8X8;
-      rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_8x8, &rate_uv_tokenonly_8x8,
-                              &dist_uv_8x8, &uv_skip_8x8);
-      mode_uv_8x8 = mbmi->uv_mode;
-    }
-    if (cm->txfm_mode >= ALLOW_32X32) {
-      mbmi->txfm_size = TX_32X32;
-      rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_16x16, &rate_uv_tokenonly_16x16,
-                              &dist_uv_16x16, &uv_skip_16x16);
-      mode_uv_16x16 = mbmi->uv_mode;
-    }
+  mbmi->mode = DC_PRED;
+  for (i = 0; i <= ((bsize < BLOCK_SIZE_SB64X64) ? TX_16X16 : TX_32X32); i++) {
+    mbmi->txfm_size = i;
+    rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_intra[i], &rate_uv_tokenonly[i],
+                            &dist_uv[i], &skip_uv[i], bsize);
+    mode_uv[i] = mbmi->uv_mode;
   }
 
   for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
@@ -5433,32 +4654,27 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     }
 
     if (ref_frame == INTRA_FRAME) {
-      if (block_size == BLOCK_64X64) {
+      TX_SIZE uv_tx;
+
+      if (bsize == BLOCK_SIZE_SB64X64) {
         vp9_build_intra_predictors_sb64y_s(xd);
-        super_block_64_yrd(cpi, x, &rate_y, &distortion_y,
-                           &skippable, txfm_cache);
       } else {
-        assert(block_size == BLOCK_32X32);
+        assert(bsize == BLOCK_SIZE_SB32X32);
         vp9_build_intra_predictors_sby_s(xd);
-        super_block_yrd(cpi, x, &rate_y, &distortion_y,
-                        &skippable, txfm_cache);
-      }
-      if (mbmi->txfm_size == TX_4X4) {
-        rate_uv = rate_uv_4x4;
-        distortion_uv = dist_uv_4x4;
-        skippable = skippable && uv_skip_4x4;
-        mbmi->uv_mode = mode_uv_4x4;
-      } else if (mbmi->txfm_size == TX_32X32) {
-        rate_uv = rate_uv_16x16;
-        distortion_uv = dist_uv_16x16;
-        skippable = skippable && uv_skip_16x16;
-        mbmi->uv_mode = mode_uv_16x16;
-      } else {
-        rate_uv = rate_uv_8x8;
-        distortion_uv = dist_uv_8x8;
-        skippable = skippable && uv_skip_8x8;
-        mbmi->uv_mode = mode_uv_8x8;
       }
+      super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable,
+                      bsize, txfm_cache);
+
+      uv_tx = mbmi->txfm_size;
+      if (bsize < BLOCK_SIZE_SB32X32 && uv_tx == TX_16X16)
+        uv_tx = TX_8X8;
+      else if (bsize < BLOCK_SIZE_SB64X64 && uv_tx == TX_32X32)
+        uv_tx = TX_16X16;
+
+      rate_uv = rate_uv_intra[uv_tx];
+      distortion_uv = dist_uv[uv_tx];
+      skippable = skippable && skip_uv[uv_tx];
+      mbmi->uv_mode = mode_uv[uv_tx];
 
       rate2 = rate_y + x->mbmode_cost[cm->frame_type][mbmi->mode] + rate_uv;
       distortion2 = distortion_y + distortion_uv;
@@ -5488,7 +4704,7 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
 #endif
       }
 #endif
-      this_rd = handle_inter_mode(cpi, x, block_size,
+      this_rd = handle_inter_mode(cpi, x, bsize,
                                   &saddone, near_sadidx, mdcounts, txfm_cache,
                                   &rate2, &distortion2, &skippable,
                                   &compmode_cost,
@@ -5770,22 +4986,6 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   return best_rd;
 }
 
-int64_t vp9_rd_pick_inter_mode_sb32(VP9_COMP *cpi, MACROBLOCK *x,
-                                    int mb_row, int mb_col,
-                                    int *returnrate,
-                                    int *returndistortion) {
-  return vp9_rd_pick_inter_mode_sb(cpi, x, mb_row, mb_col,
-                                   returnrate, returndistortion, BLOCK_32X32);
-}
-
-int64_t vp9_rd_pick_inter_mode_sb64(VP9_COMP *cpi, MACROBLOCK *x,
-                                    int mb_row, int mb_col,
-                                    int *returnrate,
-                                    int *returndistortion) {
-  return vp9_rd_pick_inter_mode_sb(cpi, x, mb_row, mb_col,
-                                   returnrate, returndistortion, BLOCK_64X64);
-}
-
 void vp9_pick_mode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x,
                                     int mb_row, int mb_col,
                                     int *totalrate, int *totaldist) {
index d1b4777..5a5303c 100644 (file)
@@ -22,23 +22,16 @@ void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex);
 void vp9_rd_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x,
                             int *r, int *d);
 
-void vp9_rd_pick_intra_mode_sb32(VP9_COMP *cpi, MACROBLOCK *x,
-                                 int *r, int *d);
-
-void vp9_rd_pick_intra_mode_sb64(VP9_COMP *cpi, MACROBLOCK *x,
-                                 int *r, int *d);
+void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
+                               int *r, int *d, BLOCK_SIZE_TYPE bsize);
 
 void vp9_pick_mode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x,
                                     int mb_row, int mb_col,
                                     int *r, int *d);
 
-int64_t vp9_rd_pick_inter_mode_sb32(VP9_COMP *cpi, MACROBLOCK *x,
-                                    int mb_row, int mb_col,
-                                    int *r, int *d);
-
-int64_t vp9_rd_pick_inter_mode_sb64(VP9_COMP *cpi, MACROBLOCK *x,
-                                    int mb_row, int mb_col,
-                                    int *r, int *d);
+int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
+                                  int mb_row, int mb_col,
+                                  int *r, int *d, BLOCK_SIZE_TYPE bsize);
 
 void vp9_init_me_luts();
 
index 51314a7..734cb61 100644 (file)
@@ -123,140 +123,3 @@ sym(vp9_block_error_mmx):
     UNSHADOW_ARGS
     pop         rbp
     ret
-
-
-;int vp9_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr);
-global sym(vp9_mbblock_error_mmx_impl) PRIVATE
-sym(vp9_mbblock_error_mmx_impl):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 3
-    push rsi
-    push rdi
-    ; end prolog
-
-
-        mov         rsi,        arg(0) ;coeff_ptr
-        pxor        mm7,        mm7
-
-        mov         rdi,        arg(1) ;dcoef_ptr
-        pxor        mm2,        mm2
-
-        mov         rcx,        16
-
-.mberror_loop_mmx:
-        movq        mm3,       [rsi]
-        movq        mm4,       [rdi]
-
-        movq        mm5,       [rsi+8]
-        movq        mm6,       [rdi+8]
-
-
-        psubw       mm5,        mm6
-        pmaddwd     mm5,        mm5
-
-        psubw       mm3,        mm4
-
-        pmaddwd     mm3,        mm3
-        paddd       mm2,        mm5
-
-        paddd       mm2,        mm3
-        movq        mm3,       [rsi+16]
-
-        movq        mm4,       [rdi+16]
-        movq        mm5,       [rsi+24]
-
-        movq        mm6,       [rdi+24]
-        psubw       mm5,        mm6
-
-        pmaddwd     mm5,        mm5
-        psubw       mm3,        mm4
-
-        pmaddwd     mm3,        mm3
-        paddd       mm2,        mm5
-
-        paddd       mm2,        mm3
-        add         rsi,        32
-
-        add         rdi,        32
-        sub         rcx,        1
-
-        jnz         .mberror_loop_mmx
-
-        movq        mm0,        mm2
-        psrlq       mm2,        32
-
-        paddd       mm0,        mm2
-        movq        rax,        mm0
-
-    pop rdi
-    pop rsi
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;int vp9_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr);
-global sym(vp9_mbblock_error_xmm_impl) PRIVATE
-sym(vp9_mbblock_error_xmm_impl):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 3
-    SAVE_XMM 5
-    push rsi
-    push rdi
-    ; end prolog
-
-
-        mov         rsi,        arg(0) ;coeff_ptr
-        pxor        xmm5,       xmm5
-
-        mov         rdi,        arg(1) ;dcoef_ptr
-        pxor        xmm4,       xmm4
-
-        mov         rcx,        16
-
-.mberror_loop:
-        movdqa      xmm0,       [rsi]
-        movdqa      xmm1,       [rdi]
-
-        movdqa      xmm2,       [rsi+16]
-        movdqa      xmm3,       [rdi+16]
-
-
-        psubw       xmm2,       xmm3
-        pmaddwd     xmm2,       xmm2
-
-        psubw       xmm0,       xmm1
-
-        pmaddwd     xmm0,       xmm0
-        add         rsi,        32
-
-        add         rdi,        32
-
-        sub         rcx,        1
-        paddd       xmm4,       xmm2
-
-        paddd       xmm4,       xmm0
-        jnz         .mberror_loop
-
-        movdqa      xmm0,       xmm4
-        punpckldq   xmm0,       xmm5
-
-        punpckhdq   xmm4,       xmm5
-        paddd       xmm0,       xmm4
-
-        movdqa      xmm1,       xmm0
-        psrldq      xmm0,       8
-
-        paddd       xmm0,       xmm1
-        movq        rax,        xmm0
-
-    pop rdi
-    pop rsi
-    ; begin epilog
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
index 9557af1..310f0d9 100644 (file)
@@ -23,13 +23,6 @@ void vp9_short_fdct8x4_mmx(short *input, short *output, int pitch) {
   vp9_short_fdct4x4_mmx(input + 4, output + 16, pitch);
 }
 
-int vp9_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr);
-int vp9_mbblock_error_mmx(MACROBLOCK *mb) {
-  short *coeff_ptr =  mb->block[0].coeff;
-  short *dcoef_ptr =  mb->e_mbd.plane[0].dqcoeff;
-  return vp9_mbblock_error_mmx_impl(coeff_ptr, dcoef_ptr);
-}
-
 void vp9_subtract_b_mmx_impl(unsigned char *z,  int src_stride,
                              short *diff, unsigned char *predictor,
                              int pitch);
@@ -44,13 +37,6 @@ void vp9_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch) {
 #endif
 
 #if HAVE_SSE2
-int vp9_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr);
-int vp9_mbblock_error_xmm(MACROBLOCK *mb) {
-  short *coeff_ptr =  mb->block[0].coeff;
-  short *dcoef_ptr =  mb->e_mbd.plane[0].dqcoeff;
-  return vp9_mbblock_error_xmm_impl(coeff_ptr, dcoef_ptr);
-}
-
 void vp9_subtract_b_sse2_impl(unsigned char *z,  int src_stride,
                               short *diff, unsigned char *predictor,
                               int pitch);