src/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rdopt.c

   1 /*
   2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
   3  *
   4  *  Use of this source code is governed by a BSD-style license
   5  *  that can be found in the LICENSE file in the root of the source
   6  *  tree. An additional intellectual property rights grant can be found
   7  *  in the file PATENTS.  All contributing project authors may
   8  *  be found in the AUTHORS file in the root of the source tree.
   9  */
  10
  11 #include <assert.h>
  12 #include <math.h>
  13
  14 #include "./vp9_rtcd.h"
  15
  16 #include "vpx_mem/vpx_mem.h"
  17
  18 #include "vp9/common/vp9_common.h"
  19 #include "vp9/common/vp9_entropy.h"
  20 #include "vp9/common/vp9_entropymode.h"
  21 #include "vp9/common/vp9_idct.h"
  22 #include "vp9/common/vp9_mvref_common.h"
  23 #include "vp9/common/vp9_pred_common.h"
  24 #include "vp9/common/vp9_quant_common.h"
  25 #include "vp9/common/vp9_reconinter.h"
  26 #include "vp9/common/vp9_reconintra.h"
  27 #include "vp9/common/vp9_seg_common.h"
  28 #include "vp9/common/vp9_systemdependent.h"
  29
  30 #include "vp9/encoder/vp9_cost.h"
  31 #include "vp9/encoder/vp9_encodemb.h"
  32 #include "vp9/encoder/vp9_encodemv.h"
  33 #include "vp9/encoder/vp9_encoder.h"
  34 #include "vp9/encoder/vp9_mcomp.h"
  35 #include "vp9/encoder/vp9_quantize.h"
  36 #include "vp9/encoder/vp9_ratectrl.h"
  37 #include "vp9/encoder/vp9_rd.h"
  38 #include "vp9/encoder/vp9_rdopt.h"
  39 #include "vp9/encoder/vp9_variance.h"
  40
// RD threshold tuning constants.
// NOTE(review): the code that consumes these is outside this section of the
// file; confirm their exact meaning there.
#define RD_THRESH_MAX_FACT 64
#define RD_THRESH_INC      1

// Per-reference-frame bit masks whose bit positions line up with the indices
// of vp9_mode_order[]: every entry that uses the named reference frame has
// its bit cleared (bits 0-3 are clear in all three masks).
// NOTE(review): the code applying these masks is not in this section --
// presumably a set bit marks a mode to be skipped; confirm at the use site.
#define LAST_FRAME_MODE_MASK    0xFFEDCD60
#define GOLDEN_FRAME_MODE_MASK  0xFFDA3BB0
#define ALT_REF_MODE_MASK       0xFFC648D0

// NOTE(review): used outside this section; presumably the lowest mode index
// at which early termination may kick in -- confirm.
#define MIN_EARLY_TERM_INDEX    3
  49
// One entry of vp9_mode_order[]: a prediction mode together with the pair of
// reference frames it uses. In the table below the second reference is NONE
// for entries that name only one frame.
typedef struct {
  PREDICTION_MODE mode;
  MV_REFERENCE_FRAME ref_frame[2];
} MODE_DEFINITION;
  54
// One entry of vp9_ref_order[]: a pair of reference frames with no
// prediction mode attached.
typedef struct {
  MV_REFERENCE_FRAME ref_frame[2];
} REF_DEFINITION;
  58
// State handed (as the void *arg) to block_rd_txfm() while
// txfm_rd_in_plane() walks the transform blocks of one plane.
struct rdcost_block_args {
  MACROBLOCK *x;
  // Working copies of the above/left entropy contexts, filled by
  // vp9_get_entropy_contexts() and updated by cost_coeffs().
  ENTROPY_CONTEXT t_above[16];
  ENTROPY_CONTEXT t_left[16];
  // Rate / distortion / SSE of the transform block processed most recently.
  int rate;
  int64_t dist;
  int64_t sse;
  // Running totals over all transform blocks processed so far.
  int this_rate;
  int64_t this_dist;
  int64_t this_sse;
  int64_t this_rd;
  // RD budget: once this_rd exceeds it, skip is set.
  int64_t best_rd;
  // Non-zero once the budget was exceeded; later blocks return immediately.
  int skip;
  // Forwarded unchanged to cost_coeffs().
  int use_fast_coef_costing;
  // Scan order (scan + neighbors tables) used for coefficient costing.
  const scan_order *so;
};
  75
// Table of candidate (prediction mode, reference frame pair) combinations.
// The bit positions of LAST_FRAME_MODE_MASK, GOLDEN_FRAME_MODE_MASK and
// ALT_REF_MODE_MASK line up with the indices of this table.
// NOTE(review): the loop that iterates this table is outside this section;
// presumably the order here is the order in which modes are evaluated.
static const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
  {NEARESTMV, {LAST_FRAME,   NONE}},
  {NEARESTMV, {ALTREF_FRAME, NONE}},
  {NEARESTMV, {GOLDEN_FRAME, NONE}},

  {DC_PRED,   {INTRA_FRAME,  NONE}},

  {NEWMV,     {LAST_FRAME,   NONE}},
  {NEWMV,     {ALTREF_FRAME, NONE}},
  {NEWMV,     {GOLDEN_FRAME, NONE}},

  {NEARMV,    {LAST_FRAME,   NONE}},
  {NEARMV,    {ALTREF_FRAME, NONE}},
  {NEARESTMV, {LAST_FRAME,   ALTREF_FRAME}},
  {NEARESTMV, {GOLDEN_FRAME, ALTREF_FRAME}},

  {TM_PRED,   {INTRA_FRAME,  NONE}},

  {NEARMV,    {LAST_FRAME,   ALTREF_FRAME}},
  {NEWMV,     {LAST_FRAME,   ALTREF_FRAME}},
  {NEARMV,    {GOLDEN_FRAME, NONE}},
  {NEARMV,    {GOLDEN_FRAME, ALTREF_FRAME}},
  {NEWMV,     {GOLDEN_FRAME, ALTREF_FRAME}},

  {ZEROMV,    {LAST_FRAME,   NONE}},
  {ZEROMV,    {GOLDEN_FRAME, NONE}},
  {ZEROMV,    {ALTREF_FRAME, NONE}},
  {ZEROMV,    {LAST_FRAME,   ALTREF_FRAME}},
  {ZEROMV,    {GOLDEN_FRAME, ALTREF_FRAME}},

  {H_PRED,    {INTRA_FRAME,  NONE}},
  {V_PRED,    {INTRA_FRAME,  NONE}},
  {D135_PRED, {INTRA_FRAME,  NONE}},
  {D207_PRED, {INTRA_FRAME,  NONE}},
  {D153_PRED, {INTRA_FRAME,  NONE}},
  {D63_PRED,  {INTRA_FRAME,  NONE}},
  {D117_PRED, {INTRA_FRAME,  NONE}},
  {D45_PRED,  {INTRA_FRAME,  NONE}},
};
 115
// Table of reference frame combinations: the three single references, the
// two pairs that include ALTREF_FRAME, and finally intra.
// NOTE(review): the consumer of this table is outside this section.
static const REF_DEFINITION vp9_ref_order[MAX_REFS] = {
  {{LAST_FRAME,   NONE}},
  {{GOLDEN_FRAME, NONE}},
  {{ALTREF_FRAME, NONE}},
  {{LAST_FRAME,   ALTREF_FRAME}},
  {{GOLDEN_FRAME, ALTREF_FRAME}},
  {{INTRA_FRAME,  NONE}},
};
 124
 125 static int raster_block_offset(BLOCK_SIZE plane_bsize,
 126                                int raster_block, int stride) {
 127   const int bw = b_width_log2(plane_bsize);
 128   const int y = 4 * (raster_block >> bw);
 129   const int x = 4 * (raster_block & ((1 << bw) - 1));
 130   return y * stride + x;
 131 }
 132 static int16_t* raster_block_offset_int16(BLOCK_SIZE plane_bsize,
 133                                           int raster_block, int16_t *base) {
 134   const int stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
 135   return base + raster_block_offset(plane_bsize, raster_block, stride);
 136 }
 137
 138 static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
 139                            int m, int n, int min_plane, int max_plane) {
 140   int i;
 141
 142   for (i = min_plane; i < max_plane; ++i) {
 143     struct macroblock_plane *const p = &x->plane[i];
 144     struct macroblockd_plane *const pd = &x->e_mbd.plane[i];
 145
 146     p->coeff    = ctx->coeff_pbuf[i][m];
 147     p->qcoeff   = ctx->qcoeff_pbuf[i][m];
 148     pd->dqcoeff = ctx->dqcoeff_pbuf[i][m];
 149     p->eobs     = ctx->eobs_pbuf[i][m];
 150
 151     ctx->coeff_pbuf[i][m]   = ctx->coeff_pbuf[i][n];
 152     ctx->qcoeff_pbuf[i][m]  = ctx->qcoeff_pbuf[i][n];
 153     ctx->dqcoeff_pbuf[i][m] = ctx->dqcoeff_pbuf[i][n];
 154     ctx->eobs_pbuf[i][m]    = ctx->eobs_pbuf[i][n];
 155
 156     ctx->coeff_pbuf[i][n]   = p->coeff;
 157     ctx->qcoeff_pbuf[i][n]  = p->qcoeff;
 158     ctx->dqcoeff_pbuf[i][n] = pd->dqcoeff;
 159     ctx->eobs_pbuf[i][n]    = p->eobs;
 160   }
 161 }
 162
 163 static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize,
 164                             MACROBLOCK *x, MACROBLOCKD *xd,
 165                             int *out_rate_sum, int64_t *out_dist_sum) {
 166   // Note our transform coeffs are 8 times an orthogonal transform.
 167   // Hence quantizer step is also 8 times. To get effective quantizer
 168   // we need to divide by 8 before sending to modeling function.
 169   int i;
 170   int64_t rate_sum = 0;
 171   int64_t dist_sum = 0;
 172   const int ref = xd->mi[0]->mbmi.ref_frame[0];
 173   unsigned int sse;
 174
 175   for (i = 0; i < MAX_MB_PLANE; ++i) {
 176     struct macroblock_plane *const p = &x->plane[i];
 177     struct macroblockd_plane *const pd = &xd->plane[i];
 178     const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
 179
 180     (void) cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride,
 181                               pd->dst.buf, pd->dst.stride, &sse);
 182
 183     if (i == 0)
 184       x->pred_sse[ref] = sse;
 185
 186     // Fast approximate the modelling function.
 187     if (cpi->oxcf.speed > 4) {
 188       int64_t rate;
 189       int64_t dist;
 190       int64_t square_error = sse;
 191       int quantizer = (pd->dequant[1] >> 3);
 192
 193       if (quantizer < 120)
 194         rate = (square_error * (280 - quantizer)) >> 8;
 195       else
 196         rate = 0;
 197       dist = (square_error * quantizer) >> 8;
 198       rate_sum += rate;
 199       dist_sum += dist;
 200     } else {
 201       int rate;
 202       int64_t dist;
 203       vp9_model_rd_from_var_lapndz(sse, 1 << num_pels_log2_lookup[bs],
 204                                    pd->dequant[1] >> 3, &rate, &dist);
 205       rate_sum += rate;
 206       dist_sum += dist;
 207     }
 208   }
 209
 210   *out_rate_sum = (int)rate_sum;
 211   *out_dist_sum = dist_sum << 4;
 212 }
 213
 214 int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff,
 215                           intptr_t block_size, int64_t *ssz) {
 216   int i;
 217   int64_t error = 0, sqcoeff = 0;
 218
 219   for (i = 0; i < block_size; i++) {
 220     const int diff = coeff[i] - dqcoeff[i];
 221     error +=  diff * diff;
 222     sqcoeff += coeff[i] * coeff[i];
 223   }
 224
 225   *ssz = sqcoeff;
 226   return error;
 227 }
 228
/* Number of coefficients in each coefficient band, per transform size.
 *
 * The trailing '0' is a terminator which is used inside cost_coeffs() to
 * decide whether to include the cost of a trailing EOB node or not (i.e. we
 * can skip this if the last coefficient in this transform block, e.g. the
 * 16th coefficient in a 4x4 block or the 64th coefficient in an 8x8 block,
 * was non-zero).
 *
 * cost_coeffs() starts reading at index 1: the first band holds only the DC
 * coefficient, which it costs separately. */
static const int16_t band_counts[TX_SIZES][8] = {
  { 1, 2, 3, 4,  3,   16 - 13, 0 },
  { 1, 2, 3, 4, 11,   64 - 21, 0 },
  { 1, 2, 3, 4, 11,  256 - 21, 0 },
  { 1, 2, 3, 4, 11, 1024 - 21, 0 },
};
// Returns the token cost of the quantized coefficients of one transform
// block, looked up in x->token_costs.
//
//   x, plane, block  select the qcoeff buffer and the eob of the block
//   A, L             above / left entropy contexts; combined on entry to
//                    form the DC context and both overwritten on exit with
//                    1 if the block has any coefficient (eob > 0), else 0
//   tx_size          transform size (selects band layout and cost table)
//   scan, nb         scan order and its neighbor table
//   use_fast_coef_costing
//                    when non-zero, the context of each AC token is derived
//                    from whether the previous token was zero instead of
//                    calling get_coef_context()
static INLINE int cost_coeffs(MACROBLOCK *x,
                              int plane, int block,
                              ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L,
                              TX_SIZE tx_size,
                              const int16_t *scan, const int16_t *nb,
                              int use_fast_coef_costing) {
  MACROBLOCKD *const xd = &x->e_mbd;
  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
  const struct macroblock_plane *p = &x->plane[plane];
  const struct macroblockd_plane *pd = &xd->plane[plane];
  const PLANE_TYPE type = pd->plane_type;
  // Start at entry 1: the first band (DC only) is costed separately below.
  const int16_t *band_count = &band_counts[tx_size][1];
  const int eob = p->eobs[block];
  const int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
  // Points at the cost table of the current band; advanced with
  // ++token_costs each time a band is exhausted.
  unsigned int (*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
                   x->token_costs[tx_size][type][is_inter_block(mbmi)];
  uint8_t token_cache[32 * 32];
  int pt = combine_entropy_contexts(*A, *L);
  int c, cost;
  // Check for consistency of tx_size with mode info
  assert(type == PLANE_TYPE_Y ? mbmi->tx_size == tx_size
                              : get_uv_tx_size(mbmi, pd) == tx_size);

  if (eob == 0) {
    // single eob token
    cost = token_costs[0][0][pt][EOB_TOKEN];
    c = 0;
  } else {
    int band_left = *band_count++;

    // dc token
    int v = qcoeff[0];
    int prev_t = vp9_dct_value_tokens_ptr[v].token;
    cost = (*token_costs)[0][pt][prev_t] + vp9_dct_value_cost_ptr[v];
    token_cache[0] = vp9_pt_energy_class[prev_t];
    ++token_costs;

    // ac tokens
    for (c = 1; c < eob; c++) {
      const int rc = scan[c];
      int t;

      v = qcoeff[rc];
      t = vp9_dct_value_tokens_ptr[v].token;
      if (use_fast_coef_costing) {
        // Fast path: !prev_t serves both as the first table index and as
        // the context, so no neighbor lookup is needed.
        cost += (*token_costs)[!prev_t][!prev_t][t] + vp9_dct_value_cost_ptr[v];
      } else {
        pt = get_coef_context(nb, token_cache, c);
        cost += (*token_costs)[!prev_t][pt][t] + vp9_dct_value_cost_ptr[v];
        token_cache[rc] = vp9_pt_energy_class[t];
      }
      prev_t = t;
      // Move on to the next band (and its cost table) once this one is used
      // up; band_left becomes 0 when the terminator entry is reached.
      if (!--band_left) {
        band_left = *band_count++;
        ++token_costs;
      }
    }

    // eob token
    // Skipped when band_left is 0, i.e. the terminator of band_counts was
    // reached because the last position of the block was coded.
    if (band_left) {
      if (use_fast_coef_costing) {
        cost += (*token_costs)[0][!prev_t][EOB_TOKEN];
      } else {
        pt = get_coef_context(nb, token_cache, c);
        cost += (*token_costs)[0][pt][EOB_TOKEN];
      }
    }
  }

  // Record in both contexts whether any coefficient was coded (c equals eob
  // here, or 0 when the block is empty).
  *A = *L = (c > 0);

  return cost;
}
 314 static void dist_block(int plane, int block, TX_SIZE tx_size,
 315                        struct rdcost_block_args* args) {
 316   const int ss_txfrm_size = tx_size << 1;
 317   MACROBLOCK* const x = args->x;
 318   MACROBLOCKD* const xd = &x->e_mbd;
 319   const struct macroblock_plane *const p = &x->plane[plane];
 320   const struct macroblockd_plane *const pd = &xd->plane[plane];
 321   int64_t this_sse;
 322   int shift = tx_size == TX_32X32 ? 0 : 2;
 323   int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);
 324   int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
 325   args->dist = vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
 326                                &this_sse) >> shift;
 327   args->sse  = this_sse >> shift;
 328
 329   if (x->skip_encode && !is_inter_block(&xd->mi[0]->mbmi)) {
 330     // TODO(jingning): tune the model to better capture the distortion.
 331     int64_t p = (pd->dequant[1] * pd->dequant[1] *
 332                     (1 << ss_txfrm_size)) >> (shift + 2);
 333     args->dist += (p >> 4);
 334     args->sse  += p;
 335   }
 336 }
 337
 338 static void rate_block(int plane, int block, BLOCK_SIZE plane_bsize,
 339                        TX_SIZE tx_size, struct rdcost_block_args* args) {
 340   int x_idx, y_idx;
 341   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x_idx, &y_idx);
 342
 343   args->rate = cost_coeffs(args->x, plane, block, args->t_above + x_idx,
 344                            args->t_left + y_idx, tx_size,
 345                            args->so->scan, args->so->neighbors,
 346                            args->use_fast_coef_costing);
 347 }
 348
 349 static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
 350                           TX_SIZE tx_size, void *arg) {
 351   struct rdcost_block_args *args = arg;
 352   MACROBLOCK *const x = args->x;
 353   MACROBLOCKD *const xd = &x->e_mbd;
 354   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
 355   int64_t rd1, rd2, rd;
 356
 357   if (args->skip)
 358     return;
 359
 360   if (!is_inter_block(mbmi))
 361     vp9_encode_block_intra(x, plane, block, plane_bsize, tx_size, &mbmi->skip);
 362   else
 363     vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
 364
 365   dist_block(plane, block, tx_size, args);
 366   rate_block(plane, block, plane_bsize, tx_size, args);
 367   rd1 = RDCOST(x->rdmult, x->rddiv, args->rate, args->dist);
 368   rd2 = RDCOST(x->rdmult, x->rddiv, 0, args->sse);
 369
 370   // TODO(jingning): temporarily enabled only for luma component
 371   rd = MIN(rd1, rd2);
 372   if (plane == 0)
 373     x->zcoeff_blk[tx_size][block] = !x->plane[plane].eobs[block] ||
 374                                     (rd1 > rd2 && !xd->lossless);
 375
 376   args->this_rate += args->rate;
 377   args->this_dist += args->dist;
 378   args->this_sse  += args->sse;
 379   args->this_rd += rd;
 380
 381   if (args->this_rd > args->best_rd) {
 382     args->skip = 1;
 383     return;
 384   }
 385 }
 386
 387 static void txfm_rd_in_plane(MACROBLOCK *x,
 388                              int *rate, int64_t *distortion,
 389                              int *skippable, int64_t *sse,
 390                              int64_t ref_best_rd, int plane,
 391                              BLOCK_SIZE bsize, TX_SIZE tx_size,
 392                              int use_fast_coef_casting) {
 393   MACROBLOCKD *const xd = &x->e_mbd;
 394   const struct macroblockd_plane *const pd = &xd->plane[plane];
 395   struct rdcost_block_args args;
 396   vp9_zero(args);
 397   args.x = x;
 398   args.best_rd = ref_best_rd;
 399   args.use_fast_coef_costing = use_fast_coef_casting;
 400
 401   if (plane == 0)
 402     xd->mi[0]->mbmi.tx_size = tx_size;
 403
 404   vp9_get_entropy_contexts(bsize, tx_size, pd, args.t_above, args.t_left);
 405
 406   args.so = get_scan(xd, tx_size, pd->plane_type, 0);
 407
 408   vp9_foreach_transformed_block_in_plane(xd, bsize, plane,
 409                                          block_rd_txfm, &args);
 410   if (args.skip) {
 411     *rate       = INT_MAX;
 412     *distortion = INT64_MAX;
 413     *sse        = INT64_MAX;
 414     *skippable  = 0;
 415   } else {
 416     *distortion = args.this_dist;
 417     *rate       = args.this_rate;
 418     *sse        = args.this_sse;
 419     *skippable  = vp9_is_skippable_in_plane(x, bsize, plane);
 420   }
 421 }
 422
 423 static void choose_largest_tx_size(VP9_COMP *cpi, MACROBLOCK *x,
 424                                    int *rate, int64_t *distortion,
 425                                    int *skip, int64_t *sse,
 426                                    int64_t ref_best_rd,
 427                                    BLOCK_SIZE bs) {
 428   const TX_SIZE max_tx_size = max_txsize_lookup[bs];
 429   VP9_COMMON *const cm = &cpi->common;
 430   const TX_SIZE largest_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode];
 431   MACROBLOCKD *const xd = &x->e_mbd;
 432   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
 433
 434   mbmi->tx_size = MIN(max_tx_size, largest_tx_size);
 435
 436   txfm_rd_in_plane(x, rate, distortion, skip,
 437                    &sse[mbmi->tx_size], ref_best_rd, 0, bs,
 438                    mbmi->tx_size, cpi->sf.use_fast_coef_costing);
 439   cpi->tx_stepdown_count[0]++;
 440 }
 441
// Picks the luma transform size from per-size RD results that the caller
// has already computed.
//
//   r[n][0]     in: coefficient rate for transform size n (INT_MAX if none)
//   r[n][1]     out: r[n][0] plus the cost of signalling transform size n
//   d[n], s[n]  in: distortion (INT64_MAX if none) and skippable flag
//   rate, distortion, skip
//               out: values of the transform size finally stored in
//               mbmi->tx_size
//   tx_cache    out: RD cost for each TX_MODE
//   bs          block size; bounds the transform sizes considered
//
// Also increments one entry of cpi->tx_stepdown_count.
static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
                                   int (*r)[2], int *rate,
                                   int64_t *d, int64_t *distortion,
                                   int *s, int *skip,
                                   int64_t tx_cache[TX_MODES],
                                   BLOCK_SIZE bs) {
  const TX_SIZE max_tx_size = max_txsize_lookup[bs];
  VP9_COMMON *const cm = &cpi->common;
  MACROBLOCKD *const xd = &x->e_mbd;
  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
  vp9_prob skip_prob = vp9_get_skip_prob(cm, xd);
  // rd[n][0]: RD cost without the tx-size signalling bits; rd[n][1]: with.
  int64_t rd[TX_SIZES][2] = {{INT64_MAX, INT64_MAX},
                             {INT64_MAX, INT64_MAX},
                             {INT64_MAX, INT64_MAX},
                             {INT64_MAX, INT64_MAX}};
  TX_SIZE n, m;
  int s0, s1;
  const TX_SIZE max_mode_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode];
  int64_t best_rd = INT64_MAX;
  TX_SIZE best_tx = TX_4X4;

  const vp9_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc.tx_probs);
  assert(skip_prob > 0);
  // Cost of coding the skip flag as 0 and as 1.
  s0 = vp9_cost_bit(skip_prob, 0);
  s1 = vp9_cost_bit(skip_prob, 1);

  for (n = TX_4X4; n <= max_tx_size; n++) {
    r[n][1] = r[n][0];
    if (r[n][0] < INT_MAX) {
      // Signalling cost of size n: a "one" for every smaller size and a
      // closing "zero", which is omitted when n is the largest size allowed.
      for (m = 0; m <= n - (n == max_tx_size); m++) {
        if (m == n)
          r[n][1] += vp9_cost_zero(tx_probs[m]);
        else
          r[n][1] += vp9_cost_one(tx_probs[m]);
      }
    }
    if (d[n] == INT64_MAX) {
      rd[n][0] = rd[n][1] = INT64_MAX;
    } else if (s[n]) {
      // Skippable: only the skip flag is charged as rate.
      rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]);
    } else {
      rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]);
      rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]);
    }

    if (rd[n][1] < best_rd) {
      best_tx = n;
      best_rd = rd[n][1];
    }
  }
  // The RD winner is only used under TX_MODE_SELECT; any other tx_mode
  // forces the largest size allowed by both the block and the mode.
  mbmi->tx_size = cm->tx_mode == TX_MODE_SELECT ?
                      best_tx : MIN(max_tx_size, max_mode_tx_size);


  *distortion = d[mbmi->tx_size];
  *rate       = r[mbmi->tx_size][cm->tx_mode == TX_MODE_SELECT];
  *skip       = s[mbmi->tx_size];

  // RD cost under each fixed tx_mode, clamped to the sizes this block
  // supports.
  tx_cache[ONLY_4X4] = rd[TX_4X4][0];
  tx_cache[ALLOW_8X8] = rd[TX_8X8][0];
  tx_cache[ALLOW_16X16] = rd[MIN(max_tx_size, TX_16X16)][0];
  tx_cache[ALLOW_32X32] = rd[MIN(max_tx_size, TX_32X32)][0];

  // RD cost under TX_MODE_SELECT; the counter index is the number of steps
  // between the largest possible size and the one used here.
  if (max_tx_size == TX_32X32 && best_tx == TX_32X32) {
    tx_cache[TX_MODE_SELECT] = rd[TX_32X32][1];
    cpi->tx_stepdown_count[0]++;
  } else if (max_tx_size >= TX_16X16 && best_tx == TX_16X16) {
    tx_cache[TX_MODE_SELECT] = rd[TX_16X16][1];
    cpi->tx_stepdown_count[max_tx_size - TX_16X16]++;
  } else if (rd[TX_8X8][1] < rd[TX_4X4][1]) {
    tx_cache[TX_MODE_SELECT] = rd[TX_8X8][1];
    cpi->tx_stepdown_count[max_tx_size - TX_8X8]++;
  } else {
    tx_cache[TX_MODE_SELECT] = rd[TX_4X4][1];
    cpi->tx_stepdown_count[max_tx_size - TX_4X4]++;
  }
}
 519
 520 static void inter_super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
 521                                   int64_t *distortion, int *skip,
 522                                   int64_t *psse, BLOCK_SIZE bs,
 523                                   int64_t txfm_cache[TX_MODES],
 524                                   int64_t ref_best_rd) {
 525   int r[TX_SIZES][2], s[TX_SIZES];
 526   int64_t d[TX_SIZES], sse[TX_SIZES];
 527   MACROBLOCKD *xd = &x->e_mbd;
 528   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
 529   const TX_SIZE max_tx_size = max_txsize_lookup[bs];
 530   TX_SIZE tx_size;
 531
 532   assert(bs == mbmi->sb_type);
 533
 534   vp9_subtract_plane(x, bs, 0);
 535
 536   if (cpi->sf.tx_size_search_method == USE_LARGESTALL || xd->lossless) {
 537     vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t));
 538     choose_largest_tx_size(cpi, x, rate, distortion, skip, sse, ref_best_rd,
 539                            bs);
 540     if (psse)
 541       *psse = sse[mbmi->tx_size];
 542     return;
 543   }
 544
 545   for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
 546     txfm_rd_in_plane(x, &r[tx_size][0], &d[tx_size], &s[tx_size],
 547                      &sse[tx_size], ref_best_rd, 0, bs, tx_size,
 548                      cpi->sf.use_fast_coef_costing);
 549   choose_tx_size_from_rd(cpi, x, r, rate, d, distortion, s,
 550                          skip, txfm_cache, bs);
 551
 552   if (psse)
 553     *psse = sse[mbmi->tx_size];
 554 }
 555
 556 static void intra_super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
 557                                   int64_t *distortion, int *skip,
 558                                   int64_t *psse, BLOCK_SIZE bs,
 559                                   int64_t txfm_cache[TX_MODES],
 560                                   int64_t ref_best_rd) {
 561   int64_t sse[TX_SIZES];
 562   MACROBLOCKD *xd = &x->e_mbd;
 563   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
 564
 565   assert(bs == mbmi->sb_type);
 566   if (cpi->sf.tx_size_search_method != USE_FULL_RD || xd->lossless) {
 567     vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t));
 568     choose_largest_tx_size(cpi, x, rate, distortion, skip, sse, ref_best_rd,
 569                            bs);
 570   } else {
 571     int r[TX_SIZES][2], s[TX_SIZES];
 572     int64_t d[TX_SIZES];
 573     TX_SIZE tx_size;
 574     for (tx_size = TX_4X4; tx_size <= max_txsize_lookup[bs]; ++tx_size)
 575       txfm_rd_in_plane(x, &r[tx_size][0], &d[tx_size],
 576                        &s[tx_size], &sse[tx_size],
 577                        ref_best_rd, 0, bs, tx_size,
 578                        cpi->sf.use_fast_coef_costing);
 579     choose_tx_size_from_rd(cpi, x, r, rate, d, distortion, s, skip, txfm_cache,
 580                            bs);
 581   }
 582   if (psse)
 583     *psse = sse[mbmi->tx_size];
 584 }
 585
 586
 587 static int conditional_skipintra(PREDICTION_MODE mode,
 588                                  PREDICTION_MODE best_intra_mode) {
 589   if (mode == D117_PRED &&
 590       best_intra_mode != V_PRED &&
 591       best_intra_mode != D135_PRED)
 592     return 1;
 593   if (mode == D63_PRED &&
 594       best_intra_mode != V_PRED &&
 595       best_intra_mode != D45_PRED)
 596     return 1;
 597   if (mode == D207_PRED &&
 598       best_intra_mode != H_PRED &&
 599       best_intra_mode != D45_PRED)
 600     return 1;
 601   if (mode == D153_PRED &&
 602       best_intra_mode != H_PRED &&
 603       best_intra_mode != D135_PRED)
 604     return 1;
 605   return 0;
 606 }
 607
// Searches the intra modes DC_PRED..TM_PRED for one sub-block of an 8x8
// luma block and returns the best RD cost found.
//
//   ib              raster index (0..3) of the sub-block's first 4x4 block
//   best_mode       in/out: best mode so far (read by the directional-mode
//                   pruning), overwritten when a better mode is found
//   bmode_costs     rate of signalling each mode
//   a, l            in/out: above / left entropy contexts; updated to the
//                   contexts left behind by the winning mode
//   bestrate, bestratey, bestdistortion
//                   out: total rate, coefficient-only rate and distortion
//                   of the winning mode (written only if one is found)
//   bsize           sub-block shape; sets how many 4x4 blocks are covered
//   rd_thresh       a mode must beat this RD cost to be accepted
//
// The search starts with best_rd == rd_thresh, so a return value
// >= rd_thresh means no mode was accepted. Otherwise, unless
// x->skip_encode is set, the winning reconstruction is copied back into
// the destination buffer before returning.
static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
                                     PREDICTION_MODE *best_mode,
                                     const int *bmode_costs,
                                     ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
                                     int *bestrate, int *bestratey,
                                     int64_t *bestdistortion,
                                     BLOCK_SIZE bsize, int64_t rd_thresh) {
  PREDICTION_MODE mode;
  MACROBLOCKD *const xd = &x->e_mbd;
  int64_t best_rd = rd_thresh;

  struct macroblock_plane *p = &x->plane[0];
  struct macroblockd_plane *pd = &xd->plane[0];
  const int src_stride = p->src.stride;
  const int dst_stride = pd->dst.stride;
  const uint8_t *src_init = &p->src.buf[raster_block_offset(BLOCK_8X8, ib,
                                                            src_stride)];
  uint8_t *dst_init = &pd->dst.buf[raster_block_offset(BLOCK_8X8, ib,
                                                       dst_stride)];
  // ta/tl: contexts on entry; tempa/templ: scratch copies for each mode.
  ENTROPY_CONTEXT ta[2], tempa[2];
  ENTROPY_CONTEXT tl[2], templ[2];

  const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
  const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
  int idx, idy;
  // Reconstruction of the best mode so far, stored with a stride of 8.
  uint8_t best_dst[8 * 8];

  assert(ib < 4);

  vpx_memcpy(ta, a, sizeof(ta));
  vpx_memcpy(tl, l, sizeof(tl));
  xd->mi[0]->mbmi.tx_size = TX_4X4;

  for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
    int64_t this_rd;
    int ratey = 0;
    int64_t distortion = 0;
    int rate = bmode_costs[mode];

    // Mode disabled by the speed-feature mask for 4x4 transforms.
    if (!(cpi->sf.intra_y_mode_mask[TX_4X4] & (1 << mode)))
      continue;

    // Only do the oblique modes if the best so far is
    // one of the neighboring directional modes
    if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
      if (conditional_skipintra(mode, *best_mode))
          continue;
    }

    // Every mode starts from the entry contexts.
    vpx_memcpy(tempa, ta, sizeof(ta));
    vpx_memcpy(templ, tl, sizeof(tl));

    for (idy = 0; idy < num_4x4_blocks_high; ++idy) {
      for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {
        const int block = ib + idy * 2 + idx;
        const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];
        uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
        int16_t *const src_diff = raster_block_offset_int16(BLOCK_8X8, block,
                                                            p->src_diff);
        int16_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
        xd->mi[0]->bmi[block].as_mode = mode;
        // With skip_encode set, prediction reads its reference samples from
        // the source buffer instead of the reconstruction.
        vp9_predict_intra_block(xd, block, 1,
                                TX_4X4, mode,
                                x->skip_encode ? src : dst,
                                x->skip_encode ? src_stride : dst_stride,
                                dst, dst_stride, idx, idy, 0);
        vp9_subtract_block(4, 4, src_diff, 8, src, src_stride, dst, dst_stride);

        if (xd->lossless) {
          // Lossless path: Walsh-Hadamard transform; no distortion term is
          // accumulated on this path.
          const scan_order *so = &vp9_default_scan_orders[TX_4X4];
          vp9_fwht4x4(src_diff, coeff, 8);
          vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
          ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
                               so->scan, so->neighbors,
                               cpi->sf.use_fast_coef_costing);
          // Abandon the mode as soon as it can no longer beat the best.
          if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
            goto next;
          vp9_iwht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block), dst, dst_stride,
                          p->eobs[block]);
        } else {
          int64_t unused;
          const TX_TYPE tx_type = get_tx_type_4x4(PLANE_TYPE_Y, xd, block);
          const scan_order *so = &vp9_scan_orders[TX_4X4][tx_type];
          vp9_fht4x4(src_diff, coeff, 8, tx_type);
          vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
          ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
                             so->scan, so->neighbors,
                             cpi->sf.use_fast_coef_costing);
          distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block),
                                        16, &unused) >> 2;
          // Abandon the mode as soon as it can no longer beat the best.
          if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
            goto next;
          // Reconstruct so that later 4x4 blocks can predict from this one.
          vp9_iht4x4_add(tx_type, BLOCK_OFFSET(pd->dqcoeff, block),
                         dst, dst_stride, p->eobs[block]);
        }
      }
    }

    rate += ratey;
    this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);

    if (this_rd < best_rd) {
      *bestrate = rate;
      *bestratey = ratey;
      *bestdistortion = distortion;
      best_rd = this_rd;
      *best_mode = mode;
      vpx_memcpy(a, tempa, sizeof(tempa));
      vpx_memcpy(l, templ, sizeof(templ));
      // Save this reconstruction; later modes overwrite the destination.
      for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy)
        vpx_memcpy(best_dst + idy * 8, dst_init + idy * dst_stride,
                   num_4x4_blocks_wide * 4);
    }
  next:
    {}
  }

  // Nothing to restore if no mode was accepted or in skip_encode mode.
  if (best_rd >= rd_thresh || x->skip_encode)
    return best_rd;

  // Put the winning reconstruction back into the destination buffer.
  for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy)
    vpx_memcpy(dst_init + idy * dst_stride, best_dst + idy * 8,
               num_4x4_blocks_wide * 4);

  return best_rd;
}
 734
 735 static int64_t rd_pick_intra_sub_8x8_y_mode(VP9_COMP *cpi, MACROBLOCK *mb,
 736                                             int *rate, int *rate_y,
 737                                             int64_t *distortion,
 738                                             int64_t best_rd) {
 739   int i, j;
 740   const MACROBLOCKD *const xd = &mb->e_mbd;
 741   MODE_INFO *const mic = xd->mi[0];
 742   const MODE_INFO *above_mi = xd->mi[-xd->mi_stride];
 743   const MODE_INFO *left_mi = xd->left_available ? xd->mi[-1] : NULL;
 744   const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
 745   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
 746   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
 747   int idx, idy;
 748   int cost = 0;
 749   int64_t total_distortion = 0;
 750   int tot_rate_y = 0;
 751   int64_t total_rd = 0;
 752   ENTROPY_CONTEXT t_above[4], t_left[4];
 753   const int *bmode_costs = cpi->mbmode_cost;
 754
 755   vpx_memcpy(t_above, xd->plane[0].above_context, sizeof(t_above));
 756   vpx_memcpy(t_left, xd->plane[0].left_context, sizeof(t_left));
 757
 758   // Pick modes for each sub-block (of size 4x4, 4x8, or 8x4) in an 8x8 block.
 759   for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
 760     for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
 761       PREDICTION_MODE best_mode = DC_PRED;
 762       int r = INT_MAX, ry = INT_MAX;
 763       int64_t d = INT64_MAX, this_rd = INT64_MAX;
 764       i = idy * 2 + idx;
 765       if (cpi->common.frame_type == KEY_FRAME) {
 766         const PREDICTION_MODE A = vp9_above_block_mode(mic, above_mi, i);
 767         const PREDICTION_MODE L = vp9_left_block_mode(mic, left_mi, i);
 768
 769         bmode_costs  = cpi->y_mode_costs[A][L];
 770       }
 771
 772       this_rd = rd_pick_intra4x4block(cpi, mb, i, &best_mode, bmode_costs,
 773                                       t_above + idx, t_left + idy, &r, &ry, &d,
 774                                       bsize, best_rd - total_rd);
 775       if (this_rd >= best_rd - total_rd)
 776         return INT64_MAX;
 777
 778       total_rd += this_rd;
 779       cost += r;
 780       total_distortion += d;
 781       tot_rate_y += ry;
 782
 783       mic->bmi[i].as_mode = best_mode;
 784       for (j = 1; j < num_4x4_blocks_high; ++j)
 785         mic->bmi[i + j * 2].as_mode = best_mode;
 786       for (j = 1; j < num_4x4_blocks_wide; ++j)
 787         mic->bmi[i + j].as_mode = best_mode;
 788
 789       if (total_rd >= best_rd)
 790         return INT64_MAX;
 791     }
 792   }
 793
 794   *rate = cost;
 795   *rate_y = tot_rate_y;
 796   *distortion = total_distortion;
 797   mic->mbmi.mode = mic->bmi[3].as_mode;
 798
 799   return RDCOST(mb->rdmult, mb->rddiv, cost, total_distortion);
 800 }
 801
 802 static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
 803                                       int *rate, int *rate_tokenonly,
 804                                       int64_t *distortion, int *skippable,
 805                                       BLOCK_SIZE bsize,
 806                                       int64_t tx_cache[TX_MODES],
 807                                       int64_t best_rd) {
 808   PREDICTION_MODE mode;
 809   PREDICTION_MODE mode_selected = DC_PRED;
 810   MACROBLOCKD *const xd = &x->e_mbd;
 811   MODE_INFO *const mic = xd->mi[0];
 812   int this_rate, this_rate_tokenonly, s;
 813   int64_t this_distortion, this_rd;
 814   TX_SIZE best_tx = TX_4X4;
 815   int i;
 816   int *bmode_costs = cpi->mbmode_cost;
 817
 818   if (cpi->sf.tx_size_search_method == USE_FULL_RD)
 819     for (i = 0; i < TX_MODES; i++)
 820       tx_cache[i] = INT64_MAX;
 821
 822   /* Y Search for intra prediction mode */
 823   for (mode = DC_PRED; mode <= TM_PRED; mode++) {
 824     int64_t local_tx_cache[TX_MODES];
 825     MODE_INFO *above_mi = xd->mi[-xd->mi_stride];
 826     MODE_INFO *left_mi = xd->left_available ? xd->mi[-1] : NULL;
 827
 828     if (cpi->common.frame_type == KEY_FRAME) {
 829       const PREDICTION_MODE A = vp9_above_block_mode(mic, above_mi, 0);
 830       const PREDICTION_MODE L = vp9_left_block_mode(mic, left_mi, 0);
 831
 832       bmode_costs = cpi->y_mode_costs[A][L];
 833     }
 834     mic->mbmi.mode = mode;
 835
 836     intra_super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion,
 837         &s, NULL, bsize, local_tx_cache, best_rd);
 838
 839     if (this_rate_tokenonly == INT_MAX)
 840       continue;
 841
 842     this_rate = this_rate_tokenonly + bmode_costs[mode];
 843     this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
 844
 845     if (this_rd < best_rd) {
 846       mode_selected   = mode;
 847       best_rd         = this_rd;
 848       best_tx         = mic->mbmi.tx_size;
 849       *rate           = this_rate;
 850       *rate_tokenonly = this_rate_tokenonly;
 851       *distortion     = this_distortion;
 852       *skippable      = s;
 853     }
 854
 855     if (cpi->sf.tx_size_search_method == USE_FULL_RD && this_rd < INT64_MAX) {
 856       for (i = 0; i < TX_MODES && local_tx_cache[i] < INT64_MAX; i++) {
 857         const int64_t adj_rd = this_rd + local_tx_cache[i] -
 858             local_tx_cache[cpi->common.tx_mode];
 859         if (adj_rd < tx_cache[i]) {
 860           tx_cache[i] = adj_rd;
 861         }
 862       }
 863     }
 864   }
 865
 866   mic->mbmi.mode = mode_selected;
 867   mic->mbmi.tx_size = best_tx;
 868
 869   return best_rd;
 870 }
 871
 872 static void super_block_uvrd(const VP9_COMP *cpi, MACROBLOCK *x,
 873                              int *rate, int64_t *distortion, int *skippable,
 874                              int64_t *sse, BLOCK_SIZE bsize,
 875                              int64_t ref_best_rd) {
 876   MACROBLOCKD *const xd = &x->e_mbd;
 877   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
 878   const TX_SIZE uv_tx_size = get_uv_tx_size(mbmi, &xd->plane[1]);
 879   int plane;
 880   int pnrate = 0, pnskip = 1;
 881   int64_t pndist = 0, pnsse = 0;
 882
 883   if (ref_best_rd < 0)
 884     goto term;
 885
 886   if (is_inter_block(mbmi)) {
 887     int plane;
 888     for (plane = 1; plane < MAX_MB_PLANE; ++plane)
 889       vp9_subtract_plane(x, bsize, plane);
 890   }
 891
 892   *rate = 0;
 893   *distortion = 0;
 894   *sse = 0;
 895   *skippable = 1;
 896
 897   for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
 898     txfm_rd_in_plane(x, &pnrate, &pndist, &pnskip, &pnsse,
 899                      ref_best_rd, plane, bsize, uv_tx_size,
 900                      cpi->sf.use_fast_coef_costing);
 901     if (pnrate == INT_MAX)
 902       goto term;
 903     *rate += pnrate;
 904     *distortion += pndist;
 905     *sse += pnsse;
 906     *skippable &= pnskip;
 907   }
 908   return;
 909
 910   term:
 911   *rate = INT_MAX;
 912   *distortion = INT64_MAX;
 913   *sse = INT64_MAX;
 914   *skippable = 0;
 915   return;
 916 }
 917
 918 static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
 919                                        PICK_MODE_CONTEXT *ctx,
 920                                        int *rate, int *rate_tokenonly,
 921                                        int64_t *distortion, int *skippable,
 922                                        BLOCK_SIZE bsize, TX_SIZE max_tx_size) {
 923   MACROBLOCKD *xd = &x->e_mbd;
 924   PREDICTION_MODE mode;
 925   PREDICTION_MODE mode_selected = DC_PRED;
 926   int64_t best_rd = INT64_MAX, this_rd;
 927   int this_rate_tokenonly, this_rate, s;
 928   int64_t this_distortion, this_sse;
 929
 930   for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
 931     if (!(cpi->sf.intra_uv_mode_mask[max_tx_size] & (1 << mode)))
 932       continue;
 933
 934     xd->mi[0]->mbmi.uv_mode = mode;
 935
 936     super_block_uvrd(cpi, x, &this_rate_tokenonly,
 937                      &this_distortion, &s, &this_sse, bsize, best_rd);
 938     if (this_rate_tokenonly == INT_MAX)
 939       continue;
 940     this_rate = this_rate_tokenonly +
 941                 cpi->intra_uv_mode_cost[cpi->common.frame_type][mode];
 942     this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
 943
 944     if (this_rd < best_rd) {
 945       mode_selected   = mode;
 946       best_rd         = this_rd;
 947       *rate           = this_rate;
 948       *rate_tokenonly = this_rate_tokenonly;
 949       *distortion     = this_distortion;
 950       *skippable      = s;
 951       if (!x->select_tx_size)
 952         swap_block_ptr(x, ctx, 2, 0, 1, MAX_MB_PLANE);
 953     }
 954   }
 955
 956   xd->mi[0]->mbmi.uv_mode = mode_selected;
 957   return best_rd;
 958 }
 959
 960 static int64_t rd_sbuv_dcpred(const VP9_COMP *cpi, MACROBLOCK *x,
 961                               int *rate, int *rate_tokenonly,
 962                               int64_t *distortion, int *skippable,
 963                               BLOCK_SIZE bsize) {
 964   const VP9_COMMON *cm = &cpi->common;
 965   int64_t unused;
 966
 967   x->e_mbd.mi[0]->mbmi.uv_mode = DC_PRED;
 968   super_block_uvrd(cpi, x, rate_tokenonly, distortion,
 969                    skippable, &unused, bsize, INT64_MAX);
 970   *rate = *rate_tokenonly + cpi->intra_uv_mode_cost[cm->frame_type][DC_PRED];
 971   return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
 972 }
 973
 974 static void choose_intra_uv_mode(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
 975                                  BLOCK_SIZE bsize, TX_SIZE max_tx_size,
 976                                  int *rate_uv, int *rate_uv_tokenonly,
 977                                  int64_t *dist_uv, int *skip_uv,
 978                                  PREDICTION_MODE *mode_uv) {
 979   MACROBLOCK *const x = &cpi->mb;
 980
 981   // Use an estimated rd for uv_intra based on DC_PRED if the
 982   // appropriate speed flag is set.
 983   if (cpi->sf.use_uv_intra_rd_estimate) {
 984     rd_sbuv_dcpred(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv,
 985                    skip_uv, bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize);
 986   // Else do a proper rd search for each possible transform size that may
 987   // be considered in the main rd loop.
 988   } else {
 989     rd_pick_intra_sbuv_mode(cpi, x, ctx,
 990                             rate_uv, rate_uv_tokenonly, dist_uv, skip_uv,
 991                             bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize, max_tx_size);
 992   }
 993   *mode_uv = x->e_mbd.mi[0]->mbmi.uv_mode;
 994 }
 995
 996 static int cost_mv_ref(const VP9_COMP *cpi, PREDICTION_MODE mode,
 997                        int mode_context) {
 998   assert(is_inter_mode(mode));
 999   return cpi->inter_mode_cost[mode_context][INTER_OFFSET(mode)];
1000 }
1001
// Forward declaration so that joint_motion_search() can be called by
// rd_pick_best_sub8x8_mode() (compound-reference NEWMV candidates) before its
// definition. The definition is not in this part of the file; being static it
// must live elsewhere in this translation unit.
1002 static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
1003                                 BLOCK_SIZE bsize,
1004                                 int_mv *frame_mv,
1005                                 int mi_row, int mi_col,
1006                                 int_mv single_newmv[MAX_REF_FRAMES],
1007                                 int *rate_mv);
1008
1009 static int set_and_cost_bmi_mvs(VP9_COMP *cpi, MACROBLOCKD *xd, int i,
1010                                 PREDICTION_MODE mode, int_mv this_mv[2],
1011                                 int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
1012                                 int_mv seg_mvs[MAX_REF_FRAMES],
1013                                 int_mv *best_ref_mv[2], const int *mvjcost,
1014                                 int *mvcost[2]) {
1015   MODE_INFO *const mic = xd->mi[0];
1016   const MB_MODE_INFO *const mbmi = &mic->mbmi;
1017   int thismvcost = 0;
1018   int idx, idy;
1019   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type];
1020   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type];
1021   const int is_compound = has_second_ref(mbmi);
1022
1023   switch (mode) {
1024     case NEWMV:
1025       this_mv[0].as_int = seg_mvs[mbmi->ref_frame[0]].as_int;
1026       thismvcost += vp9_mv_bit_cost(&this_mv[0].as_mv, &best_ref_mv[0]->as_mv,
1027                                     mvjcost, mvcost, MV_COST_WEIGHT_SUB);
1028       if (is_compound) {
1029         this_mv[1].as_int = seg_mvs[mbmi->ref_frame[1]].as_int;
1030         thismvcost += vp9_mv_bit_cost(&this_mv[1].as_mv, &best_ref_mv[1]->as_mv,
1031                                       mvjcost, mvcost, MV_COST_WEIGHT_SUB);
1032       }
1033       break;
1034     case NEARMV:
1035     case NEARESTMV:
1036       this_mv[0].as_int = frame_mv[mode][mbmi->ref_frame[0]].as_int;
1037       if (is_compound)
1038         this_mv[1].as_int = frame_mv[mode][mbmi->ref_frame[1]].as_int;
1039       break;
1040     case ZEROMV:
1041       this_mv[0].as_int = 0;
1042       if (is_compound)
1043         this_mv[1].as_int = 0;
1044       break;
1045     default:
1046       break;
1047   }
1048
1049   mic->bmi[i].as_mv[0].as_int = this_mv[0].as_int;
1050   if (is_compound)
1051     mic->bmi[i].as_mv[1].as_int = this_mv[1].as_int;
1052
1053   mic->bmi[i].as_mode = mode;
1054
1055   for (idy = 0; idy < num_4x4_blocks_high; ++idy)
1056     for (idx = 0; idx < num_4x4_blocks_wide; ++idx)
1057       vpx_memcpy(&mic->bmi[i + idy * 2 + idx],
1058                  &mic->bmi[i], sizeof(mic->bmi[i]));
1059
1060   return cost_mv_ref(cpi, mode, mbmi->mode_context[mbmi->ref_frame[0]]) +
1061             thismvcost;
1062 }
1063
1064 static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
1065                                        MACROBLOCK *x,
1066                                        int64_t best_yrd,
1067                                        int i,
1068                                        int *labelyrate,
1069                                        int64_t *distortion, int64_t *sse,
1070                                        ENTROPY_CONTEXT *ta,
1071                                        ENTROPY_CONTEXT *tl,
1072                                        int mi_row, int mi_col) {
1073   int k;
1074   MACROBLOCKD *xd = &x->e_mbd;
1075   struct macroblockd_plane *const pd = &xd->plane[0];
1076   struct macroblock_plane *const p = &x->plane[0];
1077   MODE_INFO *const mi = xd->mi[0];
1078   const BLOCK_SIZE plane_bsize = get_plane_block_size(mi->mbmi.sb_type, pd);
1079   const int width = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
1080   const int height = 4 * num_4x4_blocks_high_lookup[plane_bsize];
1081   int idx, idy;
1082
1083   const uint8_t *const src = &p->src.buf[raster_block_offset(BLOCK_8X8, i,
1084                                                              p->src.stride)];
1085   uint8_t *const dst = &pd->dst.buf[raster_block_offset(BLOCK_8X8, i,
1086                                                         pd->dst.stride)];
1087   int64_t thisdistortion = 0, thissse = 0;
1088   int thisrate = 0, ref;
1089   const scan_order *so = &vp9_default_scan_orders[TX_4X4];
1090   const int is_compound = has_second_ref(&mi->mbmi);
1091   const InterpKernel *kernel = vp9_get_interp_kernel(mi->mbmi.interp_filter);
1092
1093   for (ref = 0; ref < 1 + is_compound; ++ref) {
1094     const uint8_t *pre = &pd->pre[ref].buf[raster_block_offset(BLOCK_8X8, i,
1095                                                pd->pre[ref].stride)];
1096     vp9_build_inter_predictor(pre, pd->pre[ref].stride,
1097                               dst, pd->dst.stride,
1098                               &mi->bmi[i].as_mv[ref].as_mv,
1099                               &xd->block_refs[ref]->sf, width, height, ref,
1100                               kernel, MV_PRECISION_Q3,
1101                               mi_col * MI_SIZE + 4 * (i % 2),
1102                               mi_row * MI_SIZE + 4 * (i / 2));
1103   }
1104
1105   vp9_subtract_block(height, width,
1106                      raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8,
1107                      src, p->src.stride,
1108                      dst, pd->dst.stride);
1109
1110   k = i;
1111   for (idy = 0; idy < height / 4; ++idy) {
1112     for (idx = 0; idx < width / 4; ++idx) {
1113       int64_t ssz, rd, rd1, rd2;
1114       int16_t* coeff;
1115
1116       k += (idy * 2 + idx);
1117       coeff = BLOCK_OFFSET(p->coeff, k);
1118       x->fwd_txm4x4(raster_block_offset_int16(BLOCK_8X8, k, p->src_diff),
1119                     coeff, 8);
1120       vp9_regular_quantize_b_4x4(x, 0, k, so->scan, so->iscan);
1121       thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k),
1122                                         16, &ssz);
1123       thissse += ssz;
1124       thisrate += cost_coeffs(x, 0, k, ta + (k & 1), tl + (k >> 1), TX_4X4,
1125                               so->scan, so->neighbors,
1126                               cpi->sf.use_fast_coef_costing);
1127       rd1 = RDCOST(x->rdmult, x->rddiv, thisrate, thisdistortion >> 2);
1128       rd2 = RDCOST(x->rdmult, x->rddiv, 0, thissse >> 2);
1129       rd = MIN(rd1, rd2);
1130       if (rd >= best_yrd)
1131         return INT64_MAX;
1132     }
1133   }
1134
1135   *distortion = thisdistortion >> 2;
1136   *labelyrate = thisrate;
1137   *sse = thissse >> 2;
1138
1139   return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion);
1140 }
1141
// Rate-distortion statistics recorded for one 4x4 position of an 8x8 block
// under one inter prediction mode (element type of BEST_SEG_INFO::rdstat).
1142 typedef struct {
1143   int eobs;  // copied from x->plane[0].eobs[] after the sub-block is coded
1144   int brate;  // mode + mv rate; byrate is added once coding succeeded
1145   int byrate;  // plane-0 coefficient rate from encode_inter_mb_segment()
1146   int64_t bdist;  // distortion from encode_inter_mb_segment()
1147   int64_t bsse;  // SSE from encode_inter_mb_segment()
1148   int64_t brdcost;  // RD cost; INT64_MAX while the mode has no valid result
1149   int_mv mvs[2];  // motion vector per reference used for this mode
1150   ENTROPY_CONTEXT ta[2];  // above entropy context passed to the coder
1151   ENTROPY_CONTEXT tl[2];  // left entropy context passed to the coder
1152 } SEG_RDSTAT;
1153
// Working state of the sub-8x8 inter mode search
// (rd_pick_best_sub8x8_mode()); one instance is kept per interpolation
// filter index.
1154 typedef struct {
1155   int_mv *ref_mv[2];  // best reference mv for the first / second reference
1156   int_mv mvp;  // mv predictor for the motion search; starts as ref_mv[0]
1157
1158   int64_t segment_rd;  // RD budget on entry; INT64_MAX if the search failed
// NOTE(review): r, d, sse and segment_yrate are not assigned in the part of
// the file visible here; presumably totals for the chosen segmentation -
// confirm against the rest of rd_pick_best_sub8x8_mode().
1159   int r;
1160   int64_t d;
1161   int64_t sse;
1162   int segment_yrate;
1163   PREDICTION_MODE modes[4];  // per-4x4 mode, initialized to ZEROMV
1164   SEG_RDSTAT rdstat[4][INTER_MODES];  // stats per 4x4 index and inter mode
1165   int mvthresh;  // caller threshold; divided by 4 to gate NEWMV searches
1166 } BEST_SEG_INFO;
1167
1168 static INLINE int mv_check_bounds(const MACROBLOCK *x, const MV *mv) {
1169   return (mv->row >> 3) < x->mv_row_min ||
1170          (mv->row >> 3) > x->mv_row_max ||
1171          (mv->col >> 3) < x->mv_col_min ||
1172          (mv->col >> 3) > x->mv_col_max;
1173 }
1174
1175 static INLINE void mi_buf_shift(MACROBLOCK *x, int i) {
1176   MB_MODE_INFO *const mbmi = &x->e_mbd.mi[0]->mbmi;
1177   struct macroblock_plane *const p = &x->plane[0];
1178   struct macroblockd_plane *const pd = &x->e_mbd.plane[0];
1179
1180   p->src.buf = &p->src.buf[raster_block_offset(BLOCK_8X8, i, p->src.stride)];
1181   assert(((intptr_t)pd->pre[0].buf & 0x7) == 0);
1182   pd->pre[0].buf = &pd->pre[0].buf[raster_block_offset(BLOCK_8X8, i,
1183                                                        pd->pre[0].stride)];
1184   if (has_second_ref(mbmi))
1185     pd->pre[1].buf = &pd->pre[1].buf[raster_block_offset(BLOCK_8X8, i,
1186                                                          pd->pre[1].stride)];
1187 }
1188
1189 static INLINE void mi_buf_restore(MACROBLOCK *x, struct buf_2d orig_src,
1190                                   struct buf_2d orig_pre[2]) {
1191   MB_MODE_INFO *mbmi = &x->e_mbd.mi[0]->mbmi;
1192   x->plane[0].src = orig_src;
1193   x->e_mbd.plane[0].pre[0] = orig_pre[0];
1194   if (has_second_ref(mbmi))
1195     x->e_mbd.plane[0].pre[1] = orig_pre[1];
1196 }
1197
1198 static INLINE int mv_has_subpel(const MV *mv) {
1199   return (mv->row & 0x0F) || (mv->col & 0x0F);
1200 }
1201
1202 // Check if NEARESTMV/NEARMV/ZEROMV is the cheapest way encode zero motion.
1203 // TODO(aconverse): Find out if this is still productive then clean up or remove
1204 static int check_best_zero_mv(
1205     const VP9_COMP *cpi, const uint8_t mode_context[MAX_REF_FRAMES],
1206     int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
1207     int inter_mode_mask, int this_mode,
1208     const MV_REFERENCE_FRAME ref_frames[2]) {
1209   if ((inter_mode_mask & (1 << ZEROMV)) &&
1210       (this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) &&
1211       frame_mv[this_mode][ref_frames[0]].as_int == 0 &&
1212       (ref_frames[1] == NONE ||
1213        frame_mv[this_mode][ref_frames[1]].as_int == 0)) {
1214     int rfc = mode_context[ref_frames[0]];
1215     int c1 = cost_mv_ref(cpi, NEARMV, rfc);
1216     int c2 = cost_mv_ref(cpi, NEARESTMV, rfc);
1217     int c3 = cost_mv_ref(cpi, ZEROMV, rfc);
1218
1219     if (this_mode == NEARMV) {
1220       if (c1 > c3) return 0;
1221     } else if (this_mode == NEARESTMV) {
1222       if (c2 > c3) return 0;
1223     } else {
1224       assert(this_mode == ZEROMV);
1225       if (ref_frames[1] == NONE) {
1226         if ((c3 >= c2 && frame_mv[NEARESTMV][ref_frames[0]].as_int == 0) ||
1227             (c3 >= c1 && frame_mv[NEARMV][ref_frames[0]].as_int == 0))
1228           return 0;
1229       } else {
1230         if ((c3 >= c2 && frame_mv[NEARESTMV][ref_frames[0]].as_int == 0 &&
1231              frame_mv[NEARESTMV][ref_frames[1]].as_int == 0) ||
1232             (c3 >= c1 && frame_mv[NEARMV][ref_frames[0]].as_int == 0 &&
1233              frame_mv[NEARMV][ref_frames[1]].as_int == 0))
1234           return 0;
1235       }
1236     }
1237   }
1238   return 1;
1239 }
1240
1241 static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x,
1242                                         const TileInfo * const tile,
1243                                         int_mv *best_ref_mv,
1244                                         int_mv *second_best_ref_mv,
1245                                         int64_t best_rd, int *returntotrate,
1246                                         int *returnyrate,
1247                                         int64_t *returndistortion,
1248                                         int *skippable, int64_t *psse,
1249                                         int mvthresh,
1250                                         int_mv seg_mvs[4][MAX_REF_FRAMES],
1251                                         BEST_SEG_INFO *bsi_buf, int filter_idx,
1252                                         int mi_row, int mi_col) {
1253   int i;
1254   BEST_SEG_INFO *bsi = bsi_buf + filter_idx;
1255   MACROBLOCKD *xd = &x->e_mbd;
1256   MODE_INFO *mi = xd->mi[0];
1257   MB_MODE_INFO *mbmi = &mi->mbmi;
1258   int mode_idx;
1259   int k, br = 0, idx, idy;
1260   int64_t bd = 0, block_sse = 0;
1261   PREDICTION_MODE this_mode;
1262   VP9_COMMON *cm = &cpi->common;
1263   struct macroblock_plane *const p = &x->plane[0];
1264   struct macroblockd_plane *const pd = &xd->plane[0];
1265   const int label_count = 4;
1266   int64_t this_segment_rd = 0;
1267   int label_mv_thresh;
1268   int segmentyrate = 0;
1269   const BLOCK_SIZE bsize = mbmi->sb_type;
1270   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
1271   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
1272   ENTROPY_CONTEXT t_above[2], t_left[2];
1273   int subpelmv = 1, have_ref = 0;
1274   const int has_second_rf = has_second_ref(mbmi);
1275   const int inter_mode_mask = cpi->sf.inter_mode_mask[bsize];
1276
1277   vp9_zero(*bsi);
1278
1279   bsi->segment_rd = best_rd;
1280   bsi->ref_mv[0] = best_ref_mv;
1281   bsi->ref_mv[1] = second_best_ref_mv;
1282   bsi->mvp.as_int = best_ref_mv->as_int;
1283   bsi->mvthresh = mvthresh;
1284
1285   for (i = 0; i < 4; i++)
1286     bsi->modes[i] = ZEROMV;
1287
1288   vpx_memcpy(t_above, pd->above_context, sizeof(t_above));
1289   vpx_memcpy(t_left, pd->left_context, sizeof(t_left));
1290
1291   // 64 makes this threshold really big effectively
1292   // making it so that we very rarely check mvs on
1293   // segments.   setting this to 1 would make mv thresh
1294   // roughly equal to what it is for macroblocks
1295   label_mv_thresh = 1 * bsi->mvthresh / label_count;
1296
1297   // Segmentation method overheads
1298   for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
1299     for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
1300       // TODO(jingning,rbultje): rewrite the rate-distortion optimization
1301       // loop for 4x4/4x8/8x4 block coding. to be replaced with new rd loop
1302       int_mv mode_mv[MB_MODE_COUNT][2];
1303       int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
1304       PREDICTION_MODE mode_selected = ZEROMV;
1305       int64_t best_rd = INT64_MAX;
1306       const int i = idy * 2 + idx;
1307       int ref;
1308
1309       for (ref = 0; ref < 1 + has_second_rf; ++ref) {
1310         const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
1311         frame_mv[ZEROMV][frame].as_int = 0;
1312         vp9_append_sub8x8_mvs_for_idx(cm, xd, tile, i, ref, mi_row, mi_col,
1313                                       &frame_mv[NEARESTMV][frame],
1314                                       &frame_mv[NEARMV][frame]);
1315       }
1316
1317       // search for the best motion vector on this segment
1318       for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
1319         const struct buf_2d orig_src = x->plane[0].src;
1320         struct buf_2d orig_pre[2];
1321
1322         mode_idx = INTER_OFFSET(this_mode);
1323         bsi->rdstat[i][mode_idx].brdcost = INT64_MAX;
1324         if (!(inter_mode_mask & (1 << this_mode)))
1325           continue;
1326
1327         if (!check_best_zero_mv(cpi, mbmi->mode_context, frame_mv,
1328                                 inter_mode_mask,
1329                                 this_mode, mbmi->ref_frame))
1330           continue;
1331
1332         vpx_memcpy(orig_pre, pd->pre, sizeof(orig_pre));
1333         vpx_memcpy(bsi->rdstat[i][mode_idx].ta, t_above,
1334                    sizeof(bsi->rdstat[i][mode_idx].ta));
1335         vpx_memcpy(bsi->rdstat[i][mode_idx].tl, t_left,
1336                    sizeof(bsi->rdstat[i][mode_idx].tl));
1337
1338         // motion search for newmv (single predictor case only)
1339         if (!has_second_rf && this_mode == NEWMV &&
1340             seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV) {
1341           MV *const new_mv = &mode_mv[NEWMV][0].as_mv;
1342           int step_param = 0;
1343           int thissme, bestsme = INT_MAX;
1344           int sadpb = x->sadperbit4;
1345           MV mvp_full;
1346           int max_mv;
1347
1348           /* Is the best so far sufficiently good that we can't justify doing
1349            * a new motion search? */
1350           if (best_rd < label_mv_thresh)
1351             break;
1352
1353           if (!is_best_mode(cpi->oxcf.mode)) {
1354             // use previous block's result as next block's MV predictor.
1355             if (i > 0) {
1356               bsi->mvp.as_int = mi->bmi[i - 1].as_mv[0].as_int;
1357               if (i == 2)
1358                 bsi->mvp.as_int = mi->bmi[i - 2].as_mv[0].as_int;
1359             }
1360           }
1361           if (i == 0)
1362             max_mv = x->max_mv_context[mbmi->ref_frame[0]];
1363           else
1364             max_mv = MAX(abs(bsi->mvp.as_mv.row), abs(bsi->mvp.as_mv.col)) >> 3;
1365
1366           if (cpi->sf.mv.auto_mv_step_size && cm->show_frame) {
1367             // Take wtd average of the step_params based on the last frame's
1368             // max mv magnitude and the best ref mvs of the current block for
1369             // the given reference.
1370             step_param = (vp9_init_search_range(max_mv) +
1371                               cpi->mv_step_param) / 2;
1372           } else {
1373             step_param = cpi->mv_step_param;
1374           }
1375
1376           mvp_full.row = bsi->mvp.as_mv.row >> 3;
1377           mvp_full.col = bsi->mvp.as_mv.col >> 3;
1378
1379           if (cpi->sf.adaptive_motion_search && cm->show_frame) {
1380             mvp_full.row = x->pred_mv[mbmi->ref_frame[0]].row >> 3;
1381             mvp_full.col = x->pred_mv[mbmi->ref_frame[0]].col >> 3;
1382             step_param = MAX(step_param, 8);
1383           }
1384
1385           // adjust src pointer for this block
1386           mi_buf_shift(x, i);
1387
1388           vp9_set_mv_search_range(x, &bsi->ref_mv[0]->as_mv);
1389
1390           bestsme = vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param,
1391                                           sadpb, &bsi->ref_mv[0]->as_mv, new_mv,
1392                                           INT_MAX, 1);
1393
1394           // Should we do a full search (best quality only)
1395           if (is_best_mode(cpi->oxcf.mode)) {
1396             int_mv *const best_mv = &mi->bmi[i].as_mv[0];
1397             /* Check if mvp_full is within the range. */
1398             clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max,
1399                      x->mv_row_min, x->mv_row_max);
1400             thissme = cpi->full_search_sad(x, &mvp_full,
1401                                            sadpb, 16, &cpi->fn_ptr[bsize],
1402                                            &bsi->ref_mv[0]->as_mv,
1403                                            &best_mv->as_mv);
1404             if (thissme < bestsme) {
1405               bestsme = thissme;
1406               *new_mv = best_mv->as_mv;
1407             } else {
1408               // The full search result is actually worse so re-instate the
1409               // previous best vector
1410               best_mv->as_mv = *new_mv;
1411             }
1412           }
1413
1414           if (bestsme < INT_MAX) {
1415             int distortion;
1416             cpi->find_fractional_mv_step(x,
1417                                          new_mv,
1418                                          &bsi->ref_mv[0]->as_mv,
1419                                          cm->allow_high_precision_mv,
1420                                          x->errorperbit, &cpi->fn_ptr[bsize],
1421                                          cpi->sf.mv.subpel_force_stop,
1422                                          cpi->sf.mv.subpel_iters_per_step,
1423                                          x->nmvjointcost, x->mvcost,
1424                                          &distortion,
1425                                          &x->pred_sse[mbmi->ref_frame[0]],
1426                                          NULL, 0, 0);
1427
1428             // save motion search result for use in compound prediction
1429             seg_mvs[i][mbmi->ref_frame[0]].as_mv = *new_mv;
1430           }
1431
1432           if (cpi->sf.adaptive_motion_search)
1433             x->pred_mv[mbmi->ref_frame[0]] = *new_mv;
1434
1435           // restore src pointers
1436           mi_buf_restore(x, orig_src, orig_pre);
1437         }
1438
1439         if (has_second_rf) {
1440           if (seg_mvs[i][mbmi->ref_frame[1]].as_int == INVALID_MV ||
1441               seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV)
1442             continue;
1443         }
1444
1445         if (has_second_rf && this_mode == NEWMV &&
1446             mbmi->interp_filter == EIGHTTAP) {
1447           // adjust src pointers
1448           mi_buf_shift(x, i);
1449           if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
1450             int rate_mv;
1451             joint_motion_search(cpi, x, bsize, frame_mv[this_mode],
1452                                 mi_row, mi_col, seg_mvs[i],
1453                                 &rate_mv);
1454             seg_mvs[i][mbmi->ref_frame[0]].as_int =
1455                 frame_mv[this_mode][mbmi->ref_frame[0]].as_int;
1456             seg_mvs[i][mbmi->ref_frame[1]].as_int =
1457                 frame_mv[this_mode][mbmi->ref_frame[1]].as_int;
1458           }
1459           // restore src pointers
1460           mi_buf_restore(x, orig_src, orig_pre);
1461         }
1462
1463         bsi->rdstat[i][mode_idx].brate =
1464             set_and_cost_bmi_mvs(cpi, xd, i, this_mode, mode_mv[this_mode],
1465                                  frame_mv, seg_mvs[i], bsi->ref_mv,
1466                                  x->nmvjointcost, x->mvcost);
1467
1468         for (ref = 0; ref < 1 + has_second_rf; ++ref) {
1469           bsi->rdstat[i][mode_idx].mvs[ref].as_int =
1470               mode_mv[this_mode][ref].as_int;
1471           if (num_4x4_blocks_wide > 1)
1472             bsi->rdstat[i + 1][mode_idx].mvs[ref].as_int =
1473                 mode_mv[this_mode][ref].as_int;
1474           if (num_4x4_blocks_high > 1)
1475             bsi->rdstat[i + 2][mode_idx].mvs[ref].as_int =
1476                 mode_mv[this_mode][ref].as_int;
1477         }
1478
1479         // Trap vectors that reach beyond the UMV borders
1480         if (mv_check_bounds(x, &mode_mv[this_mode][0].as_mv) ||
1481             (has_second_rf &&
1482              mv_check_bounds(x, &mode_mv[this_mode][1].as_mv)))
1483           continue;
1484
1485         if (filter_idx > 0) {
1486           BEST_SEG_INFO *ref_bsi = bsi_buf;
1487           subpelmv = 0;
1488           have_ref = 1;
1489
1490           for (ref = 0; ref < 1 + has_second_rf; ++ref) {
1491             subpelmv |= mv_has_subpel(&mode_mv[this_mode][ref].as_mv);
1492             have_ref &= mode_mv[this_mode][ref].as_int ==
1493                 ref_bsi->rdstat[i][mode_idx].mvs[ref].as_int;
1494           }
1495
1496           if (filter_idx > 1 && !subpelmv && !have_ref) {
1497             ref_bsi = bsi_buf + 1;
1498             have_ref = 1;
1499             for (ref = 0; ref < 1 + has_second_rf; ++ref)
1500               have_ref &= mode_mv[this_mode][ref].as_int ==
1501                   ref_bsi->rdstat[i][mode_idx].mvs[ref].as_int;
1502           }
1503
1504           if (!subpelmv && have_ref &&
1505               ref_bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) {
1506             vpx_memcpy(&bsi->rdstat[i][mode_idx], &ref_bsi->rdstat[i][mode_idx],
1507                        sizeof(SEG_RDSTAT));
1508             if (num_4x4_blocks_wide > 1)
1509               bsi->rdstat[i + 1][mode_idx].eobs =
1510                   ref_bsi->rdstat[i + 1][mode_idx].eobs;
1511             if (num_4x4_blocks_high > 1)
1512               bsi->rdstat[i + 2][mode_idx].eobs =
1513                   ref_bsi->rdstat[i + 2][mode_idx].eobs;
1514
1515             if (bsi->rdstat[i][mode_idx].brdcost < best_rd) {
1516               mode_selected = this_mode;
1517               best_rd = bsi->rdstat[i][mode_idx].brdcost;
1518             }
1519             continue;
1520           }
1521         }
1522
1523         bsi->rdstat[i][mode_idx].brdcost =
1524             encode_inter_mb_segment(cpi, x,
1525                                     bsi->segment_rd - this_segment_rd, i,
1526                                     &bsi->rdstat[i][mode_idx].byrate,
1527                                     &bsi->rdstat[i][mode_idx].bdist,
1528                                     &bsi->rdstat[i][mode_idx].bsse,
1529                                     bsi->rdstat[i][mode_idx].ta,
1530                                     bsi->rdstat[i][mode_idx].tl,
1531                                     mi_row, mi_col);
1532         if (bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) {
1533           bsi->rdstat[i][mode_idx].brdcost += RDCOST(x->rdmult, x->rddiv,
1534                                             bsi->rdstat[i][mode_idx].brate, 0);
1535           bsi->rdstat[i][mode_idx].brate += bsi->rdstat[i][mode_idx].byrate;
1536           bsi->rdstat[i][mode_idx].eobs = p->eobs[i];
1537           if (num_4x4_blocks_wide > 1)
1538             bsi->rdstat[i + 1][mode_idx].eobs = p->eobs[i + 1];
1539           if (num_4x4_blocks_high > 1)
1540             bsi->rdstat[i + 2][mode_idx].eobs = p->eobs[i + 2];
1541         }
1542
1543         if (bsi->rdstat[i][mode_idx].brdcost < best_rd) {
1544           mode_selected = this_mode;
1545           best_rd = bsi->rdstat[i][mode_idx].brdcost;
1546         }
1547       } /*for each 4x4 mode*/
1548
1549       if (best_rd == INT64_MAX) {
1550         int iy, midx;
1551         for (iy = i + 1; iy < 4; ++iy)
1552           for (midx = 0; midx < INTER_MODES; ++midx)
1553             bsi->rdstat[iy][midx].brdcost = INT64_MAX;
1554         bsi->segment_rd = INT64_MAX;
1555         return INT64_MAX;;
1556       }
1557
1558       mode_idx = INTER_OFFSET(mode_selected);
1559       vpx_memcpy(t_above, bsi->rdstat[i][mode_idx].ta, sizeof(t_above));
1560       vpx_memcpy(t_left, bsi->rdstat[i][mode_idx].tl, sizeof(t_left));
1561
1562       set_and_cost_bmi_mvs(cpi, xd, i, mode_selected, mode_mv[mode_selected],
1563                            frame_mv, seg_mvs[i], bsi->ref_mv, x->nmvjointcost,
1564                            x->mvcost);
1565
1566       br += bsi->rdstat[i][mode_idx].brate;
1567       bd += bsi->rdstat[i][mode_idx].bdist;
1568       block_sse += bsi->rdstat[i][mode_idx].bsse;
1569       segmentyrate += bsi->rdstat[i][mode_idx].byrate;
1570       this_segment_rd += bsi->rdstat[i][mode_idx].brdcost;
1571
1572       if (this_segment_rd > bsi->segment_rd) {
1573         int iy, midx;
1574         for (iy = i + 1; iy < 4; ++iy)
1575           for (midx = 0; midx < INTER_MODES; ++midx)
1576             bsi->rdstat[iy][midx].brdcost = INT64_MAX;
1577         bsi->segment_rd = INT64_MAX;
1578         return INT64_MAX;;
1579       }
1580     }
1581   } /* for each label */
1582
1583   bsi->r = br;
1584   bsi->d = bd;
1585   bsi->segment_yrate = segmentyrate;
1586   bsi->segment_rd = this_segment_rd;
1587   bsi->sse = block_sse;
1588
1589   // update the coding decisions
1590   for (k = 0; k < 4; ++k)
1591     bsi->modes[k] = mi->bmi[k].as_mode;
1592
1593   if (bsi->segment_rd > best_rd)
1594     return INT64_MAX;
1595   /* set it to the best */
1596   for (i = 0; i < 4; i++) {
1597     mode_idx = INTER_OFFSET(bsi->modes[i]);
1598     mi->bmi[i].as_mv[0].as_int = bsi->rdstat[i][mode_idx].mvs[0].as_int;
1599     if (has_second_ref(mbmi))
1600       mi->bmi[i].as_mv[1].as_int = bsi->rdstat[i][mode_idx].mvs[1].as_int;
1601     x->plane[0].eobs[i] = bsi->rdstat[i][mode_idx].eobs;
1602     mi->bmi[i].as_mode = bsi->modes[i];
1603   }
1604
1605   /*
1606    * used to set mbmi->mv.as_int
1607    */
1608   *returntotrate = bsi->r;
1609   *returndistortion = bsi->d;
1610   *returnyrate = bsi->segment_yrate;
1611   *skippable = vp9_is_skippable_in_plane(x, BLOCK_8X8, 0);
1612   *psse = bsi->sse;
1613   mbmi->mode = bsi->modes[3];
1614
1615   return bsi->segment_rd;
1616 }
1617
1618 static void estimate_ref_frame_costs(const VP9_COMMON *cm,
1619                                      const MACROBLOCKD *xd,
1620                                      int segment_id,
1621                                      unsigned int *ref_costs_single,
1622                                      unsigned int *ref_costs_comp,
1623                                      vp9_prob *comp_mode_p) {
1624   int seg_ref_active = vp9_segfeature_active(&cm->seg, segment_id,
1625                                              SEG_LVL_REF_FRAME);
1626   if (seg_ref_active) {
1627     vpx_memset(ref_costs_single, 0, MAX_REF_FRAMES * sizeof(*ref_costs_single));
1628     vpx_memset(ref_costs_comp,   0, MAX_REF_FRAMES * sizeof(*ref_costs_comp));
1629     *comp_mode_p = 128;
1630   } else {
1631     vp9_prob intra_inter_p = vp9_get_intra_inter_prob(cm, xd);
1632     vp9_prob comp_inter_p = 128;
1633
1634     if (cm->reference_mode == REFERENCE_MODE_SELECT) {
1635       comp_inter_p = vp9_get_reference_mode_prob(cm, xd);
1636       *comp_mode_p = comp_inter_p;
1637     } else {
1638       *comp_mode_p = 128;
1639     }
1640
1641     ref_costs_single[INTRA_FRAME] = vp9_cost_bit(intra_inter_p, 0);
1642
1643     if (cm->reference_mode != COMPOUND_REFERENCE) {
1644       vp9_prob ref_single_p1 = vp9_get_pred_prob_single_ref_p1(cm, xd);
1645       vp9_prob ref_single_p2 = vp9_get_pred_prob_single_ref_p2(cm, xd);
1646       unsigned int base_cost = vp9_cost_bit(intra_inter_p, 1);
1647
1648       if (cm->reference_mode == REFERENCE_MODE_SELECT)
1649         base_cost += vp9_cost_bit(comp_inter_p, 0);
1650
1651       ref_costs_single[LAST_FRAME] = ref_costs_single[GOLDEN_FRAME] =
1652           ref_costs_single[ALTREF_FRAME] = base_cost;
1653       ref_costs_single[LAST_FRAME]   += vp9_cost_bit(ref_single_p1, 0);
1654       ref_costs_single[GOLDEN_FRAME] += vp9_cost_bit(ref_single_p1, 1);
1655       ref_costs_single[ALTREF_FRAME] += vp9_cost_bit(ref_single_p1, 1);
1656       ref_costs_single[GOLDEN_FRAME] += vp9_cost_bit(ref_single_p2, 0);
1657       ref_costs_single[ALTREF_FRAME] += vp9_cost_bit(ref_single_p2, 1);
1658     } else {
1659       ref_costs_single[LAST_FRAME]   = 512;
1660       ref_costs_single[GOLDEN_FRAME] = 512;
1661       ref_costs_single[ALTREF_FRAME] = 512;
1662     }
1663     if (cm->reference_mode != SINGLE_REFERENCE) {
1664       vp9_prob ref_comp_p = vp9_get_pred_prob_comp_ref_p(cm, xd);
1665       unsigned int base_cost = vp9_cost_bit(intra_inter_p, 1);
1666
1667       if (cm->reference_mode == REFERENCE_MODE_SELECT)
1668         base_cost += vp9_cost_bit(comp_inter_p, 1);
1669
1670       ref_costs_comp[LAST_FRAME]   = base_cost + vp9_cost_bit(ref_comp_p, 0);
1671       ref_costs_comp[GOLDEN_FRAME] = base_cost + vp9_cost_bit(ref_comp_p, 1);
1672     } else {
1673       ref_costs_comp[LAST_FRAME]   = 512;
1674       ref_costs_comp[GOLDEN_FRAME] = 512;
1675     }
1676   }
1677 }
1678
1679 static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
1680                          int mode_index,
1681                          int64_t comp_pred_diff[REFERENCE_MODES],
1682                          const int64_t tx_size_diff[TX_MODES],
1683                          int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS]) {
1684   MACROBLOCKD *const xd = &x->e_mbd;
1685
1686   // Take a snapshot of the coding context so it can be
1687   // restored if we decide to encode this way
1688   ctx->skip = x->skip;
1689   ctx->best_mode_index = mode_index;
1690   ctx->mic = *xd->mi[0];
1691   ctx->single_pred_diff = (int)comp_pred_diff[SINGLE_REFERENCE];
1692   ctx->comp_pred_diff   = (int)comp_pred_diff[COMPOUND_REFERENCE];
1693   ctx->hybrid_pred_diff = (int)comp_pred_diff[REFERENCE_MODE_SELECT];
1694
1695   vpx_memcpy(ctx->tx_rd_diff, tx_size_diff, sizeof(ctx->tx_rd_diff));
1696   vpx_memcpy(ctx->best_filter_diff, best_filter_diff,
1697              sizeof(*best_filter_diff) * SWITCHABLE_FILTER_CONTEXTS);
1698 }
1699
1700 static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
1701                                const TileInfo *const tile,
1702                                MV_REFERENCE_FRAME ref_frame,
1703                                BLOCK_SIZE block_size,
1704                                int mi_row, int mi_col,
1705                                int_mv frame_nearest_mv[MAX_REF_FRAMES],
1706                                int_mv frame_near_mv[MAX_REF_FRAMES],
1707                                struct buf_2d yv12_mb[4][MAX_MB_PLANE]) {
1708   const VP9_COMMON *cm = &cpi->common;
1709   const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
1710   MACROBLOCKD *const xd = &x->e_mbd;
1711   MODE_INFO *const mi = xd->mi[0];
1712   int_mv *const candidates = mi->mbmi.ref_mvs[ref_frame];
1713   const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf;
1714
1715   // TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this
1716   // use the UV scaling factors.
1717   vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf);
1718
1719   // Gets an initial list of candidate vectors from neighbours and orders them
1720   vp9_find_mv_refs(cm, xd, tile, mi, ref_frame, candidates, mi_row, mi_col);
1721
1722   // Candidate refinement carried out at encoder and decoder
1723   vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv, candidates,
1724                         &frame_nearest_mv[ref_frame],
1725                         &frame_near_mv[ref_frame]);
1726
1727   // Further refinement that is encode side only to test the top few candidates
1728   // in full and choose the best as the centre point for subsequent searches.
1729   // The current implementation doesn't support scaling.
1730   if (!vp9_is_scaled(sf) && block_size >= BLOCK_8X8)
1731     vp9_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride,
1732                 ref_frame, block_size);
1733 }
1734
1735 static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
1736                                  BLOCK_SIZE bsize,
1737                                  int mi_row, int mi_col,
1738                                  int_mv *tmp_mv, int *rate_mv) {
1739   MACROBLOCKD *xd = &x->e_mbd;
1740   const VP9_COMMON *cm = &cpi->common;
1741   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
1742   struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0, 0}};
1743   int bestsme = INT_MAX;
1744   int step_param;
1745   int sadpb = x->sadperbit16;
1746   MV mvp_full;
1747   int ref = mbmi->ref_frame[0];
1748   MV ref_mv = mbmi->ref_mvs[ref][0].as_mv;
1749
1750   int tmp_col_min = x->mv_col_min;
1751   int tmp_col_max = x->mv_col_max;
1752   int tmp_row_min = x->mv_row_min;
1753   int tmp_row_max = x->mv_row_max;
1754
1755   const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi,
1756                                                                         ref);
1757
1758   MV pred_mv[3];
1759   pred_mv[0] = mbmi->ref_mvs[ref][0].as_mv;
1760   pred_mv[1] = mbmi->ref_mvs[ref][1].as_mv;
1761   pred_mv[2] = x->pred_mv[ref];
1762
1763   if (scaled_ref_frame) {
1764     int i;
1765     // Swap out the reference frame for a version that's been scaled to
1766     // match the resolution of the current frame, allowing the existing
1767     // motion search code to be used without additional modifications.
1768     for (i = 0; i < MAX_MB_PLANE; i++)
1769       backup_yv12[i] = xd->plane[i].pre[0];
1770
1771     vp9_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL);
1772   }
1773
1774   vp9_set_mv_search_range(x, &ref_mv);
1775
1776   // Work out the size of the first step in the mv step search.
1777   // 0 here is maximum length first step. 1 is MAX >> 1 etc.
1778   if (cpi->sf.mv.auto_mv_step_size && cm->show_frame) {
1779     // Take wtd average of the step_params based on the last frame's
1780     // max mv magnitude and that based on the best ref mvs of the current
1781     // block for the given reference.
1782     step_param = (vp9_init_search_range(x->max_mv_context[ref]) +
1783                     cpi->mv_step_param) / 2;
1784   } else {
1785     step_param = cpi->mv_step_param;
1786   }
1787
1788   if (cpi->sf.adaptive_motion_search && bsize < BLOCK_64X64 &&
1789       cm->show_frame) {
1790     int boffset = 2 * (b_width_log2(BLOCK_64X64) - MIN(b_height_log2(bsize),
1791                                                        b_width_log2(bsize)));
1792     step_param = MAX(step_param, boffset);
1793   }
1794
1795   if (cpi->sf.adaptive_motion_search) {
1796     int bwl = b_width_log2(bsize);
1797     int bhl = b_height_log2(bsize);
1798     int i;
1799     int tlevel = x->pred_mv_sad[ref] >> (bwl + bhl + 4);
1800
1801     if (tlevel < 5)
1802       step_param += 2;
1803
1804     for (i = LAST_FRAME; i <= ALTREF_FRAME && cm->show_frame; ++i) {
1805       if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) {
1806         x->pred_mv[ref].row = 0;
1807         x->pred_mv[ref].col = 0;
1808         tmp_mv->as_int = INVALID_MV;
1809
1810         if (scaled_ref_frame) {
1811           int i;
1812           for (i = 0; i < MAX_MB_PLANE; i++)
1813             xd->plane[i].pre[0] = backup_yv12[i];
1814         }
1815         return;
1816       }
1817     }
1818   }
1819
1820   mvp_full = pred_mv[x->mv_best_ref_index[ref]];
1821
1822   mvp_full.col >>= 3;
1823   mvp_full.row >>= 3;
1824
1825   bestsme = vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb,
1826                                   &ref_mv, &tmp_mv->as_mv, INT_MAX, 1);
1827
1828   x->mv_col_min = tmp_col_min;
1829   x->mv_col_max = tmp_col_max;
1830   x->mv_row_min = tmp_row_min;
1831   x->mv_row_max = tmp_row_max;
1832
1833   if (bestsme < INT_MAX) {
1834     int dis;  /* TODO: use dis in distortion calculation later. */
1835     cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv,
1836                                  cm->allow_high_precision_mv,
1837                                  x->errorperbit,
1838                                  &cpi->fn_ptr[bsize],
1839                                  cpi->sf.mv.subpel_force_stop,
1840                                  cpi->sf.mv.subpel_iters_per_step,
1841                                  x->nmvjointcost, x->mvcost,
1842                                  &dis, &x->pred_sse[ref], NULL, 0, 0);
1843   }
1844   *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv,
1845                              x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
1846
1847   if (cpi->sf.adaptive_motion_search && cm->show_frame)
1848     x->pred_mv[ref] = tmp_mv->as_mv;
1849
1850   if (scaled_ref_frame) {
1851     int i;
1852     for (i = 0; i < MAX_MB_PLANE; i++)
1853       xd->plane[i].pre[0] = backup_yv12[i];
1854   }
1855 }
1856
1857 static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
1858                                 BLOCK_SIZE bsize,
1859                                 int_mv *frame_mv,
1860                                 int mi_row, int mi_col,
1861                                 int_mv single_newmv[MAX_REF_FRAMES],
1862                                 int *rate_mv) {
1863   const int pw = 4 * num_4x4_blocks_wide_lookup[bsize];
1864   const int ph = 4 * num_4x4_blocks_high_lookup[bsize];
1865   MACROBLOCKD *xd = &x->e_mbd;
1866   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
1867   const int refs[2] = { mbmi->ref_frame[0],
1868                         mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1] };
1869   int_mv ref_mv[2];
1870   int ite, ref;
1871   // Prediction buffer from second frame.
1872   uint8_t *second_pred = vpx_memalign(16, pw * ph * sizeof(uint8_t));
1873   const InterpKernel *kernel = vp9_get_interp_kernel(mbmi->interp_filter);
1874
1875   // Do joint motion search in compound mode to get more accurate mv.
1876   struct buf_2d backup_yv12[2][MAX_MB_PLANE];
1877   struct buf_2d scaled_first_yv12 = xd->plane[0].pre[0];
1878   int last_besterr[2] = {INT_MAX, INT_MAX};
1879   const YV12_BUFFER_CONFIG *const scaled_ref_frame[2] = {
1880     vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[0]),
1881     vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[1])
1882   };
1883
1884   for (ref = 0; ref < 2; ++ref) {
1885     ref_mv[ref] = mbmi->ref_mvs[refs[ref]][0];
1886
1887     if (scaled_ref_frame[ref]) {
1888       int i;
1889       // Swap out the reference frame for a version that's been scaled to
1890       // match the resolution of the current frame, allowing the existing
1891       // motion search code to be used without additional modifications.
1892       for (i = 0; i < MAX_MB_PLANE; i++)
1893         backup_yv12[ref][i] = xd->plane[i].pre[ref];
1894       vp9_setup_pre_planes(xd, ref, scaled_ref_frame[ref], mi_row, mi_col,
1895                            NULL);
1896     }
1897
1898     frame_mv[refs[ref]].as_int = single_newmv[refs[ref]].as_int;
1899   }
1900
1901   // Allow joint search multiple times iteratively for each ref frame
1902   // and break out the search loop if it couldn't find better mv.
1903   for (ite = 0; ite < 4; ite++) {
1904     struct buf_2d ref_yv12[2];
1905     int bestsme = INT_MAX;
1906     int sadpb = x->sadperbit16;
1907     MV tmp_mv;
1908     int search_range = 3;
1909
1910     int tmp_col_min = x->mv_col_min;
1911     int tmp_col_max = x->mv_col_max;
1912     int tmp_row_min = x->mv_row_min;
1913     int tmp_row_max = x->mv_row_max;
1914     int id = ite % 2;
1915
1916     // Initialized here because of compiler problem in Visual Studio.
1917     ref_yv12[0] = xd->plane[0].pre[0];
1918     ref_yv12[1] = xd->plane[0].pre[1];
1919
1920     // Get pred block from second frame.
1921     vp9_build_inter_predictor(ref_yv12[!id].buf,
1922                               ref_yv12[!id].stride,
1923                               second_pred, pw,
1924                               &frame_mv[refs[!id]].as_mv,
1925                               &xd->block_refs[!id]->sf,
1926                               pw, ph, 0,
1927                               kernel, MV_PRECISION_Q3,
1928                               mi_col * MI_SIZE, mi_row * MI_SIZE);
1929
1930     // Compound motion search on first ref frame.
1931     if (id)
1932       xd->plane[0].pre[0] = ref_yv12[id];
1933     vp9_set_mv_search_range(x, &ref_mv[id].as_mv);
1934
1935     // Use mv result from single mode as mvp.
1936     tmp_mv = frame_mv[refs[id]].as_mv;
1937
1938     tmp_mv.col >>= 3;
1939     tmp_mv.row >>= 3;
1940
1941     // Small-range full-pixel motion search
1942     bestsme = vp9_refining_search_8p_c(x, &tmp_mv, sadpb,
1943                                        search_range,
1944                                        &cpi->fn_ptr[bsize],
1945                                        &ref_mv[id].as_mv, second_pred);
1946     if (bestsme < INT_MAX)
1947       bestsme = vp9_get_mvpred_av_var(x, &tmp_mv, &ref_mv[id].as_mv,
1948                                       second_pred, &cpi->fn_ptr[bsize], 1);
1949
1950     x->mv_col_min = tmp_col_min;
1951     x->mv_col_max = tmp_col_max;
1952     x->mv_row_min = tmp_row_min;
1953     x->mv_row_max = tmp_row_max;
1954
1955     if (bestsme < INT_MAX) {
1956       int dis; /* TODO: use dis in distortion calculation later. */
1957       unsigned int sse;
1958       bestsme = cpi->find_fractional_mv_step(
1959           x, &tmp_mv,
1960           &ref_mv[id].as_mv,
1961           cpi->common.allow_high_precision_mv,
1962           x->errorperbit,
1963           &cpi->fn_ptr[bsize],
1964           0, cpi->sf.mv.subpel_iters_per_step,
1965           x->nmvjointcost, x->mvcost,
1966           &dis, &sse, second_pred,
1967           pw, ph);
1968     }
1969
1970     if (id)
1971       xd->plane[0].pre[0] = scaled_first_yv12;
1972
1973     if (bestsme < last_besterr[id]) {
1974       frame_mv[refs[id]].as_mv = tmp_mv;
1975       last_besterr[id] = bestsme;
1976     } else {
1977       break;
1978     }
1979   }
1980
1981   *rate_mv = 0;
1982
1983   for (ref = 0; ref < 2; ++ref) {
1984     if (scaled_ref_frame[ref]) {
1985       // restore the predictor
1986       int i;
1987       for (i = 0; i < MAX_MB_PLANE; i++)
1988         xd->plane[i].pre[ref] = backup_yv12[ref][i];
1989     }
1990
1991     *rate_mv += vp9_mv_bit_cost(&frame_mv[refs[ref]].as_mv,
1992                                 &mbmi->ref_mvs[refs[ref]][0].as_mv,
1993                                 x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
1994   }
1995
1996   vpx_free(second_pred);
1997 }
1998
1999 static INLINE void restore_dst_buf(MACROBLOCKD *xd,
2000                                    uint8_t *orig_dst[MAX_MB_PLANE],
2001                                    int orig_dst_stride[MAX_MB_PLANE]) {
2002   int i;
2003   for (i = 0; i < MAX_MB_PLANE; i++) {
2004     xd->plane[i].dst.buf = orig_dst[i];
2005     xd->plane[i].dst.stride = orig_dst_stride[i];
2006   }
2007 }
2008
2009 static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
2010                                  BLOCK_SIZE bsize,
2011                                  int64_t txfm_cache[],
2012                                  int *rate2, int64_t *distortion,
2013                                  int *skippable,
2014                                  int *rate_y, int64_t *distortion_y,
2015                                  int *rate_uv, int64_t *distortion_uv,
2016                                  int *disable_skip,
2017                                  int_mv (*mode_mv)[MAX_REF_FRAMES],
2018                                  int mi_row, int mi_col,
2019                                  int_mv single_newmv[MAX_REF_FRAMES],
2020                                  int64_t *psse,
2021                                  const int64_t ref_best_rd) {
2022   VP9_COMMON *cm = &cpi->common;
2023   RD_OPT *rd_opt = &cpi->rd;
2024   MACROBLOCKD *xd = &x->e_mbd;
2025   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
2026   const int is_comp_pred = has_second_ref(mbmi);
2027   const int this_mode = mbmi->mode;
2028   int_mv *frame_mv = mode_mv[this_mode];
2029   int i;
2030   int refs[2] = { mbmi->ref_frame[0],
2031     (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
2032   int_mv cur_mv[2];
2033   int64_t this_rd = 0;
2034   DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf, MAX_MB_PLANE * 64 * 64);
2035   int pred_exists = 0;
2036   int intpel_mv;
2037   int64_t rd, best_rd = INT64_MAX;
2038   int best_needs_copy = 0;
2039   uint8_t *orig_dst[MAX_MB_PLANE];
2040   int orig_dst_stride[MAX_MB_PLANE];
2041   int rs = 0;
2042   INTERP_FILTER best_filter = SWITCHABLE;
2043
2044   int bsl = mi_width_log2_lookup[bsize];
2045   int pred_filter_search = cpi->sf.cb_pred_filter_search ?
2046       (((mi_row + mi_col) >> bsl)) & 0x01 : 0;
2047
2048   if (pred_filter_search) {
2049     INTERP_FILTER af = SWITCHABLE, lf = SWITCHABLE;
2050     if (xd->up_available)
2051       af = xd->mi[-xd->mi_stride]->mbmi.interp_filter;
2052     if (xd->left_available)
2053       lf = xd->mi[-1]->mbmi.interp_filter;
2054
2055     if ((this_mode != NEWMV) || (af == lf))
2056       best_filter = af;
2057   }
2058
2059   if (is_comp_pred) {
2060     if (frame_mv[refs[0]].as_int == INVALID_MV ||
2061         frame_mv[refs[1]].as_int == INVALID_MV)
2062       return INT64_MAX;
2063   }
2064
2065   if (this_mode == NEWMV) {
2066     int rate_mv;
2067     if (is_comp_pred) {
2068       // Initialize mv using single prediction mode result.
2069       frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
2070       frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
2071
2072       if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
2073         joint_motion_search(cpi, x, bsize, frame_mv,
2074                             mi_row, mi_col, single_newmv, &rate_mv);
2075       } else {
2076         rate_mv  = vp9_mv_bit_cost(&frame_mv[refs[0]].as_mv,
2077                                    &mbmi->ref_mvs[refs[0]][0].as_mv,
2078                                    x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
2079         rate_mv += vp9_mv_bit_cost(&frame_mv[refs[1]].as_mv,
2080                                    &mbmi->ref_mvs[refs[1]][0].as_mv,
2081                                    x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
2082       }
2083       *rate2 += rate_mv;
2084     } else {
2085       int_mv tmp_mv;
2086       single_motion_search(cpi, x, bsize, mi_row, mi_col,
2087                            &tmp_mv, &rate_mv);
2088       if (tmp_mv.as_int == INVALID_MV)
2089         return INT64_MAX;
2090       *rate2 += rate_mv;
2091       frame_mv[refs[0]].as_int =
2092           xd->mi[0]->bmi[0].as_mv[0].as_int = tmp_mv.as_int;
2093       single_newmv[refs[0]].as_int = tmp_mv.as_int;
2094     }
2095   }
2096
2097   for (i = 0; i < is_comp_pred + 1; ++i) {
2098     cur_mv[i] = frame_mv[refs[i]];
2099     // Clip "next_nearest" so that it does not extend to far out of image
2100     if (this_mode != NEWMV)
2101       clamp_mv2(&cur_mv[i].as_mv, xd);
2102
2103     if (mv_check_bounds(x, &cur_mv[i].as_mv))
2104       return INT64_MAX;
2105     mbmi->mv[i].as_int = cur_mv[i].as_int;
2106   }
2107
2108   // do first prediction into the destination buffer. Do the next
2109   // prediction into a temporary buffer. Then keep track of which one
2110   // of these currently holds the best predictor, and use the other
2111   // one for future predictions. In the end, copy from tmp_buf to
2112   // dst if necessary.
2113   for (i = 0; i < MAX_MB_PLANE; i++) {
2114     orig_dst[i] = xd->plane[i].dst.buf;
2115     orig_dst_stride[i] = xd->plane[i].dst.stride;
2116   }
2117
2118   /* We don't include the cost of the second reference here, because there
2119    * are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other
2120    * words if you present them in that order, the second one is always known
2121    * if the first is known */
2122   *rate2 += cost_mv_ref(cpi, this_mode, mbmi->mode_context[refs[0]]);
2123
2124   pred_exists = 0;
2125   // Are all MVs integer pel for Y and UV
2126   intpel_mv = !mv_has_subpel(&mbmi->mv[0].as_mv);
2127   if (is_comp_pred)
2128     intpel_mv &= !mv_has_subpel(&mbmi->mv[1].as_mv);
2129
2130   // Search for best switchable filter by checking the variance of
2131   // pred error irrespective of whether the filter will be used
2132   rd_opt->mask_filter = 0;
2133   for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
2134     rd_opt->filter_cache[i] = INT64_MAX;
2135
2136   if (cm->interp_filter != BILINEAR) {
2137     if (x->source_variance < cpi->sf.disable_filter_search_var_thresh) {
2138       best_filter = EIGHTTAP;
2139     } else if (best_filter == SWITCHABLE) {
2140       int newbest;
2141       int tmp_rate_sum = 0;
2142       int64_t tmp_dist_sum = 0;
2143
2144       for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
2145         int j;
2146         int64_t rs_rd;
2147         mbmi->interp_filter = i;
2148         rs = vp9_get_switchable_rate(cpi);
2149         rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
2150
2151         if (i > 0 && intpel_mv) {
2152           rd = RDCOST(x->rdmult, x->rddiv, tmp_rate_sum, tmp_dist_sum);
2153           rd_opt->filter_cache[i] = rd;
2154           rd_opt->filter_cache[SWITCHABLE_FILTERS] =
2155               MIN(rd_opt->filter_cache[SWITCHABLE_FILTERS], rd + rs_rd);
2156           if (cm->interp_filter == SWITCHABLE)
2157             rd += rs_rd;
2158           rd_opt->mask_filter = MAX(rd_opt->mask_filter, rd);
2159         } else {
2160           int rate_sum = 0;
2161           int64_t dist_sum = 0;
2162           if ((cm->interp_filter == SWITCHABLE &&
2163                (!i || best_needs_copy)) ||
2164               (cm->interp_filter != SWITCHABLE &&
2165                (cm->interp_filter == mbmi->interp_filter ||
2166                 (i == 0 && intpel_mv)))) {
2167             restore_dst_buf(xd, orig_dst, orig_dst_stride);
2168           } else {
2169             for (j = 0; j < MAX_MB_PLANE; j++) {
2170               xd->plane[j].dst.buf = tmp_buf + j * 64 * 64;
2171               xd->plane[j].dst.stride = 64;
2172             }
2173           }
2174           vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
2175           model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum);
2176
2177           rd = RDCOST(x->rdmult, x->rddiv, rate_sum, dist_sum);
2178           rd_opt->filter_cache[i] = rd;
2179           rd_opt->filter_cache[SWITCHABLE_FILTERS] =
2180               MIN(rd_opt->filter_cache[SWITCHABLE_FILTERS], rd + rs_rd);
2181           if (cm->interp_filter == SWITCHABLE)
2182             rd += rs_rd;
2183           rd_opt->mask_filter = MAX(rd_opt->mask_filter, rd);
2184
2185           if (i == 0 && intpel_mv) {
2186             tmp_rate_sum = rate_sum;
2187             tmp_dist_sum = dist_sum;
2188           }
2189         }
2190
2191         if (i == 0 && cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
2192           if (rd / 2 > ref_best_rd) {
2193             restore_dst_buf(xd, orig_dst, orig_dst_stride);
2194             return INT64_MAX;
2195           }
2196         }
2197         newbest = i == 0 || rd < best_rd;
2198
2199         if (newbest) {
2200           best_rd = rd;
2201           best_filter = mbmi->interp_filter;
2202           if (cm->interp_filter == SWITCHABLE && i && !intpel_mv)
2203             best_needs_copy = !best_needs_copy;
2204         }
2205
2206         if ((cm->interp_filter == SWITCHABLE && newbest) ||
2207             (cm->interp_filter != SWITCHABLE &&
2208              cm->interp_filter == mbmi->interp_filter)) {
2209           pred_exists = 1;
2210         }
2211       }
2212       restore_dst_buf(xd, orig_dst, orig_dst_stride);
2213     }
2214   }
2215   // Set the appropriate filter
2216   mbmi->interp_filter = cm->interp_filter != SWITCHABLE ?
2217       cm->interp_filter : best_filter;
2218   rs = cm->interp_filter == SWITCHABLE ? vp9_get_switchable_rate(cpi) : 0;
2219
2220   if (pred_exists) {
2221     if (best_needs_copy) {
2222       // again temporarily set the buffers to local memory to prevent a memcpy
2223       for (i = 0; i < MAX_MB_PLANE; i++) {
2224         xd->plane[i].dst.buf = tmp_buf + i * 64 * 64;
2225         xd->plane[i].dst.stride = 64;
2226       }
2227     }
2228   } else {
2229     // Handles the special case when a filter that is not in the
2230     // switchable list (ex. bilinear, 6-tap) is indicated at the frame level
2231     vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
2232   }
2233
2234   if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
2235     int tmp_rate;
2236     int64_t tmp_dist;
2237     model_rd_for_sb(cpi, bsize, x, xd, &tmp_rate, &tmp_dist);
2238     rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist);
2239     // if current pred_error modeled rd is substantially more than the best
2240     // so far, do not bother doing full rd
2241     if (rd / 2 > ref_best_rd) {
2242       restore_dst_buf(xd, orig_dst, orig_dst_stride);
2243       return INT64_MAX;
2244     }
2245   }
2246
2247   if (cm->interp_filter == SWITCHABLE)
2248     *rate2 += vp9_get_switchable_rate(cpi);
2249
2250   if (!is_comp_pred) {
2251     if (cpi->allow_encode_breakout) {
2252       const BLOCK_SIZE y_size = get_plane_block_size(bsize, &xd->plane[0]);
2253       const BLOCK_SIZE uv_size = get_plane_block_size(bsize, &xd->plane[1]);
2254       unsigned int var, sse;
2255       // Skipping threshold for ac.
2256       unsigned int thresh_ac;
2257       // Skipping threshold for dc
2258       unsigned int thresh_dc;
2259
2260       var = cpi->fn_ptr[y_size].vf(x->plane[0].src.buf, x->plane[0].src.stride,
2261                                    xd->plane[0].dst.buf,
2262                                    xd->plane[0].dst.stride, &sse);
2263
2264       if (x->encode_breakout > 0) {
2265         // Set a maximum for threshold to avoid big PSNR loss in low bitrate
2266         // case. Use extreme low threshold for static frames to limit skipping.
2267         const unsigned int max_thresh = (cpi->allow_encode_breakout ==
2268                                         ENCODE_BREAKOUT_LIMITED) ? 128 : 36000;
2269         // The encode_breakout input
2270         const unsigned int min_thresh =
2271             MIN(((unsigned int)x->encode_breakout << 4), max_thresh);
2272
2273         // Calculate threshold according to dequant value.
2274         thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) / 9;
2275         thresh_ac = clamp(thresh_ac, min_thresh, max_thresh);
2276
2277         // Adjust threshold according to partition size.
2278         thresh_ac >>= 8 - (b_width_log2(bsize) +
2279             b_height_log2(bsize));
2280         thresh_dc = (xd->plane[0].dequant[0] * xd->plane[0].dequant[0] >> 6);
2281       } else {
2282         thresh_ac = 0;
2283         thresh_dc = 0;
2284       }
2285
2286       // Y skipping condition checking
2287       if (sse < thresh_ac || sse == 0) {
2288         // dc skipping checking
2289         if ((sse - var) < thresh_dc || sse == var) {
2290           unsigned int sse_u, sse_v;
2291           unsigned int var_u, var_v;
2292
2293           var_u = cpi->fn_ptr[uv_size].vf(x->plane[1].src.buf,
2294                                           x->plane[1].src.stride,
2295                                           xd->plane[1].dst.buf,
2296                                           xd->plane[1].dst.stride, &sse_u);
2297
2298           // U skipping condition checking
2299           if ((sse_u * 4 < thresh_ac || sse_u == 0) &&
2300               (sse_u - var_u < thresh_dc || sse_u == var_u)) {
2301             var_v = cpi->fn_ptr[uv_size].vf(x->plane[2].src.buf,
2302                                             x->plane[2].src.stride,
2303                                             xd->plane[2].dst.buf,
2304                                             xd->plane[2].dst.stride, &sse_v);
2305
2306             // V skipping condition checking
2307             if ((sse_v * 4 < thresh_ac || sse_v == 0) &&
2308                 (sse_v - var_v < thresh_dc || sse_v == var_v)) {
2309               x->skip = 1;
2310
2311               // The cost of skip bit needs to be added.
2312               *rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
2313
2314               // Scaling factor for SSE from spatial domain to frequency domain
2315               // is 16. Adjust distortion accordingly.
2316               *distortion_uv = (sse_u + sse_v) << 4;
2317               *distortion = (sse << 4) + *distortion_uv;
2318
2319               *disable_skip = 1;
2320               this_rd = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion);
2321             }
2322           }
2323         }
2324       }
2325     }
2326   }
2327
2328   if (!x->skip) {
2329     int skippable_y, skippable_uv;
2330     int64_t sseuv = INT64_MAX;
2331     int64_t rdcosty = INT64_MAX;
2332
2333     // Y cost and distortion
2334     inter_super_block_yrd(cpi, x, rate_y, distortion_y, &skippable_y, psse,
2335                           bsize, txfm_cache, ref_best_rd);
2336
2337     if (*rate_y == INT_MAX) {
2338       *rate2 = INT_MAX;
2339       *distortion = INT64_MAX;
2340       restore_dst_buf(xd, orig_dst, orig_dst_stride);
2341       return INT64_MAX;
2342     }
2343
2344     *rate2 += *rate_y;
2345     *distortion += *distortion_y;
2346
2347     rdcosty = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion);
2348     rdcosty = MIN(rdcosty, RDCOST(x->rdmult, x->rddiv, 0, *psse));
2349
2350     super_block_uvrd(cpi, x, rate_uv, distortion_uv, &skippable_uv, &sseuv,
2351                      bsize, ref_best_rd - rdcosty);
2352     if (*rate_uv == INT_MAX) {
2353       *rate2 = INT_MAX;
2354       *distortion = INT64_MAX;
2355       restore_dst_buf(xd, orig_dst, orig_dst_stride);
2356       return INT64_MAX;
2357     }
2358
2359     *psse += sseuv;
2360     *rate2 += *rate_uv;
2361     *distortion += *distortion_uv;
2362     *skippable = skippable_y && skippable_uv;
2363   }
2364
2365   restore_dst_buf(xd, orig_dst, orig_dst_stride);
2366   return this_rd;  // if 0, this will be re-calculated by caller
2367 }
2368
2369 void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
2370                                int *returnrate, int64_t *returndist,
2371                                BLOCK_SIZE bsize,
2372                                PICK_MODE_CONTEXT *ctx, int64_t best_rd) {
2373   VP9_COMMON *const cm = &cpi->common;
2374   MACROBLOCKD *const xd = &x->e_mbd;
2375   struct macroblockd_plane *const pd = xd->plane;
2376   int rate_y = 0, rate_uv = 0, rate_y_tokenonly = 0, rate_uv_tokenonly = 0;
2377   int y_skip = 0, uv_skip = 0;
2378   int64_t dist_y = 0, dist_uv = 0, tx_cache[TX_MODES] = { 0 };
2379   TX_SIZE max_uv_tx_size;
2380   x->skip_encode = 0;
2381   ctx->skip = 0;
2382   xd->mi[0]->mbmi.ref_frame[0] = INTRA_FRAME;
2383
2384   if (bsize >= BLOCK_8X8) {
2385     if (rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,
2386                                &dist_y, &y_skip, bsize, tx_cache,
2387                                best_rd) >= best_rd) {
2388       *returnrate = INT_MAX;
2389       return;
2390     }
2391     max_uv_tx_size = get_uv_tx_size_impl(xd->mi[0]->mbmi.tx_size, bsize,
2392                                          pd[1].subsampling_x,
2393                                          pd[1].subsampling_y);
2394     rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv, &rate_uv_tokenonly,
2395                             &dist_uv, &uv_skip, bsize, max_uv_tx_size);
2396   } else {
2397     y_skip = 0;
2398     if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate_y, &rate_y_tokenonly,
2399                                      &dist_y, best_rd) >= best_rd) {
2400       *returnrate = INT_MAX;
2401       return;
2402     }
2403     max_uv_tx_size = get_uv_tx_size_impl(xd->mi[0]->mbmi.tx_size, bsize,
2404                                          pd[1].subsampling_x,
2405                                          pd[1].subsampling_y);
2406     rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv, &rate_uv_tokenonly,
2407                             &dist_uv, &uv_skip, BLOCK_8X8, max_uv_tx_size);
2408   }
2409
2410   if (y_skip && uv_skip) {
2411     *returnrate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly +
2412                   vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
2413     *returndist = dist_y + dist_uv;
2414     vp9_zero(ctx->tx_rd_diff);
2415   } else {
2416     int i;
2417     *returnrate = rate_y + rate_uv + vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
2418     *returndist = dist_y + dist_uv;
2419     if (cpi->sf.tx_size_search_method == USE_FULL_RD)
2420       for (i = 0; i < TX_MODES; i++) {
2421         if (tx_cache[i] < INT64_MAX && tx_cache[cm->tx_mode] < INT64_MAX)
2422           ctx->tx_rd_diff[i] = tx_cache[i] - tx_cache[cm->tx_mode];
2423         else
2424           ctx->tx_rd_diff[i] = 0;
2425       }
2426   }
2427
2428   ctx->mic = *xd->mi[0];
2429 }
2430
2431 // Updating rd_thresh_freq_fact[] here means that the different
2432 // partition/block sizes are handled independently based on the best
2433 // choice for the current partition. It may well be better to keep a scaled
2434 // best rd so far value and update rd_thresh_freq_fact based on the mode/size
2435 // combination that wins out.
2436 static void update_rd_thresh_fact(VP9_COMP *cpi, int bsize,
2437                                   int best_mode_index) {
2438   if (cpi->sf.adaptive_rd_thresh > 0) {
2439     const int top_mode = bsize < BLOCK_8X8 ? MAX_REFS : MAX_MODES;
2440     int mode;
2441     for (mode = 0; mode < top_mode; ++mode) {
2442       int *const fact = &cpi->rd.thresh_freq_fact[bsize][mode];
2443
2444       if (mode == best_mode_index) {
2445         *fact -= (*fact >> 3);
2446       } else {
2447         *fact = MIN(*fact + RD_THRESH_INC,
2448                     cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT);
2449       }
2450     }
2451   }
2452 }
2453
2454 int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
2455                                   const TileInfo *const tile,
2456                                   int mi_row, int mi_col,
2457                                   int *returnrate,
2458                                   int64_t *returndistortion,
2459                                   BLOCK_SIZE bsize,
2460                                   PICK_MODE_CONTEXT *ctx,
2461                                   int64_t best_rd_so_far) {
2462   VP9_COMMON *const cm = &cpi->common;
2463   RD_OPT *const rd_opt = &cpi->rd;
2464   MACROBLOCKD *const xd = &x->e_mbd;
2465   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
2466   const struct segmentation *const seg = &cm->seg;
2467   struct macroblockd_plane *const pd = xd->plane;
2468   PREDICTION_MODE this_mode;
2469   MV_REFERENCE_FRAME ref_frame, second_ref_frame;
2470   unsigned char segment_id = mbmi->segment_id;
2471   int comp_pred, i;
2472   int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
2473   struct buf_2d yv12_mb[4][MAX_MB_PLANE];
2474   int_mv single_newmv[MAX_REF_FRAMES] = { { 0 } };
2475   static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
2476                                     VP9_ALT_FLAG };
2477   int64_t best_rd = best_rd_so_far;
2478   int64_t best_tx_rd[TX_MODES];
2479   int64_t best_tx_diff[TX_MODES];
2480   int64_t best_pred_diff[REFERENCE_MODES];
2481   int64_t best_pred_rd[REFERENCE_MODES];
2482   int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
2483   int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
2484   MB_MODE_INFO best_mbmode;
2485   int mode_index, best_mode_index = -1;
2486   unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
2487   vp9_prob comp_mode_p;
2488   int64_t best_intra_rd = INT64_MAX;
2489   int64_t best_inter_rd = INT64_MAX;
2490   PREDICTION_MODE best_intra_mode = DC_PRED;
2491   MV_REFERENCE_FRAME best_inter_ref_frame = LAST_FRAME;
2492   int rate_uv_intra[TX_SIZES], rate_uv_tokenonly[TX_SIZES];
2493   int64_t dist_uv[TX_SIZES];
2494   int skip_uv[TX_SIZES];
2495   PREDICTION_MODE mode_uv[TX_SIZES];
2496   int64_t mode_distortions[MB_MODE_COUNT] = {-1};
2497   int intra_cost_penalty = 20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q);
2498   const int bws = num_8x8_blocks_wide_lookup[bsize] / 2;
2499   const int bhs = num_8x8_blocks_high_lookup[bsize] / 2;
2500   int best_skip2 = 0;
2501   int mode_skip_mask = 0;
2502   int mode_skip_start = cpi->sf.mode_skip_start + 1;
2503   const int *const rd_threshes = rd_opt->threshes[segment_id][bsize];
2504   const int *const rd_thresh_freq_fact = rd_opt->thresh_freq_fact[bsize];
2505   const int mode_search_skip_flags = cpi->sf.mode_search_skip_flags;
2506   const int intra_y_mode_mask =
2507       cpi->sf.intra_y_mode_mask[max_txsize_lookup[bsize]];
2508   int inter_mode_mask = cpi->sf.inter_mode_mask[bsize];
2509   vp9_zero(best_mbmode);
2510   x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
2511
2512   estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp,
2513                            &comp_mode_p);
2514
2515   for (i = 0; i < REFERENCE_MODES; ++i)
2516     best_pred_rd[i] = INT64_MAX;
2517   for (i = 0; i < TX_MODES; i++)
2518     best_tx_rd[i] = INT64_MAX;
2519   for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
2520     best_filter_rd[i] = INT64_MAX;
2521   for (i = 0; i < TX_SIZES; i++)
2522     rate_uv_intra[i] = INT_MAX;
2523   for (i = 0; i < MAX_REF_FRAMES; ++i)
2524     x->pred_sse[i] = INT_MAX;
2525
2526   *returnrate = INT_MAX;
2527
2528   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
2529     x->pred_mv_sad[ref_frame] = INT_MAX;
2530     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
2531       setup_buffer_inter(cpi, x, tile,
2532                              ref_frame, bsize, mi_row, mi_col,
2533                              frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb);
2534     }
2535     frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
2536     frame_mv[ZEROMV][ref_frame].as_int = 0;
2537   }
2538
2539   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
2540     // All modes from vp9_mode_order that use this frame as any ref
2541     static const int ref_frame_mask_all[] = {
2542         0x0, 0x123291, 0x25c444, 0x39b722
2543     };
2544     // Fixed mv modes (NEARESTMV, NEARMV, ZEROMV) from vp9_mode_order that use
2545     // this frame as their primary ref
2546     static const int ref_frame_mask_fixedmv[] = {
2547         0x0, 0x121281, 0x24c404, 0x080102
2548     };
2549     if (!(cpi->ref_frame_flags & flag_list[ref_frame])) {
2550       // Skip modes for missing references
2551       mode_skip_mask |= ref_frame_mask_all[ref_frame];
2552     } else if (cpi->sf.reference_masking) {
2553       for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
2554         // Skip fixed mv modes for poor references
2555         if ((x->pred_mv_sad[ref_frame] >> 2) > x->pred_mv_sad[i]) {
2556           mode_skip_mask |= ref_frame_mask_fixedmv[ref_frame];
2557           break;
2558         }
2559       }
2560     }
2561     // If the segment reference frame feature is enabled....
2562     // then do nothing if the current ref frame is not allowed..
2563     if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
2564         vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) {
2565       mode_skip_mask |= ref_frame_mask_all[ref_frame];
2566     }
2567   }
2568
2569   // Disable this drop out case if the ref frame
2570   // segment level feature is enabled for this segment. This is to
2571   // prevent the possibility that we end up unable to pick any mode.
2572   if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
2573     // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
2574     // unless ARNR filtering is enabled in which case we want
2575     // an unfiltered alternative. We allow near/nearest as well
2576     // because they may result in zero-zero MVs but be cheaper.
2577     if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
2578       mode_skip_mask =
2579           ~((1 << THR_NEARESTA) | (1 << THR_NEARA) | (1 << THR_ZEROA));
2580       if (frame_mv[NEARMV][ALTREF_FRAME].as_int != 0)
2581         mode_skip_mask |= (1 << THR_NEARA);
2582       if (frame_mv[NEARESTMV][ALTREF_FRAME].as_int != 0)
2583         mode_skip_mask |= (1 << THR_NEARESTA);
2584     }
2585   }
2586
2587   // TODO(JBB): This is to make up for the fact that we don't have sad
2588   // functions that work when the block size reads outside the umv.  We
2589   // should fix this either by making the motion search just work on
2590   // a representative block in the boundary ( first ) and then implement a
2591   // function that does sads when inside the border..
2592   if ((mi_row + bhs) > cm->mi_rows || (mi_col + bws) > cm->mi_cols) {
2593     const int new_modes_mask =
2594         (1 << THR_NEWMV) | (1 << THR_NEWG) | (1 << THR_NEWA) |
2595         (1 << THR_COMP_NEWLA) | (1 << THR_COMP_NEWGA);
2596     mode_skip_mask |= new_modes_mask;
2597   }
2598
2599   if (bsize > cpi->sf.max_intra_bsize) {
2600     const int all_intra_modes = (1 << THR_DC) | (1 << THR_TM) |
2601         (1 << THR_H_PRED) | (1 << THR_V_PRED) | (1 << THR_D135_PRED) |
2602         (1 << THR_D207_PRED) | (1 << THR_D153_PRED) | (1 << THR_D63_PRED) |
2603         (1 << THR_D117_PRED) | (1 << THR_D45_PRED);
2604     mode_skip_mask |= all_intra_modes;
2605   }
2606
2607   for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
2608     int mode_excluded = 0;
2609     int64_t this_rd = INT64_MAX;
2610     int disable_skip = 0;
2611     int compmode_cost = 0;
2612     int rate2 = 0, rate_y = 0, rate_uv = 0;
2613     int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0;
2614     int skippable = 0;
2615     int64_t tx_cache[TX_MODES];
2616     int i;
2617     int this_skip2 = 0;
2618     int64_t total_sse = INT64_MAX;
2619     int early_term = 0;
2620
2621     // Look at the reference frame of the best mode so far and set the
2622     // skip mask to look at a subset of the remaining modes.
2623     if (mode_index == mode_skip_start && best_mode_index >= 0) {
2624       switch (vp9_mode_order[best_mode_index].ref_frame[0]) {
2625         case INTRA_FRAME:
2626           break;
2627         case LAST_FRAME:
2628           mode_skip_mask |= LAST_FRAME_MODE_MASK;
2629           break;
2630         case GOLDEN_FRAME:
2631           mode_skip_mask |= GOLDEN_FRAME_MODE_MASK;
2632           break;
2633         case ALTREF_FRAME:
2634           mode_skip_mask |= ALT_REF_MODE_MASK;
2635           break;
2636         case NONE:
2637         case MAX_REF_FRAMES:
2638           assert(0 && "Invalid Reference frame");
2639       }
2640     }
2641     if (mode_skip_mask & (1 << mode_index))
2642       continue;
2643
2644     // Test best rd so far against threshold for trying this mode.
2645     if (rd_less_than_thresh(best_rd, rd_threshes[mode_index],
2646                             rd_thresh_freq_fact[mode_index]))
2647       continue;
2648
2649     this_mode = vp9_mode_order[mode_index].mode;
2650     ref_frame = vp9_mode_order[mode_index].ref_frame[0];
2651     if (ref_frame != INTRA_FRAME && !(inter_mode_mask & (1 << this_mode)))
2652       continue;
2653     second_ref_frame = vp9_mode_order[mode_index].ref_frame[1];
2654
2655     comp_pred = second_ref_frame > INTRA_FRAME;
2656     if (comp_pred) {
2657       if ((mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) &&
2658           best_mode_index >=0 &&
2659           vp9_mode_order[best_mode_index].ref_frame[0] == INTRA_FRAME)
2660         continue;
2661       if ((mode_search_skip_flags & FLAG_SKIP_COMP_REFMISMATCH) &&
2662           ref_frame != best_inter_ref_frame &&
2663           second_ref_frame != best_inter_ref_frame)
2664         continue;
2665       mode_excluded = cm->reference_mode == SINGLE_REFERENCE;
2666     } else {
2667       if (ref_frame != INTRA_FRAME)
2668         mode_excluded = cm->reference_mode == COMPOUND_REFERENCE;
2669     }
2670
2671     if (ref_frame == INTRA_FRAME) {
2672       if (!(intra_y_mode_mask & (1 << this_mode)))
2673         continue;
2674       if (this_mode != DC_PRED) {
2675         // Disable intra modes other than DC_PRED for blocks with low variance
2676         // Threshold for intra skipping based on source variance
2677         // TODO(debargha): Specialize the threshold for super block sizes
2678         const unsigned int skip_intra_var_thresh = 64;
2679         if ((mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) &&
2680             x->source_variance < skip_intra_var_thresh)
2681           continue;
2682         // Only search the oblique modes if the best so far is
2683         // one of the neighboring directional modes
2684         if ((mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
2685             (this_mode >= D45_PRED && this_mode <= TM_PRED)) {
2686           if (best_mode_index >= 0 &&
2687               vp9_mode_order[best_mode_index].ref_frame[0] > INTRA_FRAME)
2688             continue;
2689         }
2690         if (mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
2691           if (conditional_skipintra(this_mode, best_intra_mode))
2692               continue;
2693         }
2694       }
2695     } else {
2696       const MV_REFERENCE_FRAME ref_frames[2] = {ref_frame, second_ref_frame};
2697       if (!check_best_zero_mv(cpi, mbmi->mode_context, frame_mv,
2698                               inter_mode_mask, this_mode, ref_frames))
2699         continue;
2700     }
2701
2702     mbmi->mode = this_mode;
2703     mbmi->uv_mode = DC_PRED;
2704     mbmi->ref_frame[0] = ref_frame;
2705     mbmi->ref_frame[1] = second_ref_frame;
2706     // Evaluate all sub-pel filters irrespective of whether we can use
2707     // them for this frame.
2708     mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP
2709                                                           : cm->interp_filter;
2710     x->skip = 0;
2711     set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
2712
2713     // Select prediction reference frames.
2714     for (i = 0; i < MAX_MB_PLANE; i++) {
2715       xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
2716       if (comp_pred)
2717         xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
2718     }
2719
2720     for (i = 0; i < TX_MODES; ++i)
2721       tx_cache[i] = INT64_MAX;
2722
2723     if (ref_frame == INTRA_FRAME) {
2724       TX_SIZE uv_tx;
2725       intra_super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL,
2726                             bsize, tx_cache, best_rd);
2727
2728       if (rate_y == INT_MAX)
2729         continue;
2730
2731       uv_tx = get_uv_tx_size_impl(mbmi->tx_size, bsize, pd[1].subsampling_x,
2732                                   pd[1].subsampling_y);
2733       if (rate_uv_intra[uv_tx] == INT_MAX) {
2734         choose_intra_uv_mode(cpi, ctx, bsize, uv_tx,
2735                              &rate_uv_intra[uv_tx], &rate_uv_tokenonly[uv_tx],
2736                              &dist_uv[uv_tx], &skip_uv[uv_tx], &mode_uv[uv_tx]);
2737       }
2738
2739       rate_uv = rate_uv_tokenonly[uv_tx];
2740       distortion_uv = dist_uv[uv_tx];
2741       skippable = skippable && skip_uv[uv_tx];
2742       mbmi->uv_mode = mode_uv[uv_tx];
2743
2744       rate2 = rate_y + cpi->mbmode_cost[mbmi->mode] + rate_uv_intra[uv_tx];
2745       if (this_mode != DC_PRED && this_mode != TM_PRED)
2746         rate2 += intra_cost_penalty;
2747       distortion2 = distortion_y + distortion_uv;
2748     } else {
2749       this_rd = handle_inter_mode(cpi, x, bsize,
2750                                   tx_cache,
2751                                   &rate2, &distortion2, &skippable,
2752                                   &rate_y, &distortion_y,
2753                                   &rate_uv, &distortion_uv,
2754                                   &disable_skip, frame_mv,
2755                                   mi_row, mi_col,
2756                                   single_newmv, &total_sse, best_rd);
2757       if (this_rd == INT64_MAX)
2758         continue;
2759
2760       compmode_cost = vp9_cost_bit(comp_mode_p, comp_pred);
2761
2762       if (cm->reference_mode == REFERENCE_MODE_SELECT)
2763         rate2 += compmode_cost;
2764     }
2765
2766     // Estimate the reference frame signaling cost and add it
2767     // to the rolling cost variable.
2768     if (comp_pred) {
2769       rate2 += ref_costs_comp[ref_frame];
2770     } else {
2771       rate2 += ref_costs_single[ref_frame];
2772     }
2773
2774     if (!disable_skip) {
2775       if (skippable) {
2776         vp9_prob skip_prob = vp9_get_skip_prob(cm, xd);
2777
2778         // Back out the coefficient coding costs
2779         rate2 -= (rate_y + rate_uv);
2780         // for best yrd calculation
2781         rate_uv = 0;
2782
2783         // Cost the skip mb case
2784         if (skip_prob) {
2785           int prob_skip_cost = vp9_cost_bit(skip_prob, 1);
2786           rate2 += prob_skip_cost;
2787         }
2788       } else if (ref_frame != INTRA_FRAME && !xd->lossless) {
2789         if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
2790             RDCOST(x->rdmult, x->rddiv, 0, total_sse)) {
2791           // Add in the cost of the no skip flag.
2792           rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
2793         } else {
2794           // FIXME(rbultje) make this work for splitmv also
2795           rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
2796           distortion2 = total_sse;
2797           assert(total_sse >= 0);
2798           rate2 -= (rate_y + rate_uv);
2799           rate_y = 0;
2800           rate_uv = 0;
2801           this_skip2 = 1;
2802         }
2803       } else {
2804         // Add in the cost of the no skip flag.
2805         rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
2806       }
2807
2808       // Calculate the final RD estimate for this mode.
2809       this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
2810     }
2811
2812     if (ref_frame == INTRA_FRAME) {
2813     // Keep record of best intra rd
2814       if (this_rd < best_intra_rd) {
2815         best_intra_rd = this_rd;
2816         best_intra_mode = mbmi->mode;
2817       }
2818     } else {
2819       // Keep record of best inter rd with single reference
2820       if (!comp_pred && !mode_excluded && this_rd < best_inter_rd) {
2821         best_inter_rd = this_rd;
2822         best_inter_ref_frame = ref_frame;
2823       }
2824     }
2825
2826     if (!disable_skip && ref_frame == INTRA_FRAME) {
2827       for (i = 0; i < REFERENCE_MODES; ++i)
2828         best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
2829       for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
2830         best_filter_rd[i] = MIN(best_filter_rd[i], this_rd);
2831     }
2832
2833     // Store the respective mode distortions for later use.
2834     if (mode_distortions[this_mode] == -1
2835         || distortion2 < mode_distortions[this_mode]) {
2836       mode_distortions[this_mode] = distortion2;
2837     }
2838
2839     // Did this mode help.. i.e. is it the new best mode
2840     if (this_rd < best_rd || x->skip) {
2841       int max_plane = MAX_MB_PLANE;
2842       if (!mode_excluded) {
2843         // Note index of best mode so far
2844         best_mode_index = mode_index;
2845
2846         if (ref_frame == INTRA_FRAME) {
2847           /* required for left and above block mv */
2848           mbmi->mv[0].as_int = 0;
2849           max_plane = 1;
2850         }
2851
2852         *returnrate = rate2;
2853         *returndistortion = distortion2;
2854         best_rd = this_rd;
2855         best_mbmode = *mbmi;
2856         best_skip2 = this_skip2;
2857         if (!x->select_tx_size)
2858           swap_block_ptr(x, ctx, 1, 0, 0, max_plane);
2859         vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
2860                    sizeof(uint8_t) * ctx->num_4x4_blk);
2861
2862         // TODO(debargha): enhance this test with a better distortion prediction
2863         // based on qp, activity mask and history
2864         if ((mode_search_skip_flags & FLAG_EARLY_TERMINATE) &&
2865             (mode_index > MIN_EARLY_TERM_INDEX)) {
2866           const int qstep = xd->plane[0].dequant[1];
2867           // TODO(debargha): Enhance this by specializing for each mode_index
2868           int scale = 4;
2869           if (x->source_variance < UINT_MAX) {
2870             const int var_adjust = (x->source_variance < 16);
2871             scale -= var_adjust;
2872           }
2873           if (ref_frame > INTRA_FRAME &&
2874               distortion2 * scale < qstep * qstep) {
2875             early_term = 1;
2876           }
2877         }
2878       }
2879     }
2880
2881     /* keep record of best compound/single-only prediction */
2882     if (!disable_skip && ref_frame != INTRA_FRAME) {
2883       int64_t single_rd, hybrid_rd, single_rate, hybrid_rate;
2884
2885       if (cm->reference_mode == REFERENCE_MODE_SELECT) {
2886         single_rate = rate2 - compmode_cost;
2887         hybrid_rate = rate2;
2888       } else {
2889         single_rate = rate2;
2890         hybrid_rate = rate2 + compmode_cost;
2891       }
2892
2893       single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
2894       hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
2895
2896       if (!comp_pred) {
2897         if (single_rd < best_pred_rd[SINGLE_REFERENCE]) {
2898           best_pred_rd[SINGLE_REFERENCE] = single_rd;
2899         }
2900       } else {
2901         if (single_rd < best_pred_rd[COMPOUND_REFERENCE]) {
2902           best_pred_rd[COMPOUND_REFERENCE] = single_rd;
2903         }
2904       }
2905       if (hybrid_rd < best_pred_rd[REFERENCE_MODE_SELECT])
2906         best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
2907
2908       /* keep record of best filter type */
2909       if (!mode_excluded && cm->interp_filter != BILINEAR) {
2910         int64_t ref = rd_opt->filter_cache[cm->interp_filter == SWITCHABLE ?
2911                               SWITCHABLE_FILTERS : cm->interp_filter];
2912
2913         for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
2914           int64_t adj_rd;
2915           if (ref == INT64_MAX)
2916             adj_rd = 0;
2917           else if (rd_opt->filter_cache[i] == INT64_MAX)
2918             // when early termination is triggered, the encoder does not have
2919             // access to the rate-distortion cost. it only knows that the cost
2920             // should be above the maximum valid value. hence it takes the known
2921             // maximum plus an arbitrary constant as the rate-distortion cost.
2922             adj_rd = rd_opt->mask_filter - ref + 10;
2923           else
2924             adj_rd = rd_opt->filter_cache[i] - ref;
2925
2926           adj_rd += this_rd;
2927           best_filter_rd[i] = MIN(best_filter_rd[i], adj_rd);
2928         }
2929       }
2930     }
2931
2932     /* keep record of best txfm size */
2933     if (bsize < BLOCK_32X32) {
2934       if (bsize < BLOCK_16X16)
2935         tx_cache[ALLOW_16X16] = tx_cache[ALLOW_8X8];
2936
2937       tx_cache[ALLOW_32X32] = tx_cache[ALLOW_16X16];
2938     }
2939     if (!mode_excluded && this_rd != INT64_MAX) {
2940       for (i = 0; i < TX_MODES && tx_cache[i] < INT64_MAX; i++) {
2941         int64_t adj_rd = INT64_MAX;
2942         adj_rd = this_rd + tx_cache[i] - tx_cache[cm->tx_mode];
2943
2944         if (adj_rd < best_tx_rd[i])
2945           best_tx_rd[i] = adj_rd;
2946       }
2947     }
2948
2949     if (early_term)
2950       break;
2951
2952     if (x->skip && !comp_pred)
2953       break;
2954   }
2955
2956   if (best_mode_index < 0 || best_rd >= best_rd_so_far)
2957     return INT64_MAX;
2958
2959   // If we used an estimate for the uv intra rd in the loop above...
2960   if (cpi->sf.use_uv_intra_rd_estimate) {
2961     // Do Intra UV best rd mode selection if best mode choice above was intra.
2962     if (vp9_mode_order[best_mode_index].ref_frame[0] == INTRA_FRAME) {
2963       TX_SIZE uv_tx_size;
2964       *mbmi = best_mbmode;
2965       uv_tx_size = get_uv_tx_size(mbmi, &xd->plane[1]);
2966       rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv_intra[uv_tx_size],
2967                               &rate_uv_tokenonly[uv_tx_size],
2968                               &dist_uv[uv_tx_size],
2969                               &skip_uv[uv_tx_size],
2970                               bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize,
2971                               uv_tx_size);
2972     }
2973   }
2974
2975   assert((cm->interp_filter == SWITCHABLE) ||
2976          (cm->interp_filter == best_mbmode.interp_filter) ||
2977          !is_inter_block(&best_mbmode));
2978
2979   update_rd_thresh_fact(cpi, bsize, best_mode_index);
2980
2981   // macroblock modes
2982   *mbmi = best_mbmode;
2983   x->skip |= best_skip2;
2984
2985   for (i = 0; i < REFERENCE_MODES; ++i) {
2986     if (best_pred_rd[i] == INT64_MAX)
2987       best_pred_diff[i] = INT_MIN;
2988     else
2989       best_pred_diff[i] = best_rd - best_pred_rd[i];
2990   }
2991
2992   if (!x->skip) {
2993     for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
2994       if (best_filter_rd[i] == INT64_MAX)
2995         best_filter_diff[i] = 0;
2996       else
2997         best_filter_diff[i] = best_rd - best_filter_rd[i];
2998     }
2999     if (cm->interp_filter == SWITCHABLE)
3000       assert(best_filter_diff[SWITCHABLE_FILTERS] == 0);
3001     for (i = 0; i < TX_MODES; i++) {
3002       if (best_tx_rd[i] == INT64_MAX)
3003         best_tx_diff[i] = 0;
3004       else
3005         best_tx_diff[i] = best_rd - best_tx_rd[i];
3006     }
3007   } else {
3008     vp9_zero(best_filter_diff);
3009     vp9_zero(best_tx_diff);
3010   }
3011
3012   set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
3013   store_coding_context(x, ctx, best_mode_index,
3014                        best_pred_diff, best_tx_diff, best_filter_diff);
3015
3016   return best_rd;
3017 }
3018
3019 int64_t vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi, MACROBLOCK *x,
3020                                            int *returnrate,
3021                                            int64_t *returndistortion,
3022                                            BLOCK_SIZE bsize,
3023                                            PICK_MODE_CONTEXT *ctx,
3024                                            int64_t best_rd_so_far) {
       // Mode decision for a block whose segment has SEG_LVL_SKIP active (this
       // is asserted below). No mode search is run: the block is forced to
       // ZEROMV on LAST_FRAME with x->skip = 1, the distortion is taken as 0,
       // and only signalling rate is costed (interpolation filter, compound
       // mode bit, reference frame).
       //
       // Outputs:
       //   *returnrate       - the signalling rate computed here.
       //   *returndistortion - always 0.
       // Returns the RD cost of that rate/distortion pair, or INT64_MAX when
       // the cost is not below best_rd_so_far. In the INT64_MAX case the
       // coding context is not stored, but *returnrate / *returndistortion
       // have already been written.
3025   VP9_COMMON *const cm = &cpi->common;
3026   RD_OPT *const rd_opt = &cpi->rd;
3027   MACROBLOCKD *const xd = &x->e_mbd;
3028   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
3029   unsigned char segment_id = mbmi->segment_id;
       // Only a single reference is used on this path, so the compound flag
       // is costed as 0 further down.
3030   const int comp_pred = 0;
3031   int i;
       // All three diff arrays are zeroed before being stored in the context.
3032   int64_t best_tx_diff[TX_MODES];
3033   int64_t best_pred_diff[REFERENCE_MODES];
3034   int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
3035   unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
3036   vp9_prob comp_mode_p;
3037   INTERP_FILTER best_filter = SWITCHABLE;
3038   int64_t this_rd = INT64_MAX;
3039   int rate2 = 0;
3040   const int64_t distortion2 = 0;
3041
3042   x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
3043
3044   estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp,
3045                            &comp_mode_p);
3046
3047   for (i = 0; i < MAX_REF_FRAMES; ++i)
3048     x->pred_sse[i] = INT_MAX;
3049   for (i = LAST_FRAME; i < MAX_REF_FRAMES; ++i)
3050     x->pred_mv_sad[i] = INT_MAX;
3051
       // Provisional value; replaced with the real rate once it is known.
3052   *returnrate = INT_MAX;
3053
3054   assert(vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP));
3055
       // Fixed mode: zero motion from LAST_FRAME, no second reference, skipped.
3056   mbmi->mode = ZEROMV;
3057   mbmi->uv_mode = DC_PRED;
3058   mbmi->ref_frame[0] = LAST_FRAME;
3059   mbmi->ref_frame[1] = NONE;
3060   mbmi->mv[0].as_int = 0;
3061   x->skip = 1;
3062
3063   // Search for best switchable filter by checking the variance of
3064   // pred error irrespective of whether the filter will be used
       // NOTE(review): in this function the search below compares only the
       // values returned by vp9_get_switchable_rate(); no prediction error is
       // evaluated here.
3065   rd_opt->mask_filter = 0;
3066   for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
3067     rd_opt->filter_cache[i] = INT64_MAX;
3068
3069   if (cm->interp_filter != BILINEAR) {
3070     best_filter = EIGHTTAP;
         // The per-filter comparison only runs for SWITCHABLE frames whose
         // source variance reaches the speed-feature threshold; otherwise
         // EIGHTTAP is kept.
3071     if (cm->interp_filter == SWITCHABLE &&
3072         x->source_variance >= cpi->sf.disable_filter_search_var_thresh) {
3073       int rs;
3074       int best_rs = INT_MAX;
3075       for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
             // The candidate is written to mbmi before the call; presumably
             // vp9_get_switchable_rate() reads it from there (confirm).
3076         mbmi->interp_filter = i;
3077         rs = vp9_get_switchable_rate(cpi);
3078         if (rs < best_rs) {
3079           best_rs = rs;
3080           best_filter = mbmi->interp_filter;
3081         }
3082       }
3083     }
3084   }
3085   // Set the appropriate filter
       // The filter rate is only added when the frame-level filter is
       // SWITCHABLE; otherwise the frame-level filter is used as is.
3086   if (cm->interp_filter == SWITCHABLE) {
3087     mbmi->interp_filter = best_filter;
3088     rate2 += vp9_get_switchable_rate(cpi);
3089   } else {
3090     mbmi->interp_filter = cm->interp_filter;
3091   }
3092
3093   if (cm->reference_mode == REFERENCE_MODE_SELECT)
3094     rate2 += vp9_cost_bit(comp_mode_p, comp_pred);
3095
3096   // Estimate the reference frame signaling cost and add it
3097   // to the rolling cost variable.
3098   rate2 += ref_costs_single[LAST_FRAME];
3099   this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
3100
       // The outputs are written before the threshold test, so they are set
       // even when INT64_MAX is returned just below.
3101   *returnrate = rate2;
3102   *returndistortion = distortion2;
3103
3104   if (this_rd >= best_rd_so_far)
3105     return INT64_MAX;
3106
3107   assert((cm->interp_filter == SWITCHABLE) ||
3108          (cm->interp_filter == mbmi->interp_filter));
3109
3110   update_rd_thresh_fact(cpi, bsize, THR_ZEROMV);
3111
3112   vp9_zero(best_pred_diff);
3113   vp9_zero(best_filter_diff);
3114   vp9_zero(best_tx_diff);
3115
       // NOTE(review): swap_block_ptr() is defined outside this view; it is
       // only invoked when transform size selection is off.
3116   if (!x->select_tx_size)
3117     swap_block_ptr(x, ctx, 1, 0, 0, MAX_MB_PLANE);
3118   store_coding_context(x, ctx, THR_ZEROMV,
3119                        best_pred_diff, best_tx_diff, best_filter_diff);
3120
3121   return this_rd;
3122 }
3123
3124 int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
3125                                       const TileInfo *const tile,
3126                                       int mi_row, int mi_col,
3127                                       int *returnrate,
3128                                       int64_t *returndistortion,
3129                                       BLOCK_SIZE bsize,
3130                                       PICK_MODE_CONTEXT *ctx,
3131                                       int64_t best_rd_so_far) {
       // RD mode search for a block coded with sub-8x8 partitions. Walks the
       // MAX_REFS entries of vp9_ref_order, evaluates each reference-frame
       // pair that survives the pruning checks (always with TX_4X4), and keeps
       // the candidate with the lowest RD cost.
       //
       // Outputs:
       //   *returnrate / *returndistortion - rate and distortion of the best
       //       candidate found (INT_MAX / INT64_MAX when none was usable).
       //   ctx - receives the winning mode through store_coding_context().
       // On success the winning MB_MODE_INFO and per-sub-block bmi[] entries
       // are copied back into xd->mi[0] and the winning RD cost is returned.
       // Returns INT64_MAX when no candidate ends up below best_rd_so_far.
3132   VP9_COMMON *const cm = &cpi->common;
3133   RD_OPT *const rd_opt = &cpi->rd;
3134   MACROBLOCKD *const xd = &x->e_mbd;
3135   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
3136   const struct segmentation *const seg = &cm->seg;
3137   MV_REFERENCE_FRAME ref_frame, second_ref_frame;
3138   unsigned char segment_id = mbmi->segment_id;
3139   int comp_pred, i;
3140   int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
3141   struct buf_2d yv12_mb[4][MAX_MB_PLANE];
       // Indexed by reference frame; entry 0 is unused padding.
3142   static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
3143                                     VP9_ALT_FLAG };
3144   int64_t best_rd = best_rd_so_far;
3145   int64_t best_yrd = best_rd_so_far;  // FIXME(rbultje) more precise
       // Transform-size diffs are not tracked in this function: an all-zero
       // table is what gets handed to store_coding_context().
3146   static const int64_t best_tx_diff[TX_MODES] = { 0 };
3147   int64_t best_pred_diff[REFERENCE_MODES];
3148   int64_t best_pred_rd[REFERENCE_MODES];
3149   int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
3150   int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
3151   MB_MODE_INFO best_mbmode;
3152   int ref_index, best_ref_index = 0;
3153   unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
3154   vp9_prob comp_mode_p;
3155   int64_t best_inter_rd = INT64_MAX;
3156   MV_REFERENCE_FRAME best_inter_ref_frame = LAST_FRAME;
3157   INTERP_FILTER tmp_best_filter = SWITCHABLE;
3158   int rate_uv_intra, rate_uv_tokenonly;
3159   int64_t dist_uv;
3160   int skip_uv;
3161   PREDICTION_MODE mode_uv = DC_PRED;
3162   int intra_cost_penalty = 20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q);
3163   int_mv seg_mvs[4][MAX_REF_FRAMES];
3164   b_mode_info best_bmodes[4];
3165   int best_skip2 = 0;
3166   int mode_skip_mask = 0;
3167
3168   x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
3169   vpx_memset(x->zcoeff_blk[TX_4X4], 0, 4);
3170   vp9_zero(best_mbmode);
3171
       // Mark every cached per-sub-block motion vector as not yet searched.
3172   for (i = 0; i < 4; i++) {
3173     int j;
3174     for (j = 0; j < MAX_REF_FRAMES; j++)
3175       seg_mvs[i][j].as_int = INVALID_MV;
3176   }
3177
3178   estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp,
3179                            &comp_mode_p);
3180
3181   for (i = 0; i < REFERENCE_MODES; ++i)
3182     best_pred_rd[i] = INT64_MAX;
3183   for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
3184     best_filter_rd[i] = INT64_MAX;
       // INT_MAX doubles as "chroma intra mode not chosen yet" (tested in the
       // intra branch of the main loop).
3185   rate_uv_intra = INT_MAX;
3186
3187   *returnrate = INT_MAX;
3188
       // Per-reference setup. setup_buffer_inter() is only called for
       // references enabled in cpi->ref_frame_flags; it is handed yv12_mb and
       // the NEARESTMV / NEARMV rows of frame_mv (presumably to fill them in -
       // confirm against its definition).
3189   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
3190     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
3191       setup_buffer_inter(cpi, x, tile,
3192                              ref_frame, bsize, mi_row, mi_col,
3193                              frame_mv[NEARESTMV], frame_mv[NEARMV],
3194                              yv12_mb);
3195     }
3196     frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
3197     frame_mv[ZEROMV][ref_frame].as_int = 0;
3198   }
3199
       // Main search loop: one iteration per vp9_ref_order entry.
3200   for (ref_index = 0; ref_index < MAX_REFS; ++ref_index) {
3201     int mode_excluded = 0;
3202     int64_t this_rd = INT64_MAX;
         // NOTE(review): disable_skip is never assigned after this
         // initialisation, so every "!disable_skip" test below is always true.
3203     int disable_skip = 0;
3204     int compmode_cost = 0;
3205     int rate2 = 0, rate_y = 0, rate_uv = 0;
3206     int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0;
3207     int skippable = 0;
         // NOTE(review): this declaration shadows the function-scope 'i'.
3208     int i;
3209     int this_skip2 = 0;
3210     int64_t total_sse = INT_MAX;
3211     int early_term = 0;
3212
3213     ref_frame = vp9_ref_order[ref_index].ref_frame[0];
3214     second_ref_frame = vp9_ref_order[ref_index].ref_frame[1];
3215
3216     // Look at the reference frame of the best mode so far and set the
3217     // skip mask to look at a subset of the remaining modes.
         // The mask is computed once, at ref_index == 3, and then reused for
         // all later entries; bit n set means "skip ref_index n".
3218     if (ref_index > 2 && cpi->sf.mode_skip_start < MAX_MODES) {
3219       if (ref_index == 3) {
3220         switch (vp9_ref_order[best_ref_index].ref_frame[0]) {
3221           case INTRA_FRAME:
3222             mode_skip_mask = 0;
3223             break;
3224           case LAST_FRAME:
3225             mode_skip_mask = 0x0010;
3226             break;
3227           case GOLDEN_FRAME:
3228             mode_skip_mask = 0x0008;
3229             break;
3230           case ALTREF_FRAME:
3231             mode_skip_mask = 0x0000;
3232             break;
3233           case NONE:
3234           case MAX_REF_FRAMES:
3235             assert(0 && "Invalid Reference frame");
3236         }
3237       }
3238       if (mode_skip_mask & (1 << ref_index))
3239         continue;
3240     }
3241
3242     // Test best rd so far against threshold for trying this mode.
3243     if (rd_less_than_thresh(best_rd,
3244                             rd_opt->threshes[segment_id][bsize][ref_index],
3245                             rd_opt->thresh_freq_fact[bsize][ref_index]))
3246       continue;
3247
         // Skip inter candidates whose reference is not enabled for this frame.
3248     if (ref_frame > INTRA_FRAME &&
3249         !(cpi->ref_frame_flags & flag_list[ref_frame])) {
3250       continue;
3251     }
3252
3253     comp_pred = second_ref_frame > INTRA_FRAME;
3254     if (comp_pred) {
3255       if (!(cpi->ref_frame_flags & flag_list[second_ref_frame]))
3256         continue;
3257       // Do not allow compound prediction if the segment level reference frame
3258       // feature is in use as in this case there can only be one reference.
3259       if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
3260         continue;
           // Speed feature: drop compound candidates when the best so far is
           // intra.
3261       if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) &&
3262           vp9_ref_order[best_ref_index].ref_frame[0] == INTRA_FRAME)
3263         continue;
           // Speed feature: drop compound candidates where neither reference
           // matches the best single-reference frame seen so far.
3264       if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_REFMISMATCH) &&
3265           ref_frame != best_inter_ref_frame &&
3266           second_ref_frame != best_inter_ref_frame)
3267         continue;
3268     }
3269
3270     // TODO(jingning, jkoleszar): scaling reference frame not supported for
3271     // sub8x8 blocks.
3272     if (ref_frame > INTRA_FRAME &&
3273         vp9_is_scaled(&cm->frame_refs[ref_frame - 1].sf))
3274       continue;
3275
3276     if (second_ref_frame > INTRA_FRAME &&
3277         vp9_is_scaled(&cm->frame_refs[second_ref_frame - 1].sf))
3278       continue;
3279
         // A candidate whose prediction type conflicts with the frame-level
         // reference mode is still evaluated, but flagged so that it cannot
         // be selected as the best mode. mode_excluded is still 0 on entry
         // here, so the "mode_excluded ?" guard always takes the second arm.
3280     if (comp_pred) {
3281       mode_excluded = mode_excluded ? mode_excluded
3282                                     : cm->reference_mode == SINGLE_REFERENCE;
3283     } else if (ref_frame != INTRA_FRAME) {
3284       mode_excluded = mode_excluded ? mode_excluded
3285                                     : cm->reference_mode == COMPOUND_REFERENCE;
3286     }
3287
3288     // If the segment reference frame feature is enabled....
3289     // then do nothing if the current ref frame is not allowed..
3290     if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
3291         vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) !=
3292             (int)ref_frame) {
3293       continue;
3294     // Disable this drop out case if the ref frame
3295     // segment level feature is enabled for this segment. This is to
3296     // prevent the possibility that we end up unable to pick any mode.
3297     } else if (!vp9_segfeature_active(seg, segment_id,
3298                                       SEG_LVL_REF_FRAME)) {
3299       // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
3300       // unless ARNR filtering is enabled in which case we want
3301       // an unfiltered alternative. We allow near/nearest as well
3302       // because they may result in zero-zero MVs but be cheaper.
           // NOTE(review): as written, the test below skips every candidate
           // (any ref frame, intra included) when the source is an alt-ref
           // frame and arnr_max_frames == 0; it does not look at the mode or
           // reference at all. The comment above appears to describe a
           // different search - confirm the intended behaviour.
3303       if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0))
3304         continue;
3305     }
3306
3307     mbmi->tx_size = TX_4X4;
3308     mbmi->uv_mode = DC_PRED;
3309     mbmi->ref_frame[0] = ref_frame;
3310     mbmi->ref_frame[1] = second_ref_frame;
3311     // Evaluate all sub-pel filters irrespective of whether we can use
3312     // them for this frame.
3313     mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP
3314                                                           : cm->interp_filter;
3315     x->skip = 0;
3316     set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
3317
3318     // Select prediction reference frames.
3319     for (i = 0; i < MAX_MB_PLANE; i++) {
3320       xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
3321       if (comp_pred)
3322         xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
3323     }
3324
3325     if (ref_frame == INTRA_FRAME) {
           // Intra candidate: luma sub-block modes, then the chroma mode.
3326       int rate;
3327       if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate, &rate_y,
3328                                        &distortion_y, best_rd) >= best_rd)
3329         continue;
3330       rate2 += rate;
3331       rate2 += intra_cost_penalty;
3332       distortion2 += distortion_y;
3333
           // The chroma intra mode is chosen once and cached for later intra
           // candidates (rate_uv_intra == INT_MAX means "not computed yet").
3334       if (rate_uv_intra == INT_MAX) {
3335         choose_intra_uv_mode(cpi, ctx, bsize, TX_4X4,
3336                              &rate_uv_intra,
3337                              &rate_uv_tokenonly,
3338                              &dist_uv, &skip_uv,
3339                              &mode_uv);
3340       }
3341       rate2 += rate_uv_intra;
3342       rate_uv = rate_uv_tokenonly;
3343       distortion2 += dist_uv;
3344       distortion_uv = dist_uv;
3345       mbmi->uv_mode = mode_uv;
3346     } else {
           // Inter candidate: pick the interpolation filter and per-sub-block
           // modes for luma, then cost chroma if there is RD budget left.
3347       int rate;
3348       int64_t distortion;
3349       int64_t this_rd_thresh;
3350       int64_t tmp_rd, tmp_best_rd = INT64_MAX, tmp_best_rdu = INT64_MAX;
3351       int tmp_best_rate = INT_MAX, tmp_best_ratey = INT_MAX;
           // tmp_best_sse has no initialiser; it is only read when pred_exists
           // is set, and pred_exists is set in the same block that assigns it.
3352       int64_t tmp_best_distortion = INT_MAX, tmp_best_sse, uv_sse;
3353       int tmp_best_skippable = 0;
3354       int switchable_filter_index;
3355       int_mv *second_ref = comp_pred ?
3356                              &mbmi->ref_mvs[second_ref_frame][0] : NULL;
           // Only entries 0..3 of tmp_best_bmodes are written or read below.
3357       b_mode_info tmp_best_bmodes[16];
3358       MB_MODE_INFO tmp_best_mbmode;
3359       BEST_SEG_INFO bsi[SWITCHABLE_FILTERS];
3360       int pred_exists = 0;
3361       int uv_skippable;
3362
           // Threshold by first reference: THR_LAST for LAST_FRAME, THR_GOLD
           // for GOLDEN_FRAME, THR_ALTR for anything else.
3363       this_rd_thresh = (ref_frame == LAST_FRAME) ?
3364           rd_opt->threshes[segment_id][bsize][THR_LAST] :
3365           rd_opt->threshes[segment_id][bsize][THR_ALTR];
3366       this_rd_thresh = (ref_frame == GOLDEN_FRAME) ?
3367       rd_opt->threshes[segment_id][bsize][THR_GOLD] : this_rd_thresh;
3368       rd_opt->mask_filter = 0;
3369       for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
3370         rd_opt->filter_cache[i] = INT64_MAX;
3371
           // Filter choice. The first three branches are shortcuts that pick a
           // filter without searching (pred_exists stays 0, so the mode search
           // runs once further down); the final branch searches each filter.
3372       if (cm->interp_filter != BILINEAR) {
3373         tmp_best_filter = EIGHTTAP;
3374         if (x->source_variance < cpi->sf.disable_filter_search_var_thresh) {
               // Redundant with the assignment just above; kept as is.
3375           tmp_best_filter = EIGHTTAP;
3376         } else if (cpi->sf.adaptive_pred_interp_filter == 1 &&
3377                    ctx->pred_interp_filter < SWITCHABLE) {
3378           tmp_best_filter = ctx->pred_interp_filter;
3379         } else if (cpi->sf.adaptive_pred_interp_filter == 2) {
               // Falls back to filter index 0 when the context holds no
               // concrete filter (0 is presumably EIGHTTAP - confirm).
3380           tmp_best_filter = ctx->pred_interp_filter < SWITCHABLE ?
3381                               ctx->pred_interp_filter : 0;
3382         } else {
3383           for (switchable_filter_index = 0;
3384                switchable_filter_index < SWITCHABLE_FILTERS;
3385                ++switchable_filter_index) {
3386             int newbest, rs;
3387             int64_t rs_rd;
3388             mbmi->interp_filter = switchable_filter_index;
3389             tmp_rd = rd_pick_best_sub8x8_mode(cpi, x, tile,
3390                                               &mbmi->ref_mvs[ref_frame][0],
3391                                               second_ref, best_yrd, &rate,
3392                                               &rate_y, &distortion,
3393                                               &skippable, &total_sse,
3394                                               (int) this_rd_thresh, seg_mvs,
3395                                               bsi, switchable_filter_index,
3396                                               mi_row, mi_col);
3397
3398             if (tmp_rd == INT64_MAX)
3399               continue;
                 // filter_cache[k] keeps the RD for filter k without its
                 // signalling cost; the extra slot at SWITCHABLE_FILTERS keeps
                 // the minimum over filters with that cost included.
3400             rs = vp9_get_switchable_rate(cpi);
3401             rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
3402             rd_opt->filter_cache[switchable_filter_index] = tmp_rd;
3403             rd_opt->filter_cache[SWITCHABLE_FILTERS] =
3404                 MIN(rd_opt->filter_cache[SWITCHABLE_FILTERS],
3405                     tmp_rd + rs_rd);
3406             if (cm->interp_filter == SWITCHABLE)
3407               tmp_rd += rs_rd;
3408
                 // mask_filter tracks the largest RD seen across the filters
                 // tried; it stands in for unevaluated filters further down.
3409             rd_opt->mask_filter = MAX(rd_opt->mask_filter, tmp_rd);
3410
3411             newbest = (tmp_rd < tmp_best_rd);
3412             if (newbest) {
3413               tmp_best_filter = mbmi->interp_filter;
3414               tmp_best_rd = tmp_rd;
3415             }
                 // Snapshot this result when it is the new best under
                 // SWITCHABLE, or when it is the frame's fixed filter.
3416             if ((newbest && cm->interp_filter == SWITCHABLE) ||
3417                 (mbmi->interp_filter == cm->interp_filter &&
3418                  cm->interp_filter != SWITCHABLE)) {
3419               tmp_best_rdu = tmp_rd;
3420               tmp_best_rate = rate;
3421               tmp_best_ratey = rate_y;
3422               tmp_best_distortion = distortion;
3423               tmp_best_sse = total_sse;
3424               tmp_best_skippable = skippable;
3425               tmp_best_mbmode = *mbmi;
3426               for (i = 0; i < 4; i++) {
3427                 tmp_best_bmodes[i] = xd->mi[0]->bmi[i];
3428                 x->zcoeff_blk[TX_4X4][i] = !x->plane[0].eobs[i];
3429               }
3430               pred_exists = 1;
3431               if (switchable_filter_index == 0 &&
3432                   cpi->sf.use_rd_breakout &&
3433                   best_rd < INT64_MAX) {
3434                 if (tmp_best_rdu / 2 > best_rd) {
3435                   // skip searching the other filters if the first is
3436                   // already substantially larger than the best so far
3437                   tmp_best_filter = mbmi->interp_filter;
3438                   tmp_best_rdu = INT64_MAX;
3439                   break;
3440                 }
3441               }
3442             }
3443           }  // switchable_filter_index loop
3444         }
3445       }
3446
           // pred_exists with tmp_best_rdu reset to INT64_MAX is the state left
           // by the rd breakout above: abandon this reference entry.
3447       if (tmp_best_rdu == INT64_MAX && pred_exists)
3448         continue;
3449
3450       mbmi->interp_filter = (cm->interp_filter == SWITCHABLE ?
3451                              tmp_best_filter : cm->interp_filter);
3452       if (!pred_exists) {
3453         // Handles the special case when a filter that is not in the
3454         // switchable list (bilinear, 6-tap) is indicated at the frame level
             // This path is also taken whenever one of the filter shortcuts
             // above was used, since none of them sets pred_exists.
3455         tmp_rd = rd_pick_best_sub8x8_mode(cpi, x, tile,
3456                                           &mbmi->ref_mvs[ref_frame][0],
3457                                           second_ref, best_yrd, &rate, &rate_y,
3458                                           &distortion, &skippable, &total_sse,
3459                                           (int) this_rd_thresh, seg_mvs, bsi, 0,
3460                                           mi_row, mi_col);
3461         if (tmp_rd == INT64_MAX)
3462           continue;
3463       } else {
             // Restore the snapshot taken for the chosen filter.
3464         total_sse = tmp_best_sse;
3465         rate = tmp_best_rate;
3466         rate_y = tmp_best_ratey;
3467         distortion = tmp_best_distortion;
3468         skippable = tmp_best_skippable;
3469         *mbmi = tmp_best_mbmode;
3470         for (i = 0; i < 4; i++)
3471           xd->mi[0]->bmi[i] = tmp_best_bmodes[i];
3472       }
3473
3474       rate2 += rate;
3475       distortion2 += distortion;
3476
3477       if (cm->interp_filter == SWITCHABLE)
3478         rate2 += vp9_get_switchable_rate(cpi);
3479
3480       if (!mode_excluded)
3481         mode_excluded = comp_pred ? cm->reference_mode == SINGLE_REFERENCE
3482                                   : cm->reference_mode == COMPOUND_REFERENCE;
3483
3484       compmode_cost = vp9_cost_bit(comp_mode_p, comp_pred);
3485
           // tmp_best_rdu is reused from here on with a different meaning: the
           // gap between best_rd and the cheaper of "code luma" / "skip".
           // It is passed as the last argument of super_block_uvrd()
           // (presumably as its RD limit - confirm).
3486       tmp_best_rdu = best_rd -
3487           MIN(RDCOST(x->rdmult, x->rddiv, rate2, distortion2),
3488               RDCOST(x->rdmult, x->rddiv, 0, total_sse));
3489
           // NOTE(review): when the gap is <= 0 the candidate is not dropped
           // here; it carries on with rate_uv / distortion_uv left at 0.
3490       if (tmp_best_rdu > 0) {
3491         // If even the 'Y' rd value of split is higher than best so far
3492         // then dont bother looking at UV
3493         vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col,
3494                                         BLOCK_8X8);
3495         super_block_uvrd(cpi, x, &rate_uv, &distortion_uv, &uv_skippable,
3496                          &uv_sse, BLOCK_8X8, tmp_best_rdu);
3497         if (rate_uv == INT_MAX)
3498           continue;
3499         rate2 += rate_uv;
3500         distortion2 += distortion_uv;
3501         skippable = skippable && uv_skippable;
3502         total_sse += uv_sse;
3503       }
3504     }
3505
3506     if (cm->reference_mode == REFERENCE_MODE_SELECT)
3507       rate2 += compmode_cost;
3508
3509     // Estimate the reference frame signaling cost and add it
3510     // to the rolling cost variable.
3511     if (second_ref_frame > INTRA_FRAME) {
3512       rate2 += ref_costs_comp[ref_frame];
3513     } else {
3514       rate2 += ref_costs_single[ref_frame];
3515     }
3516
3517     if (!disable_skip) {
3518       // Skip is never coded at the segment level for sub8x8 blocks and instead
3519       // always coded in the bitstream at the mode info level.
3520
           // For lossy inter candidates, compare coding the residual against
           // skipping it (rate 0, distortion = total_sse) and keep the cheaper.
3521       if (ref_frame != INTRA_FRAME && !xd->lossless) {
3522         if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
3523             RDCOST(x->rdmult, x->rddiv, 0, total_sse)) {
3524           // Add in the cost of the no skip flag.
3525           rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
3526         } else {
3527           // FIXME(rbultje) make this work for splitmv also
3528           rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
3529           distortion2 = total_sse;
3530           assert(total_sse >= 0);
3531           rate2 -= (rate_y + rate_uv);
3532           rate_y = 0;
3533           rate_uv = 0;
3534           this_skip2 = 1;
3535         }
3536       } else {
3537         // Add in the cost of the no skip flag.
3538         rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
3539       }
3540
3541       // Calculate the final RD estimate for this mode.
3542       this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
3543     }
3544
3545     // Keep record of best inter rd with single reference
3546     if (is_inter_block(mbmi) &&
3547         !has_second_ref(mbmi) &&
3548         !mode_excluded &&
3549         this_rd < best_inter_rd) {
3550       best_inter_rd = this_rd;
3551       best_inter_ref_frame = ref_frame;
3552     }
3553
         // An intra result competes in every reference-mode and filter slot.
3554     if (!disable_skip && ref_frame == INTRA_FRAME) {
3555       for (i = 0; i < REFERENCE_MODES; ++i)
3556         best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
3557       for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
3558         best_filter_rd[i] = MIN(best_filter_rd[i], this_rd);
3559     }
3560
3561     // Did this mode help.. i.e. is it the new best mode
3562     if (this_rd < best_rd || x->skip) {
3563       if (!mode_excluded) {
3564         int max_plane = MAX_MB_PLANE;
3565         // Note index of best mode so far
3566         best_ref_index = ref_index;
3567
3568         if (ref_frame == INTRA_FRAME) {
3569           /* required for left and above block mv */
3570           mbmi->mv[0].as_int = 0;
3571           max_plane = 1;
3572         }
3573
3574         *returnrate = rate2;
3575         *returndistortion = distortion2;
3576         best_rd = this_rd;
             // Luma-only share of the best RD: total minus the chroma cost.
3577         best_yrd = best_rd -
3578                    RDCOST(x->rdmult, x->rddiv, rate_uv, distortion_uv);
3579         best_mbmode = *mbmi;
3580         best_skip2 = this_skip2;
3581         if (!x->select_tx_size)
3582           swap_block_ptr(x, ctx, 1, 0, 0, max_plane);
3583         vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[TX_4X4],
3584                    sizeof(uint8_t) * ctx->num_4x4_blk);
3585
3586         for (i = 0; i < 4; i++)
3587           best_bmodes[i] = xd->mi[0]->bmi[i];
3588
3589         // TODO(debargha): enhance this test with a better distortion prediction
3590         // based on qp, activity mask and history
             // Early termination: past MIN_EARLY_TERM_INDEX, stop the search
             // once an inter winner's scaled distortion drops below qstep^2.
             // The scale is 4, or 3 when the source variance is below 16.
3591         if ((cpi->sf.mode_search_skip_flags & FLAG_EARLY_TERMINATE) &&
3592             (ref_index > MIN_EARLY_TERM_INDEX)) {
3593           const int qstep = xd->plane[0].dequant[1];
3594           // TODO(debargha): Enhance this by specializing for each mode_index
3595           int scale = 4;
3596           if (x->source_variance < UINT_MAX) {
3597             const int var_adjust = (x->source_variance < 16);
3598             scale -= var_adjust;
3599           }
3600           if (ref_frame > INTRA_FRAME &&
3601               distortion2 * scale < qstep * qstep) {
3602             early_term = 1;
3603           }
3604         }
3605       }
3606     }
3607
3608     /* keep record of best compound/single-only prediction */
3609     if (!disable_skip && ref_frame != INTRA_FRAME) {
3610       int64_t single_rd, hybrid_rd, single_rate, hybrid_rate;
3611
           // rate2 already includes compmode_cost only under
           // REFERENCE_MODE_SELECT, so derive both variants from it.
3612       if (cm->reference_mode == REFERENCE_MODE_SELECT) {
3613         single_rate = rate2 - compmode_cost;
3614         hybrid_rate = rate2;
3615       } else {
3616         single_rate = rate2;
3617         hybrid_rate = rate2 + compmode_cost;
3618       }
3619
3620       single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
3621       hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
3622
3623       if (!comp_pred && single_rd < best_pred_rd[SINGLE_REFERENCE]) {
3624         best_pred_rd[SINGLE_REFERENCE] = single_rd;
3625       } else if (comp_pred && single_rd < best_pred_rd[COMPOUND_REFERENCE]) {
3626         best_pred_rd[COMPOUND_REFERENCE] = single_rd;
3627       }
3628       if (hybrid_rd < best_pred_rd[REFERENCE_MODE_SELECT])
3629         best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
3630     }
3631
3632     /* keep record of best filter type */
3633     if (!mode_excluded && !disable_skip && ref_frame != INTRA_FRAME &&
3634         cm->interp_filter != BILINEAR) {
           // 'ref' is the cached RD for the filter setting actually in use;
           // every other slot is expressed as this_rd plus its offset from it.
3635       int64_t ref = rd_opt->filter_cache[cm->interp_filter == SWITCHABLE ?
3636                               SWITCHABLE_FILTERS : cm->interp_filter];
3637       int64_t adj_rd;
3638       for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
3639         if (ref == INT64_MAX)
3640           adj_rd = 0;
3641         else if (rd_opt->filter_cache[i] == INT64_MAX)
3642           // when early termination is triggered, the encoder does not have
3643           // access to the rate-distortion cost. it only knows that the cost
3644           // should be above the maximum valid value. hence it takes the known
3645           // maximum plus an arbitrary constant as the rate-distortion cost.
3646           adj_rd = rd_opt->mask_filter - ref + 10;
3647         else
3648           adj_rd = rd_opt->filter_cache[i] - ref;
3649
3650         adj_rd += this_rd;
3651         best_filter_rd[i] = MIN(best_filter_rd[i], adj_rd);
3652       }
3653     }
3654
3655     if (early_term)
3656       break;
3657
         // Stop searching once a single-reference candidate leaves x->skip set.
3658     if (x->skip && !comp_pred)
3659       break;
3660   }
3661
       // Nothing improved on the caller's bound.
3662   if (best_rd >= best_rd_so_far)
3663     return INT64_MAX;
3664
3665   // If we used an estimate for the uv intra rd in the loop above...
3666   if (cpi->sf.use_uv_intra_rd_estimate) {
3667     // Do Intra UV best rd mode selection if best mode choice above was intra.
3668     if (vp9_ref_order[best_ref_index].ref_frame[0] == INTRA_FRAME) {
3669       *mbmi = best_mbmode;
3670       rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv_intra,
3671                               &rate_uv_tokenonly,
3672                               &dist_uv,
3673                               &skip_uv,
3674                               BLOCK_8X8, TX_4X4);
3675     }
3676   }
3677
3678   if (best_rd == INT64_MAX) {
3679     *returnrate = INT_MAX;
3680     *returndistortion = INT64_MAX;
3681     return best_rd;
3682   }
3683
3684   assert((cm->interp_filter == SWITCHABLE) ||
3685          (cm->interp_filter == best_mbmode.interp_filter) ||
3686          !is_inter_block(&best_mbmode));
3687
3688   update_rd_thresh_fact(cpi, bsize, best_ref_index);
3689
3690   // macroblock modes
3691   *mbmi = best_mbmode;
3692   x->skip |= best_skip2;
       // Restore the winning per-sub-block info: only the mode for intra, the
       // whole b_mode_info for inter.
3693   if (!is_inter_block(&best_mbmode)) {
3694     for (i = 0; i < 4; i++)
3695       xd->mi[0]->bmi[i].as_mode = best_bmodes[i].as_mode;
3696   } else {
3697     for (i = 0; i < 4; ++i)
3698       vpx_memcpy(&xd->mi[0]->bmi[i], &best_bmodes[i], sizeof(b_mode_info));
3699
         // The block-level motion vectors mirror the last sub-block (index 3).
3700     mbmi->mv[0].as_int = xd->mi[0]->bmi[3].as_mv[0].as_int;
3701     mbmi->mv[1].as_int = xd->mi[0]->bmi[3].as_mv[1].as_int;
3702   }
3703
       // Differences against the best RD; INT_MIN marks a reference mode for
       // which no candidate was recorded.
3704   for (i = 0; i < REFERENCE_MODES; ++i) {
3705     if (best_pred_rd[i] == INT64_MAX)
3706       best_pred_diff[i] = INT_MIN;
3707     else
3708       best_pred_diff[i] = best_rd - best_pred_rd[i];
3709   }
3710
       // Filter differences are only reported for non-skipped blocks.
3711   if (!x->skip) {
3712     for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
3713       if (best_filter_rd[i] == INT64_MAX)
3714         best_filter_diff[i] = 0;
3715       else
3716         best_filter_diff[i] = best_rd - best_filter_rd[i];
3717     }
3718     if (cm->interp_filter == SWITCHABLE)
3719       assert(best_filter_diff[SWITCHABLE_FILTERS] == 0);
3720   } else {
3721     vp9_zero(best_filter_diff);
3722   }
3723
3724   set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
3725   store_coding_context(x, ctx, best_ref_index,
3726                        best_pred_diff, best_tx_diff, best_filter_diff);
3727
3728   return best_rd;
3729 }