src/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rdopt.c

   1 /*
   2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
   3  *
   4  *  Use of this source code is governed by a BSD-style license
   5  *  that can be found in the LICENSE file in the root of the source
   6  *  tree. An additional intellectual property rights grant can be found
   7  *  in the file PATENTS.  All contributing project authors may
   8  *  be found in the AUTHORS file in the root of the source tree.
   9  */
  10
  11 #include <assert.h>
  12 #include <limits.h>
  13 #include <math.h>
  14 #include <stdio.h>
  15
  16 #include "./vp9_rtcd.h"
  17
  18 #include "vpx_mem/vpx_mem.h"
  19
  20 #include "vp9/common/vp9_common.h"
  21 #include "vp9/common/vp9_entropy.h"
  22 #include "vp9/common/vp9_entropymode.h"
  23 #include "vp9/common/vp9_idct.h"
  24 #include "vp9/common/vp9_mvref_common.h"
  25 #include "vp9/common/vp9_pragmas.h"
  26 #include "vp9/common/vp9_pred_common.h"
  27 #include "vp9/common/vp9_quant_common.h"
  28 #include "vp9/common/vp9_reconinter.h"
  29 #include "vp9/common/vp9_reconintra.h"
  30 #include "vp9/common/vp9_seg_common.h"
  31 #include "vp9/common/vp9_systemdependent.h"
  32
  33 #include "vp9/encoder/vp9_cost.h"
  34 #include "vp9/encoder/vp9_encodemb.h"
  35 #include "vp9/encoder/vp9_encodemv.h"
  36 #include "vp9/encoder/vp9_encoder.h"
  37 #include "vp9/encoder/vp9_mcomp.h"
  38 #include "vp9/encoder/vp9_quantize.h"
  39 #include "vp9/encoder/vp9_ratectrl.h"
  40 #include "vp9/encoder/vp9_rdopt.h"
  41 #include "vp9/encoder/vp9_tokenize.h"
  42 #include "vp9/encoder/vp9_variance.h"
  43
  44 #define RD_THRESH_MAX_FACT 64
  45 #define RD_THRESH_INC      1
  46 #define RD_THRESH_POW      1.25  /* Exponent used by compute_rd_thresh_factor(). */
  47 #define RD_MULT_EPB_RATIO  64  /* errorperbit = RDMULT / this (see vp9_initialize_rd_consts()). */
  48
  49 /* Factor to weigh the rate for switchable interp filters */
  50 #define SWITCHABLE_INTERP_RATE_FACTOR 1
  51
  52 #define LAST_FRAME_MODE_MASK    0xFFEDCD60  /* NOTE(review): presumably one bit per vp9_mode_order[] entry - confirm at use site. */
  53 #define GOLDEN_FRAME_MODE_MASK  0xFFDA3BB0
  54 #define ALT_REF_MODE_MASK       0xFFC648D0
  55
  56 #define MIN_EARLY_TERM_INDEX    3
  57
  58 typedef struct {  // Element type of vp9_mode_order[]: one mode / reference-pair candidate.
  59   PREDICTION_MODE mode;  // Prediction mode of this candidate.
  60   MV_REFERENCE_FRAME ref_frame[2];  // Reference pair; slot 1 is NONE for the single-reference entries.
  61 } MODE_DEFINITION;
  62
  63 typedef struct {  // Element type of vp9_ref_order[]: one reference-pair candidate.
  64   MV_REFERENCE_FRAME ref_frame[2];  // Reference pair; slot 1 is NONE for the single-reference entries.
  65 } REF_DEFINITION;
  66
  67 struct rdcost_block_args {  // Working state threaded through block_rd_txfm() for one plane.
  68   MACROBLOCK *x;  // Macroblock being costed.
  69   ENTROPY_CONTEXT t_above[16];  // Local above contexts, filled by vp9_get_entropy_contexts().
  70   ENTROPY_CONTEXT t_left[16];  // Local left contexts, filled by vp9_get_entropy_contexts().
  71   int rate;  // Rate of the current transform block (written by rate_block()).
  72   int64_t dist;  // Distortion of the current transform block (written by dist_block()).
  73   int64_t sse;  // SSE of the current transform block (written by dist_block()).
  74   int this_rate;  // Running sum of rate over the blocks visited so far.
  75   int64_t this_dist;  // Running sum of dist.
  76   int64_t this_sse;  // Running sum of sse.
  77   int64_t this_rd;  // Running sum of the per-block RD cost.
  78   int64_t best_rd;  // Budget: once this_rd exceeds it, skip is raised.
  79   int skip;  // Non-zero after the budget was exceeded; later blocks return at once.
  80   int use_fast_coef_costing;  // Forwarded to cost_coeffs().
  81   const scan_order *so;  // Scan order obtained from get_scan() in txfm_rd_in_plane().
  82 };
  83
  84 static const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {  // Positional table: an entry's place fixes its mode index.
  85   {NEARESTMV, {LAST_FRAME,   NONE}},
  86   {NEARESTMV, {ALTREF_FRAME, NONE}},
  87   {NEARESTMV, {GOLDEN_FRAME, NONE}},
  88
  89   {DC_PRED,   {INTRA_FRAME,  NONE}},
  90
  91   {NEWMV,     {LAST_FRAME,   NONE}},
  92   {NEWMV,     {ALTREF_FRAME, NONE}},
  93   {NEWMV,     {GOLDEN_FRAME, NONE}},
  94
  95   {NEARMV,    {LAST_FRAME,   NONE}},
  96   {NEARMV,    {ALTREF_FRAME, NONE}},
  97   {NEARESTMV, {LAST_FRAME,   ALTREF_FRAME}},  // First two-reference entry.
  98   {NEARESTMV, {GOLDEN_FRAME, ALTREF_FRAME}},
  99
 100   {TM_PRED,   {INTRA_FRAME,  NONE}},
 101
 102   {NEARMV,    {LAST_FRAME,   ALTREF_FRAME}},
 103   {NEWMV,     {LAST_FRAME,   ALTREF_FRAME}},
 104   {NEARMV,    {GOLDEN_FRAME, NONE}},
 105   {NEARMV,    {GOLDEN_FRAME, ALTREF_FRAME}},
 106   {NEWMV,     {GOLDEN_FRAME, ALTREF_FRAME}},
 107
 108   {ZEROMV,    {LAST_FRAME,   NONE}},
 109   {ZEROMV,    {GOLDEN_FRAME, NONE}},
 110   {ZEROMV,    {ALTREF_FRAME, NONE}},
 111   {ZEROMV,    {LAST_FRAME,   ALTREF_FRAME}},
 112   {ZEROMV,    {GOLDEN_FRAME, ALTREF_FRAME}},
 113
 114   {H_PRED,    {INTRA_FRAME,  NONE}},  // Remaining entries are all intra modes.
 115   {V_PRED,    {INTRA_FRAME,  NONE}},
 116   {D135_PRED, {INTRA_FRAME,  NONE}},
 117   {D207_PRED, {INTRA_FRAME,  NONE}},
 118   {D153_PRED, {INTRA_FRAME,  NONE}},
 119   {D63_PRED,  {INTRA_FRAME,  NONE}},
 120   {D117_PRED, {INTRA_FRAME,  NONE}},
 121   {D45_PRED,  {INTRA_FRAME,  NONE}},
 122 };
 123
 124 static const REF_DEFINITION vp9_ref_order[MAX_REFS] = {  // Positional table: an entry's place fixes its reference index.
 125   {{LAST_FRAME,   NONE}},
 126   {{GOLDEN_FRAME, NONE}},
 127   {{ALTREF_FRAME, NONE}},
 128   {{LAST_FRAME,   ALTREF_FRAME}},  // Two-reference entries.
 129   {{GOLDEN_FRAME, ALTREF_FRAME}},
 130   {{INTRA_FRAME,  NONE}},
 131 };
 132
 133 // The baseline rd thresholds for breaking out of the rd loop for
 134 // certain modes are assumed to be based on 8x8 blocks.
 135 // This table is used to correct for block size.
 136 // The factors are stored multiplied by 4 (2 = x0.5, 32 = x8, etc.).
 137 static const uint8_t rd_thresh_block_size_factor[BLOCK_SIZES] = {  // set_block_thresholds() divides the product by 4 again.
 138   2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32
 139 };
 140
 141 static int raster_block_offset(BLOCK_SIZE plane_bsize,
 142                                int raster_block, int stride) {
 143   const int bw = b_width_log2(plane_bsize);
 144   const int y = 4 * (raster_block >> bw);
 145   const int x = 4 * (raster_block & ((1 << bw) - 1));
 146   return y * stride + x;
 147 }
 148 static int16_t* raster_block_offset_int16(BLOCK_SIZE plane_bsize,
 149                                           int raster_block, int16_t *base) {
 150   const int stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
 151   return base + raster_block_offset(plane_bsize, raster_block, stride);
 152 }
 153
 154 static void fill_mode_costs(VP9_COMP *cpi) {
 155   const FRAME_CONTEXT *const fc = &cpi->common.fc;
 156   int i, j;
 157
 158   for (i = 0; i < INTRA_MODES; i++)
 159     for (j = 0; j < INTRA_MODES; j++)
 160       vp9_cost_tokens(cpi->y_mode_costs[i][j], vp9_kf_y_mode_prob[i][j],
 161                       vp9_intra_mode_tree);
 162
 163   // TODO(rbultje) separate tables for superblock costing?
 164   vp9_cost_tokens(cpi->mbmode_cost, fc->y_mode_prob[1], vp9_intra_mode_tree);
 165   vp9_cost_tokens(cpi->intra_uv_mode_cost[KEY_FRAME],
 166                   vp9_kf_uv_mode_prob[TM_PRED], vp9_intra_mode_tree);
 167   vp9_cost_tokens(cpi->intra_uv_mode_cost[INTER_FRAME],
 168                   fc->uv_mode_prob[TM_PRED], vp9_intra_mode_tree);
 169
 170   for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
 171     vp9_cost_tokens(cpi->switchable_interp_costs[i],
 172                     fc->switchable_interp_prob[i], vp9_switchable_interp_tree);
 173 }
 174
 175 static void fill_token_costs(vp9_coeff_cost *c,
 176                              vp9_coeff_probs_model (*p)[PLANE_TYPES]) {
 177   int i, j, k, l;
 178   TX_SIZE t;
 179   for (t = TX_4X4; t <= TX_32X32; ++t)
 180     for (i = 0; i < PLANE_TYPES; ++i)
 181       for (j = 0; j < REF_TYPES; ++j)
 182         for (k = 0; k < COEF_BANDS; ++k)
 183           for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
 184             vp9_prob probs[ENTROPY_NODES];
 185             vp9_model_to_full_probs(p[t][i][j][k][l], probs);
 186             vp9_cost_tokens((int *)c[t][i][j][k][0][l], probs,
 187                             vp9_coef_tree);
 188             vp9_cost_tokens_skip((int *)c[t][i][j][k][1][l], probs,
 189                                  vp9_coef_tree);
 190             assert(c[t][i][j][k][0][l][EOB_TOKEN] ==
 191                    c[t][i][j][k][1][l][EOB_TOKEN]);
 192           }
 193 }
 194
 195 static const uint8_t rd_iifactor[32] = {  // Indexed by twopass.next_iiratio (capped at 31) in vp9_compute_rd_mult().
 196   4, 4, 3, 2, 1, 0, 0, 0,  // Entry e adds (rdmult * e) >> 4 to rdmult.
 197   0, 0, 0, 0, 0, 0, 0, 0,
 198   0, 0, 0, 0, 0, 0, 0, 0,
 199   0, 0, 0, 0, 0, 0, 0, 0,
 200 };
 201
 202 // 3* dc_qlookup[Q]*dc_qlookup[Q];
 203
 204 /* values are now correlated to quantizer */
 205 static int sad_per_bit16lut[QINDEX_RANGE];  // Filled by vp9_init_me_luts(); read by vp9_initialize_me_consts().
 206 static int sad_per_bit4lut[QINDEX_RANGE];  // Filled by vp9_init_me_luts(); read by vp9_initialize_me_consts().
 207
 208 void vp9_init_me_luts() {
 209   int i;
 210
 211   // Initialize the sad lut tables using a formulaic calculation for now
 212   // This is to make it easier to resolve the impact of experimental changes
 213   // to the quantizer tables.
 214   for (i = 0; i < QINDEX_RANGE; i++) {
 215     const double q = vp9_convert_qindex_to_q(i);
 216     sad_per_bit16lut[i] = (int)(0.0418 * q + 2.4107);
 217     sad_per_bit4lut[i] = (int)(0.063 * q + 2.742);
 218   }
 219 }
 220
 221 int vp9_compute_rd_mult(const VP9_COMP *cpi, int qindex) {
 222   const int q = vp9_dc_quant(qindex, 0);
 223   // TODO(debargha): Adjust the function below
 224   int rdmult = 88 * q * q / 25;
 225   if (cpi->pass == 2 && (cpi->common.frame_type != KEY_FRAME)) {
 226     if (cpi->twopass.next_iiratio > 31)
 227       rdmult += (rdmult * rd_iifactor[31]) >> 4;
 228     else
 229       rdmult += (rdmult * rd_iifactor[cpi->twopass.next_iiratio]) >> 4;
 230   }
 231   return rdmult;
 232 }
 233
 234 static int compute_rd_thresh_factor(int qindex) {
 235   // TODO(debargha): Adjust the function below
 236   const int q = (int)(pow(vp9_dc_quant(qindex, 0) / 4.0, RD_THRESH_POW) * 5.12);
 237   return MAX(q, 8);
 238 }
 239
 240 void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex) {
 241   cpi->mb.sadperbit16 = sad_per_bit16lut[qindex];
 242   cpi->mb.sadperbit4 = sad_per_bit4lut[qindex];
 243 }
 244
 245 static void set_block_thresholds(const VP9_COMMON *cm, RD_OPT *rd) {
 246   int i, bsize, segment_id;
 247
 248   for (segment_id = 0; segment_id < MAX_SEGMENTS; ++segment_id) {
 249     const int qindex = clamp(vp9_get_qindex(&cm->seg, segment_id,
 250                                             cm->base_qindex) + cm->y_dc_delta_q,
 251                              0, MAXQ);
 252     const int q = compute_rd_thresh_factor(qindex);
 253
 254     for (bsize = 0; bsize < BLOCK_SIZES; ++bsize) {
 255       // Threshold here seems unnecessarily harsh but fine given actual
 256       // range of values used for cpi->sf.thresh_mult[].
 257       const int t = q * rd_thresh_block_size_factor[bsize];
 258       const int thresh_max = INT_MAX / t;
 259
 260       if (bsize >= BLOCK_8X8) {
 261         for (i = 0; i < MAX_MODES; ++i)
 262           rd->threshes[segment_id][bsize][i] =
 263               rd->thresh_mult[i] < thresh_max
 264                   ? rd->thresh_mult[i] * t / 4
 265                   : INT_MAX;
 266       } else {
 267         for (i = 0; i < MAX_REFS; ++i)
 268           rd->threshes[segment_id][bsize][i] =
 269               rd->thresh_mult_sub8x8[i] < thresh_max
 270                   ? rd->thresh_mult_sub8x8[i] * t / 4
 271                   : INT_MAX;
 272       }
 273     }
 274   }
 275 }
 276
 277 void vp9_initialize_rd_consts(VP9_COMP *cpi) {
 278   VP9_COMMON *const cm = &cpi->common;
 279   MACROBLOCK *const x = &cpi->mb;
 280   RD_OPT *const rd = &cpi->rd;
 281   int i;
 282
 283   vp9_clear_system_state();
 284
 285   rd->RDDIV = RDDIV_BITS;  // in bits (to multiply D by 128)
 286   rd->RDMULT = vp9_compute_rd_mult(cpi, cm->base_qindex + cm->y_dc_delta_q);
 287
 288   x->errorperbit = rd->RDMULT / RD_MULT_EPB_RATIO;
 289   x->errorperbit += (x->errorperbit == 0);
 290
 291   x->select_txfm_size = (cpi->sf.tx_size_search_method == USE_LARGESTALL &&
 292                          cm->frame_type != KEY_FRAME) ? 0 : 1;
 293
 294   set_block_thresholds(cm, rd);
 295
 296   if (!cpi->sf.use_nonrd_pick_mode || cm->frame_type == KEY_FRAME) {
 297     fill_token_costs(x->token_costs, cm->fc.coef_probs);
 298
 299     for (i = 0; i < PARTITION_CONTEXTS; i++)
 300       vp9_cost_tokens(x->partition_cost[i], get_partition_probs(cm, i),
 301                       vp9_partition_tree);
 302   }
 303
 304   if (!cpi->sf.use_nonrd_pick_mode || (cm->current_video_frame & 0x07) == 1 ||
 305       cm->frame_type == KEY_FRAME) {
 306     fill_mode_costs(cpi);
 307
 308     if (!frame_is_intra_only(cm)) {
 309       vp9_build_nmv_cost_table(x->nmvjointcost,
 310                                cm->allow_high_precision_mv ? x->nmvcost_hp
 311                                                            : x->nmvcost,
 312                                &cm->fc.nmvc, cm->allow_high_precision_mv);
 313
 314       for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
 315         vp9_cost_tokens((int *)cpi->inter_mode_cost[i],
 316                         cm->fc.inter_mode_probs[i], vp9_inter_mode_tree);
 317     }
 318   }
 319 }
 320
 321 static const int MAX_XSQ_Q10 = 245727;  // Clamp applied before model_rd_norm(); one below the last xsq_iq_q10[] entry (245728).
 322
 323 static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) {
 324   // NOTE: The tables below must be of the same size
 325
 326   // The functions described below are sampled at the four most significant
 327   // bits of x^2 + 8 / 256
 328
 329   // Normalized rate
 330   // This table models the rate for a Laplacian source
 331   // source with given variance when quantized with a uniform quantizer
 332   // with given stepsize. The closed form expression is:
 333   // Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)],
 334   // where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance),
 335   // and H(x) is the binary entropy function.
 336   static const int rate_tab_q10[] = {
 337     65536,  6086,  5574,  5275,  5063,  4899,  4764,  4651,
 338      4553,  4389,  4255,  4142,  4044,  3958,  3881,  3811,
 339      3748,  3635,  3538,  3453,  3376,  3307,  3244,  3186,
 340      3133,  3037,  2952,  2877,  2809,  2747,  2690,  2638,
 341      2589,  2501,  2423,  2353,  2290,  2232,  2179,  2130,
 342      2084,  2001,  1928,  1862,  1802,  1748,  1698,  1651,
 343      1608,  1530,  1460,  1398,  1342,  1290,  1243,  1199,
 344      1159,  1086,  1021,   963,   911,   864,   821,   781,
 345       745,   680,   623,   574,   530,   490,   455,   424,
 346       395,   345,   304,   269,   239,   213,   190,   171,
 347       154,   126,   104,    87,    73,    61,    52,    44,
 348        38,    28,    21,    16,    12,    10,     8,     6,
 349         5,     3,     2,     1,     1,     1,     0,     0,
 350   };
 351   // Normalized distortion
 352   // This table models the normalized distortion for a Laplacian source
 353   // source with given variance when quantized with a uniform quantizer
 354   // with given stepsize. The closed form expression is:
 355   // Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2))
 356   // where x = qpstep / sqrt(variance)
 357   // Note the actual distortion is Dn * variance.
 358   static const int dist_tab_q10[] = {
 359        0,     0,     1,     1,     1,     2,     2,     2,
 360        3,     3,     4,     5,     5,     6,     7,     7,
 361        8,     9,    11,    12,    13,    15,    16,    17,
 362       18,    21,    24,    26,    29,    31,    34,    36,
 363       39,    44,    49,    54,    59,    64,    69,    73,
 364       78,    88,    97,   106,   115,   124,   133,   142,
 365      151,   167,   184,   200,   215,   231,   245,   260,
 366      274,   301,   327,   351,   375,   397,   418,   439,
 367      458,   495,   528,   559,   587,   613,   637,   659,
 368      680,   717,   749,   777,   801,   823,   842,   859,
 369      874,   899,   919,   936,   949,   960,   969,   977,
 370      983,   994,  1001,  1006,  1010,  1013,  1015,  1017,
 371     1018,  1020,  1022,  1022,  1023,  1023,  1023,  1024,
 372   };
 373   static const int xsq_iq_q10[] = {
 374          0,      4,      8,     12,     16,     20,     24,     28,
 375         32,     40,     48,     56,     64,     72,     80,     88,
 376         96,    112,    128,    144,    160,    176,    192,    208,
 377        224,    256,    288,    320,    352,    384,    416,    448,
 378        480,    544,    608,    672,    736,    800,    864,    928,
 379        992,   1120,   1248,   1376,   1504,   1632,   1760,   1888,
 380       2016,   2272,   2528,   2784,   3040,   3296,   3552,   3808,
 381       4064,   4576,   5088,   5600,   6112,   6624,   7136,   7648,
 382       8160,   9184,  10208,  11232,  12256,  13280,  14304,  15328,
 383      16352,  18400,  20448,  22496,  24544,  26592,  28640,  30688,
 384      32736,  36832,  40928,  45024,  49120,  53216,  57312,  61408,
 385      65504,  73696,  81888,  90080,  98272, 106464, 114656, 122848,
 386     131040, 147424, 163808, 180192, 196576, 212960, 229344, 245728,
 387   };
 388   /*
 389   static const int tab_size = sizeof(rate_tab_q10) / sizeof(rate_tab_q10[0]);
 390   assert(sizeof(dist_tab_q10) / sizeof(dist_tab_q10[0]) == tab_size);
 391   assert(sizeof(xsq_iq_q10) / sizeof(xsq_iq_q10[0]) == tab_size);
 392   assert(MAX_XSQ_Q10 + 1 == xsq_iq_q10[tab_size - 1]);
 393   */
 394   int tmp = (xsq_q10 >> 2) + 8;
 395   int k = get_msb(tmp) - 3;
 396   int xq = (k << 3) + ((tmp >> k) & 0x7);
 397   const int one_q10 = 1 << 10;
 398   const int a_q10 = ((xsq_q10 - xsq_iq_q10[xq]) << 10) >> (2 + k);
 399   const int b_q10 = one_q10 - a_q10;
 400   *r_q10 = (rate_tab_q10[xq] * b_q10 + rate_tab_q10[xq + 1] * a_q10) >> 10;
 401   *d_q10 = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10;
 402 }
 403
 404 void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n,
 405                                   unsigned int qstep, int *rate,
 406                                   int64_t *dist) {
 407   // This function models the rate and distortion for a Laplacian
 408   // source with given variance when quantized with a uniform quantizer
 409   // with given stepsize. The closed form expressions are in:
 410   // Hang and Chen, "Source Model for transform video coder and its
 411   // application - Part I: Fundamental Theory", IEEE Trans. Circ.
 412   // Sys. for Video Tech., April 1997.
 413   if (var == 0) {
 414     *rate = 0;
 415     *dist = 0;
 416   } else {
 417     int d_q10, r_q10;
 418     const uint64_t xsq_q10_64 =
 419         ((((uint64_t)qstep * qstep * n) << 10) + (var >> 1)) / var;
 420     const int xsq_q10 = xsq_q10_64 > MAX_XSQ_Q10 ?
 421                         MAX_XSQ_Q10 : (int)xsq_q10_64;
 422     model_rd_norm(xsq_q10, &r_q10, &d_q10);
 423     *rate = (n * r_q10 + 2) >> 2;
 424     *dist = (var * (int64_t)d_q10 + 512) >> 10;
 425   }
 426 }
 427
 428 static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize,
 429                             MACROBLOCK *x, MACROBLOCKD *xd,
 430                             int *out_rate_sum, int64_t *out_dist_sum) {
 431   // Note our transform coeffs are 8 times an orthogonal transform.
 432   // Hence quantizer step is also 8 times. To get effective quantizer
 433   // we need to divide by 8 before sending to modeling function.
 434   int i;
 435   int64_t rate_sum = 0;
 436   int64_t dist_sum = 0;
 437   const int ref = xd->mi[0]->mbmi.ref_frame[0];
 438   unsigned int sse;
 439
 440   for (i = 0; i < MAX_MB_PLANE; ++i) {
 441     struct macroblock_plane *const p = &x->plane[i];
 442     struct macroblockd_plane *const pd = &xd->plane[i];
 443     const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
 444
 445     (void) cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride,
 446                               pd->dst.buf, pd->dst.stride, &sse);
 447
 448     if (i == 0)
 449       x->pred_sse[ref] = sse;
 450
 451     // Fast approximate the modelling function.
 452     if (cpi->oxcf.speed > 4) {
 453       int64_t rate;
 454       int64_t dist;
 455       int64_t square_error = sse;
 456       int quantizer = (pd->dequant[1] >> 3);
 457
 458       if (quantizer < 120)
 459         rate = (square_error * (280 - quantizer)) >> 8;
 460       else
 461         rate = 0;
 462       dist = (square_error * quantizer) >> 8;
 463       rate_sum += rate;
 464       dist_sum += dist;
 465     } else {
 466       int rate;
 467       int64_t dist;
 468       vp9_model_rd_from_var_lapndz(sse, 1 << num_pels_log2_lookup[bs],
 469                                    pd->dequant[1] >> 3, &rate, &dist);
 470       rate_sum += rate;
 471       dist_sum += dist;
 472     }
 473   }
 474
 475   *out_rate_sum = (int)rate_sum;
 476   *out_dist_sum = dist_sum << 4;
 477 }
 478
 479 static void model_rd_for_sb_y_tx(VP9_COMP *cpi, BLOCK_SIZE bsize,
 480                                  TX_SIZE tx_size,
 481                                  MACROBLOCK *x, MACROBLOCKD *xd,
 482                                  int *out_rate_sum, int64_t *out_dist_sum,
 483                                  int *out_skip) {
 484   int j, k;
 485   BLOCK_SIZE bs;
 486   const struct macroblock_plane *const p = &x->plane[0];
 487   const struct macroblockd_plane *const pd = &xd->plane[0];
 488   const int width = 4 * num_4x4_blocks_wide_lookup[bsize];
 489   const int height = 4 * num_4x4_blocks_high_lookup[bsize];
 490   int rate_sum = 0;
 491   int64_t dist_sum = 0;
 492   const int t = 4 << tx_size;
 493
 494   if (tx_size == TX_4X4) {
 495     bs = BLOCK_4X4;
 496   } else if (tx_size == TX_8X8) {
 497     bs = BLOCK_8X8;
 498   } else if (tx_size == TX_16X16) {
 499     bs = BLOCK_16X16;
 500   } else if (tx_size == TX_32X32) {
 501     bs = BLOCK_32X32;
 502   } else {
 503     assert(0);
 504   }
 505
 506   *out_skip = 1;
 507   for (j = 0; j < height; j += t) {
 508     for (k = 0; k < width; k += t) {
 509       int rate;
 510       int64_t dist;
 511       unsigned int sse;
 512       cpi->fn_ptr[bs].vf(&p->src.buf[j * p->src.stride + k], p->src.stride,
 513                          &pd->dst.buf[j * pd->dst.stride + k], pd->dst.stride,
 514                          &sse);
 515       // sse works better than var, since there is no dc prediction used
 516       vp9_model_rd_from_var_lapndz(sse, t * t, pd->dequant[1] >> 3,
 517                                    &rate, &dist);
 518       rate_sum += rate;
 519       dist_sum += dist;
 520       *out_skip &= (rate < 1024);
 521     }
 522   }
 523
 524   *out_rate_sum = rate_sum;
 525   *out_dist_sum = dist_sum << 4;
 526 }
 527
 528 int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff,
 529                           intptr_t block_size, int64_t *ssz) {
 530   int i;
 531   int64_t error = 0, sqcoeff = 0;
 532
 533   for (i = 0; i < block_size; i++) {
 534     const int diff = coeff[i] - dqcoeff[i];
 535     error +=  diff * diff;
 536     sqcoeff += coeff[i] * coeff[i];
 537   }
 538
 539   *ssz = sqcoeff;
 540   return error;
 541 }
 542
 543 /* The trailing '0' is a terminator which is used inside cost_coeffs() to
 544  * decide whether to include cost of a trailing EOB node or not (i.e. we
 545  * can skip this if the last coefficient in this transform block, e.g. the
 546  * 16th coefficient in a 4x4 block or the 64th coefficient in a 8x8 block,
 547  * were non-zero). */
 548 static const int16_t band_counts[TX_SIZES][8] = {  // One row per transform size; cost_coeffs() starts reading at column 1.
 549   { 1, 2, 3, 4,  3,   16 - 13, 0 },  // Listed counts sum to the coefficient count (16 here); unlisted slots are 0.
 550   { 1, 2, 3, 4, 11,   64 - 21, 0 },
 551   { 1, 2, 3, 4, 11,  256 - 21, 0 },
 552   { 1, 2, 3, 4, 11, 1024 - 21, 0 },
 553 };
 554 static INLINE int cost_coeffs(MACROBLOCK *x,
 555                               int plane, int block,
 556                               ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L,
 557                               TX_SIZE tx_size,
 558                               const int16_t *scan, const int16_t *nb,
 559                               int use_fast_coef_costing) {
 560   MACROBLOCKD *const xd = &x->e_mbd;
 561   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
 562   const struct macroblock_plane *p = &x->plane[plane];
 563   const struct macroblockd_plane *pd = &xd->plane[plane];
 564   const PLANE_TYPE type = pd->plane_type;
 565   const int16_t *band_count = &band_counts[tx_size][1];
 566   const int eob = p->eobs[block];
 567   const int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
 568   unsigned int (*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
 569                    x->token_costs[tx_size][type][is_inter_block(mbmi)];
 570   uint8_t token_cache[32 * 32];
 571   int pt = combine_entropy_contexts(*A, *L);
 572   int c, cost;
 573   // Check for consistency of tx_size with mode info
 574   assert(type == PLANE_TYPE_Y ? mbmi->tx_size == tx_size
 575                               : get_uv_tx_size(mbmi) == tx_size);
 576
 577   if (eob == 0) {
 578     // single eob token
 579     cost = token_costs[0][0][pt][EOB_TOKEN];
 580     c = 0;
 581   } else {
 582     int band_left = *band_count++;
 583
 584     // dc token
 585     int v = qcoeff[0];
 586     int prev_t = vp9_dct_value_tokens_ptr[v].token;
 587     cost = (*token_costs)[0][pt][prev_t] + vp9_dct_value_cost_ptr[v];
 588     token_cache[0] = vp9_pt_energy_class[prev_t];
 589     ++token_costs;
 590
 591     // ac tokens
 592     for (c = 1; c < eob; c++) {
 593       const int rc = scan[c];
 594       int t;
 595
 596       v = qcoeff[rc];
 597       t = vp9_dct_value_tokens_ptr[v].token;
 598       if (use_fast_coef_costing) {
 599         cost += (*token_costs)[!prev_t][!prev_t][t] + vp9_dct_value_cost_ptr[v];
 600       } else {
 601         pt = get_coef_context(nb, token_cache, c);
 602         cost += (*token_costs)[!prev_t][pt][t] + vp9_dct_value_cost_ptr[v];
 603         token_cache[rc] = vp9_pt_energy_class[t];
 604       }
 605       prev_t = t;
 606       if (!--band_left) {
 607         band_left = *band_count++;
 608         ++token_costs;
 609       }
 610     }
 611
 612     // eob token
 613     if (band_left) {
 614       if (use_fast_coef_costing) {
 615         cost += (*token_costs)[0][!prev_t][EOB_TOKEN];
 616       } else {
 617         pt = get_coef_context(nb, token_cache, c);
 618         cost += (*token_costs)[0][pt][EOB_TOKEN];
 619       }
 620     }
 621   }
 622
 623   // is eob first coefficient;
 624   *A = *L = (c > 0);
 625
 626   return cost;
 627 }
 628 static void dist_block(int plane, int block, TX_SIZE tx_size,
 629                        struct rdcost_block_args* args) {
 630   const int ss_txfrm_size = tx_size << 1;
 631   MACROBLOCK* const x = args->x;
 632   MACROBLOCKD* const xd = &x->e_mbd;
 633   const struct macroblock_plane *const p = &x->plane[plane];
 634   const struct macroblockd_plane *const pd = &xd->plane[plane];
 635   int64_t this_sse;
 636   int shift = tx_size == TX_32X32 ? 0 : 2;
 637   int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);
 638   int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
 639   args->dist = vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
 640                                &this_sse) >> shift;
 641   args->sse  = this_sse >> shift;
 642
 643   if (x->skip_encode && !is_inter_block(&xd->mi[0]->mbmi)) {
 644     // TODO(jingning): tune the model to better capture the distortion.
 645     int64_t p = (pd->dequant[1] * pd->dequant[1] *
 646                     (1 << ss_txfrm_size)) >> (shift + 2);
 647     args->dist += (p >> 4);
 648     args->sse  += p;
 649   }
 650 }
 651
 652 static void rate_block(int plane, int block, BLOCK_SIZE plane_bsize,
 653                        TX_SIZE tx_size, struct rdcost_block_args* args) {
 654   int x_idx, y_idx;
 655   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x_idx, &y_idx);
 656
 657   args->rate = cost_coeffs(args->x, plane, block, args->t_above + x_idx,
 658                            args->t_left + y_idx, tx_size,
 659                            args->so->scan, args->so->neighbors,
 660                            args->use_fast_coef_costing);
 661 }
 662
 663 static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
 664                           TX_SIZE tx_size, void *arg) {
 665   struct rdcost_block_args *args = arg;
 666   MACROBLOCK *const x = args->x;
 667   MACROBLOCKD *const xd = &x->e_mbd;
 668   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
 669   int64_t rd1, rd2, rd;
 670
 671   if (args->skip)
 672     return;
 673
 674   if (!is_inter_block(mbmi))
 675     vp9_encode_block_intra(x, plane, block, plane_bsize, tx_size, &mbmi->skip);
 676   else
 677     vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
 678
 679   dist_block(plane, block, tx_size, args);
 680   rate_block(plane, block, plane_bsize, tx_size, args);
 681   rd1 = RDCOST(x->rdmult, x->rddiv, args->rate, args->dist);
 682   rd2 = RDCOST(x->rdmult, x->rddiv, 0, args->sse);
 683
 684   // TODO(jingning): temporarily enabled only for luma component
 685   rd = MIN(rd1, rd2);
 686   if (plane == 0)
 687     x->zcoeff_blk[tx_size][block] = !x->plane[plane].eobs[block] ||
 688                                     (rd1 > rd2 && !xd->lossless);
 689
 690   args->this_rate += args->rate;
 691   args->this_dist += args->dist;
 692   args->this_sse  += args->sse;
 693   args->this_rd += rd;
 694
 695   if (args->this_rd > args->best_rd) {
 696     args->skip = 1;
 697     return;
 698   }
 699 }
 700
 701 void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size,
 702                               const struct macroblockd_plane *pd,
 703                               ENTROPY_CONTEXT t_above[16],
 704                               ENTROPY_CONTEXT t_left[16]) {
 705   const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
 706   const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
 707   const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
 708   const ENTROPY_CONTEXT *const above = pd->above_context;
 709   const ENTROPY_CONTEXT *const left = pd->left_context;
 710
 711   int i;
 712   switch (tx_size) {
 713     case TX_4X4:
 714       vpx_memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w);
 715       vpx_memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h);
 716       break;
 717     case TX_8X8:
 718       for (i = 0; i < num_4x4_w; i += 2)
 719         t_above[i] = !!*(const uint16_t *)&above[i];
 720       for (i = 0; i < num_4x4_h; i += 2)
 721         t_left[i] = !!*(const uint16_t *)&left[i];
 722       break;
 723     case TX_16X16:
 724       for (i = 0; i < num_4x4_w; i += 4)
 725         t_above[i] = !!*(const uint32_t *)&above[i];
 726       for (i = 0; i < num_4x4_h; i += 4)
 727         t_left[i] = !!*(const uint32_t *)&left[i];
 728       break;
 729     case TX_32X32:
 730       for (i = 0; i < num_4x4_w; i += 8)
 731         t_above[i] = !!*(const uint64_t *)&above[i];
 732       for (i = 0; i < num_4x4_h; i += 8)
 733         t_left[i] = !!*(const uint64_t *)&left[i];
 734       break;
 735     default:
 736       assert(0 && "Invalid transform size.");
 737   }
 738 }
 739
 740 static void txfm_rd_in_plane(MACROBLOCK *x,
 741                              int *rate, int64_t *distortion,
 742                              int *skippable, int64_t *sse,
 743                              int64_t ref_best_rd, int plane,
 744                              BLOCK_SIZE bsize, TX_SIZE tx_size,
 745                              int use_fast_coef_casting) {
 746   MACROBLOCKD *const xd = &x->e_mbd;
 747   const struct macroblockd_plane *const pd = &xd->plane[plane];
 748   struct rdcost_block_args args = { 0 };
 749   args.x = x;
 750   args.best_rd = ref_best_rd;
 751   args.use_fast_coef_costing = use_fast_coef_casting;
 752
 753   if (plane == 0)
 754     xd->mi[0]->mbmi.tx_size = tx_size;
 755
 756   vp9_get_entropy_contexts(bsize, tx_size, pd, args.t_above, args.t_left);
 757
 758   args.so = get_scan(xd, tx_size, pd->plane_type, 0);
 759
 760   vp9_foreach_transformed_block_in_plane(xd, bsize, plane,
 761                                          block_rd_txfm, &args);
 762   if (args.skip) {
 763     *rate       = INT_MAX;
 764     *distortion = INT64_MAX;
 765     *sse        = INT64_MAX;
 766     *skippable  = 0;
 767   } else {
 768     *distortion = args.this_dist;
 769     *rate       = args.this_rate;
 770     *sse        = args.this_sse;
 771     *skippable  = vp9_is_skippable_in_plane(x, bsize, plane);
 772   }
 773 }
 774
     // Picks the largest transform size permitted for block size bs and
     // evaluates plane 0 with it; no search over sizes is performed.
     //
     // The size is MIN(max_txsize_lookup[bs], largest size allowed by
     // cm->tx_mode) and is stored in mbmi->tx_size. rate, distortion and skip
     // receive the result of txfm_rd_in_plane(). sse is treated as an array
     // indexed by transform size: only sse[mbmi->tx_size] is written.
     // cpi->tx_stepdown_count[0] is incremented on every call.
 775 static void choose_largest_txfm_size(VP9_COMP *cpi, MACROBLOCK *x,
 776                                      int *rate, int64_t *distortion,
 777                                      int *skip, int64_t *sse,
 778                                      int64_t ref_best_rd,
 779                                      BLOCK_SIZE bs) {
 780   const TX_SIZE max_tx_size = max_txsize_lookup[bs];
 781   VP9_COMMON *const cm = &cpi->common;
 782   const TX_SIZE largest_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode];
 783   MACROBLOCKD *const xd = &x->e_mbd;
 784   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
 785
 786   mbmi->tx_size = MIN(max_tx_size, largest_tx_size);
 787
 788   txfm_rd_in_plane(x, rate, distortion, skip,
 789                    &sse[mbmi->tx_size], ref_best_rd, 0, bs,
 790                    mbmi->tx_size, cpi->sf.use_fast_coef_costing);
 791   cpi->tx_stepdown_count[0]++;
 792 }
 793
     // Selects the transform size for the current block from per-size RD
     // results that the caller has already computed.
     //
     // Inputs (indexed by transform size n, TX_4X4..max_txsize_lookup[bs]):
     //   r[n][0] - rate without the cost of signalling the transform size.
     //   d[n]    - distortion (INT64_MAX marks an invalid entry).
     //   s[n]    - non-zero if the block is skippable at this size.
     // Written by this function:
     //   r[n][1]  - r[n][0] plus the cost of signalling size n.
     //   mbmi->tx_size - best_tx when cm->tx_mode is TX_MODE_SELECT, otherwise
     //                   MIN(max size for bs, largest size of cm->tx_mode).
     //   *rate, *distortion, *skip - values of the chosen size; *rate uses
     //                   column 1 of r only under TX_MODE_SELECT.
     //   tx_cache[]    - RD cost per transform mode.
     //   cpi->tx_stepdown_count[] - one counter is incremented per call.
 794 static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
 795                                      int (*r)[2], int *rate,
 796                                      int64_t *d, int64_t *distortion,
 797                                      int *s, int *skip,
 798                                      int64_t tx_cache[TX_MODES],
 799                                      BLOCK_SIZE bs) {
 800   const TX_SIZE max_tx_size = max_txsize_lookup[bs];
 801   VP9_COMMON *const cm = &cpi->common;
 802   MACROBLOCKD *const xd = &x->e_mbd;
 803   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
 804   vp9_prob skip_prob = vp9_get_skip_prob(cm, xd);
       // rd[n][0] / rd[n][1]: RD cost built from r[n][0] / r[n][1]. Entries for
       // sizes above max_tx_size keep their INT64_MAX initial value.
 805   int64_t rd[TX_SIZES][2] = {{INT64_MAX, INT64_MAX},
 806                              {INT64_MAX, INT64_MAX},
 807                              {INT64_MAX, INT64_MAX},
 808                              {INT64_MAX, INT64_MAX}};
 809   int n, m;
 810   int s0, s1;
 811   const TX_SIZE max_mode_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode];
 812   int64_t best_rd = INT64_MAX;
 813   TX_SIZE best_tx = TX_4X4;
 814
 815   const vp9_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc.tx_probs);
 816   assert(skip_prob > 0);
       // Cost of coding the skip flag as 0 and as 1.
 817   s0 = vp9_cost_bit(skip_prob, 0);
 818   s1 = vp9_cost_bit(skip_prob, 1);
 819
 820   for (n = TX_4X4; n <= max_tx_size; n++) {
 821     r[n][1] = r[n][0];
         // Add the size-signalling cost: a "one" for every size below n and a
         // terminating "zero" at n, which is omitted when n is the maximum.
         // Skipped when the rate is the INT_MAX sentinel.
 822     if (r[n][0] < INT_MAX) {
 823       for (m = 0; m <= n - (n == max_tx_size); m++) {
 824         if (m == n)
 825           r[n][1] += vp9_cost_zero(tx_probs[m]);
 826         else
 827           r[n][1] += vp9_cost_one(tx_probs[m]);
 828       }
 829     }
 830     if (d[n] == INT64_MAX) {
 831       rd[n][0] = rd[n][1] = INT64_MAX;
 832     } else if (s[n]) {
           // Skippable: only the skip flag is charged as rate.
 833       rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]);
 834     } else {
 835       rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]);
 836       rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]);
 837     }
 838
         // Strict comparison: on a tie the smaller transform size is kept.
 839     if (rd[n][1] < best_rd) {
 840       best_tx = n;
 841       best_rd = rd[n][1];
 842     }
 843   }
 844   mbmi->tx_size = cm->tx_mode == TX_MODE_SELECT ?
 845                       best_tx : MIN(max_tx_size, max_mode_tx_size);
 846
 847
 848   *distortion = d[mbmi->tx_size];
 849   *rate       = r[mbmi->tx_size][cm->tx_mode == TX_MODE_SELECT];
 850   *skip       = s[mbmi->tx_size];
 851
       // Fixed-size modes use the cost without size signalling (column 0); the
       // 16x16 and 32x32 modes are clamped to the largest size of this block.
 852   tx_cache[ONLY_4X4] = rd[TX_4X4][0];
 853   tx_cache[ALLOW_8X8] = rd[TX_8X8][0];
 854   tx_cache[ALLOW_16X16] = rd[MIN(max_tx_size, TX_16X16)][0];
 855   tx_cache[ALLOW_32X32] = rd[MIN(max_tx_size, TX_32X32)][0];
 856
       // TX_MODE_SELECT uses the cost including size signalling (column 1).
       // The counter index is the distance of the size from max_tx_size.
 857   if (max_tx_size == TX_32X32 && best_tx == TX_32X32) {
 858     tx_cache[TX_MODE_SELECT] = rd[TX_32X32][1];
 859     cpi->tx_stepdown_count[0]++;
 860   } else if (max_tx_size >= TX_16X16 && best_tx == TX_16X16) {
 861     tx_cache[TX_MODE_SELECT] = rd[TX_16X16][1];
 862     cpi->tx_stepdown_count[max_tx_size - TX_16X16]++;
 863   } else if (rd[TX_8X8][1] < rd[TX_4X4][1]) {
 864     tx_cache[TX_MODE_SELECT] = rd[TX_8X8][1];
 865     cpi->tx_stepdown_count[max_tx_size - TX_8X8]++;
 866   } else {
 867     tx_cache[TX_MODE_SELECT] = rd[TX_4X4][1];
 868     cpi->tx_stepdown_count[max_tx_size - TX_4X4]++;
 869   }
 870 }
 871
     // Returns RDCOST(rdmult, rddiv, rate, dist) multiplied by the floating
     // point factor scale; the product is converted back to int64_t by the cast.
 872 static int64_t scaled_rd_cost(int rdmult, int rddiv,
 873                               int rate, int64_t dist, double scale) {
 874   return (int64_t) (RDCOST(rdmult, rddiv, rate, dist) * scale);
 875 }
 876
     // Selects the transform size from per-size rate/distortion estimates and
     // then evaluates the chosen size with txfm_rd_in_plane().
     //
     // Inputs (indexed by transform size n, TX_4X4..max_txsize_lookup[bs]):
     //   r[n][0], d[n], s[n] - rate, distortion and skip flag for size n.
     // Written by this function:
     //   r[n][1]       - r[n][0] plus the cost of signalling size n.
     //   mbmi->tx_size - best_tx when cm->tx_mode is TX_MODE_SELECT, otherwise
     //                   MIN(max size for bs, largest size of cm->tx_mode).
     //   *rate, *distortion, *skip, sse[mbmi->tx_size] - outputs of
     //                   txfm_rd_in_plane() for the chosen size on plane 0.
     //   cpi->tx_stepdown_count[] - one counter is incremented per call.
     // Each size's RD cost is multiplied by scale_rd[n] before comparison.
     //
     // NOTE(review): r and d are used without any INT_MAX / INT64_MAX checks,
     // presumably because the estimates never carry sentinels -- confirm
     // against the caller.
 877 static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x,
 878                                           int (*r)[2], int *rate,
 879                                           int64_t *d, int64_t *distortion,
 880                                           int *s, int *skip, int64_t *sse,
 881                                           int64_t ref_best_rd,
 882                                           BLOCK_SIZE bs) {
 883   const TX_SIZE max_tx_size = max_txsize_lookup[bs];
 884   VP9_COMMON *const cm = &cpi->common;
 885   MACROBLOCKD *const xd = &x->e_mbd;
 886   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
 887   vp9_prob skip_prob = vp9_get_skip_prob(cm, xd);
 888   int64_t rd[TX_SIZES][2] = {{INT64_MAX, INT64_MAX},
 889                              {INT64_MAX, INT64_MAX},
 890                              {INT64_MAX, INT64_MAX},
 891                              {INT64_MAX, INT64_MAX}};
 892   int n, m;
 893   int s0, s1;
       // Per-size multiplier, indexed by transform size: the smaller the
       // transform, the larger the factor applied to its cost.
 894   double scale_rd[TX_SIZES] = {1.73, 1.44, 1.20, 1.00};
 895   const TX_SIZE max_mode_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode];
 896   int64_t best_rd = INT64_MAX;
 897   TX_SIZE best_tx = TX_4X4;
 898
 899   const vp9_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc.tx_probs);
 900   assert(skip_prob > 0);
       // Cost of coding the skip flag as 0 and as 1.
 901   s0 = vp9_cost_bit(skip_prob, 0);
 902   s1 = vp9_cost_bit(skip_prob, 1);
 903
 904   for (n = TX_4X4; n <= max_tx_size; n++) {
 905     double scale = scale_rd[n];
 906     r[n][1] = r[n][0];
         // Add the size-signalling cost: a "one" for every size below n and a
         // terminating "zero" at n, which is omitted when n is the maximum.
 907     for (m = 0; m <= n - (n == max_tx_size); m++) {
 908       if (m == n)
 909         r[n][1] += vp9_cost_zero(tx_probs[m]);
 910       else
 911         r[n][1] += vp9_cost_one(tx_probs[m]);
 912     }
 913     if (s[n]) {
           // Skippable: only the skip flag is charged as rate.
 914       rd[n][0] = rd[n][1] = scaled_rd_cost(x->rdmult, x->rddiv, s1, d[n],
 915                                            scale);
 916     } else {
 917       rd[n][0] = scaled_rd_cost(x->rdmult, x->rddiv, r[n][0] + s0, d[n],
 918                                 scale);
 919       rd[n][1] = scaled_rd_cost(x->rdmult, x->rddiv, r[n][1] + s0, d[n],
 920                                 scale);
 921     }
         // Strict comparison: on a tie the smaller transform size is kept.
 922     if (rd[n][1] < best_rd) {
 923       best_rd = rd[n][1];
 924       best_tx = n;
 925     }
 926   }
 927
 928   mbmi->tx_size = cm->tx_mode == TX_MODE_SELECT ?
 929                       best_tx : MIN(max_tx_size, max_mode_tx_size);
 930
 931   // Actually encode using the chosen mode if a model was used, but do not
 932   // update the r, d costs
 933   txfm_rd_in_plane(x, rate, distortion, skip,
 934                    &sse[mbmi->tx_size], ref_best_rd, 0, bs, mbmi->tx_size,
 935                    cpi->sf.use_fast_coef_costing);
 936
       // The counter index is the distance of the size from max_tx_size.
       // NOTE(review): the 8x8-vs-4x4 test below uses "<=" while the selection
       // loop above uses a strict "<", so on an exact tie the 8x8 counter is
       // bumped although best_tx stays TX_4X4 -- confirm this is intended.
 937   if (max_tx_size == TX_32X32 && best_tx == TX_32X32) {
 938     cpi->tx_stepdown_count[0]++;
 939   } else if (max_tx_size >= TX_16X16 &&  best_tx == TX_16X16) {
 940     cpi->tx_stepdown_count[max_tx_size - TX_16X16]++;
 941   } else if (rd[TX_8X8][1] <= rd[TX_4X4][1]) {
 942     cpi->tx_stepdown_count[max_tx_size - TX_8X8]++;
 943   } else {
 944     cpi->tx_stepdown_count[max_tx_size - TX_4X4]++;
 945   }
 946 }
 947
     // Plane-0 rate/distortion for an inter block, including the choice of
     // transform size according to cpi->sf.tx_size_search_method:
     //   USE_LARGESTALL              - txfm_cache is zeroed and the largest
     //                                 allowed size is used.
     //   USE_LARGESTINTRA_MODELINTER - every size is estimated with
     //                                 model_rd_for_sb_y_tx(), then
     //                                 choose_txfm_size_from_modelrd() picks
     //                                 and evaluates one. txfm_cache is not
     //                                 written on this path.
     //   otherwise                   - every size is evaluated with
     //                                 txfm_rd_in_plane(), then
     //                                 choose_txfm_size_from_rd() picks one and
     //                                 fills txfm_cache.
     // vp9_subtract_plane() is called for plane 0 before any evaluation.
     // rate, distortion and skip receive the result for the chosen size. psse
     // may be NULL; otherwise it receives sse[mbmi->tx_size].
 948 static void inter_super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
 949                                   int64_t *distortion, int *skip,
 950                                   int64_t *psse, BLOCK_SIZE bs,
 951                                   int64_t txfm_cache[TX_MODES],
 952                                   int64_t ref_best_rd) {
       // Per-transform-size scratch results; only the entries for the sizes
       // actually evaluated are written.
 953   int r[TX_SIZES][2], s[TX_SIZES];
 954   int64_t d[TX_SIZES], sse[TX_SIZES];
 955   MACROBLOCKD *xd = &x->e_mbd;
 956   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
 957   const TX_SIZE max_tx_size = max_txsize_lookup[bs];
 958   TX_SIZE tx_size;
 959
 960   assert(bs == mbmi->sb_type);
 961
 962   vp9_subtract_plane(x, bs, 0);
 963
 964   if (cpi->sf.tx_size_search_method == USE_LARGESTALL) {
 965     vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t));
 966     choose_largest_txfm_size(cpi, x, rate, distortion, skip, sse,
 967                              ref_best_rd, bs);
 968     if (psse)
 969       *psse = sse[mbmi->tx_size];
 970     return;
 971   }
 972
 973   if (cpi->sf.tx_size_search_method == USE_LARGESTINTRA_MODELINTER) {
 974     for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
 975       model_rd_for_sb_y_tx(cpi, bs, tx_size, x, xd,
 976                            &r[tx_size][0], &d[tx_size], &s[tx_size]);
 977     choose_txfm_size_from_modelrd(cpi, x, r, rate, d, distortion, s,
 978                                   skip, sse, ref_best_rd, bs);
 979   } else {
 980     for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
 981       txfm_rd_in_plane(x, &r[tx_size][0], &d[tx_size],
 982                        &s[tx_size], &sse[tx_size],
 983                        ref_best_rd, 0, bs, tx_size,
 984                        cpi->sf.use_fast_coef_costing);
 985     choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s,
 986                              skip, txfm_cache, bs);
 987   }
 988   if (psse)
 989     *psse = sse[mbmi->tx_size];
 990 }
 991
     // Plane-0 rate/distortion for an intra block, including the choice of
     // transform size.
     //
     // Unless cpi->sf.tx_size_search_method is USE_FULL_RD, txfm_cache is
     // zeroed and the largest allowed transform size is used. With USE_FULL_RD
     // every size up to max_txsize_lookup[bs] is evaluated with
     // txfm_rd_in_plane() and choose_txfm_size_from_rd() picks one and fills
     // txfm_cache.
     // rate, distortion and skip receive the result for the chosen size. psse
     // may be NULL; otherwise it receives sse[mbmi->tx_size].
 992 static void intra_super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
 993                                   int64_t *distortion, int *skip,
 994                                   int64_t *psse, BLOCK_SIZE bs,
 995                                   int64_t txfm_cache[TX_MODES],
 996                                   int64_t ref_best_rd) {
 997   int64_t sse[TX_SIZES];
 998   MACROBLOCKD *xd = &x->e_mbd;
 999   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
1000
1001   assert(bs == mbmi->sb_type);
1002   if (cpi->sf.tx_size_search_method != USE_FULL_RD) {
1003     vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t));
1004     choose_largest_txfm_size(cpi, x, rate, distortion, skip, sse,
1005                              ref_best_rd, bs);
1006   } else {
         // Per-transform-size scratch results for the full search.
1007     int r[TX_SIZES][2], s[TX_SIZES];
1008     int64_t d[TX_SIZES];
1009     TX_SIZE tx_size;
1010     for (tx_size = TX_4X4; tx_size <= max_txsize_lookup[bs]; ++tx_size)
1011       txfm_rd_in_plane(x, &r[tx_size][0], &d[tx_size],
1012                        &s[tx_size], &sse[tx_size],
1013                        ref_best_rd, 0, bs, tx_size,
1014                        cpi->sf.use_fast_coef_costing);
1015     choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s,
1016                              skip, txfm_cache, bs);
1017   }
1018   if (psse)
1019     *psse = sse[mbmi->tx_size];
1020 }
1021
1022
     // Decides whether an oblique intra mode can be skipped given the best
     // intra mode found so far.
     //
     // Returns 1 (skip) when mode is one of the four modes below and
     // best_intra_mode is neither of the two modes listed for it:
     //   D117_PRED: V_PRED, D135_PRED
     //   D63_PRED:  V_PRED, D45_PRED
     //   D207_PRED: H_PRED, D45_PRED
     //   D153_PRED: H_PRED, D135_PRED
     // Returns 0 for every other mode and whenever best_intra_mode matches.
1023 static int conditional_skipintra(PREDICTION_MODE mode,
1024                                  PREDICTION_MODE best_intra_mode) {
1025   if (mode == D117_PRED &&
1026       best_intra_mode != V_PRED &&
1027       best_intra_mode != D135_PRED)
1028     return 1;
1029   if (mode == D63_PRED &&
1030       best_intra_mode != V_PRED &&
1031       best_intra_mode != D45_PRED)
1032     return 1;
1033   if (mode == D207_PRED &&
1034       best_intra_mode != H_PRED &&
1035       best_intra_mode != D45_PRED)
1036     return 1;
1037   if (mode == D153_PRED &&
1038       best_intra_mode != H_PRED &&
1039       best_intra_mode != D135_PRED)
1040     return 1;
1041   return 0;
1042 }
1043
     // Searches the intra prediction modes DC_PRED..TM_PRED for sub-block ib
     // (0..3) of an 8x8 block and keeps the one with the lowest RD cost.
     //
     // bsize gives the sub-block shape; every 4x4 unit it covers is predicted,
     // transformed, quantized, costed and reconstructed in turn.
     //
     // In/out:
     //   best_mode       - read by the directional-mismatch test, written with
     //                     the winning mode.
     //   a, l            - above/left entropy contexts (two entries each);
     //                     updated to the winning mode's contexts.
     // Out (written only when some mode beats rd_thresh):
     //   bestrate        - mode cost plus coefficient rate.
     //   bestratey       - coefficient rate only.
     //   bestdistortion  - accumulated distortion.
     // Returns the best RD cost found, or rd_thresh if no mode was better.
     // The transform size in the mode info is set to TX_4X4 as a side effect.
1044 static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
1045                                      PREDICTION_MODE *best_mode,
1046                                      const int *bmode_costs,
1047                                      ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
1048                                      int *bestrate, int *bestratey,
1049                                      int64_t *bestdistortion,
1050                                      BLOCK_SIZE bsize, int64_t rd_thresh) {
1051   PREDICTION_MODE mode;
1052   MACROBLOCKD *const xd = &x->e_mbd;
1053   int64_t best_rd = rd_thresh;
1054
1055   struct macroblock_plane *p = &x->plane[0];
1056   struct macroblockd_plane *pd = &xd->plane[0];
1057   const int src_stride = p->src.stride;
1058   const int dst_stride = pd->dst.stride;
1059   const uint8_t *src_init = &p->src.buf[raster_block_offset(BLOCK_8X8, ib,
1060                                                             src_stride)];
1061   uint8_t *dst_init = &pd->dst.buf[raster_block_offset(BLOCK_8X8, ib,
1062                                                        dst_stride)];
       // ta/tl: contexts on entry; tempa/templ: working copies for one mode.
1063   ENTROPY_CONTEXT ta[2], tempa[2];
1064   ENTROPY_CONTEXT tl[2], templ[2];
1065
1066   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
1067   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
1068   int idx, idy;
       // Reconstruction of the best mode so far, stored with a stride of 8.
1069   uint8_t best_dst[8 * 8];
1070
1071   assert(ib < 4);
1072
1073   vpx_memcpy(ta, a, sizeof(ta));
1074   vpx_memcpy(tl, l, sizeof(tl));
1075   xd->mi[0]->mbmi.tx_size = TX_4X4;
1076
1077   for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
1078     int64_t this_rd;
1079     int ratey = 0;
1080     int64_t distortion = 0;
1081     int rate = bmode_costs[mode];
1082
         // Modes disabled by the speed-feature mask are not tried.
1083     if (!(cpi->sf.intra_y_mode_mask[TX_4X4] & (1 << mode)))
1084       continue;
1085
1086     // Only do the oblique modes if the best so far is
1087     // one of the neighboring directional modes
1088     if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
1089       if (conditional_skipintra(mode, *best_mode))
1090           continue;
1091     }
1092
         // Every mode starts from the entry contexts.
1093     vpx_memcpy(tempa, ta, sizeof(ta));
1094     vpx_memcpy(templ, tl, sizeof(tl));
1095
1096     for (idy = 0; idy < num_4x4_blocks_high; ++idy) {
1097       for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {
1098         const int block = ib + idy * 2 + idx;
1099         const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];
1100         uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
1101         int16_t *const src_diff = raster_block_offset_int16(BLOCK_8X8, block,
1102                                                             p->src_diff);
1103         int16_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
1104         xd->mi[0]->bmi[block].as_mode = mode;
             // With x->skip_encode set the predictor is built from the source
             // buffer instead of the destination buffer.
1105         vp9_predict_intra_block(xd, block, 1,
1106                                 TX_4X4, mode,
1107                                 x->skip_encode ? src : dst,
1108                                 x->skip_encode ? src_stride : dst_stride,
1109                                 dst, dst_stride, idx, idy, 0);
1110         vp9_subtract_block(4, 4, src_diff, 8, src, src_stride, dst, dst_stride);
1111
1112         if (xd->lossless) {
               // Lossless path: vp9_fwht4x4 / vp9_iwht4x4_add with the default
               // scan order. distortion is not accumulated here and stays 0.
1113           const scan_order *so = &vp9_default_scan_orders[TX_4X4];
1114           vp9_fwht4x4(src_diff, coeff, 8);
1115           vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
1116           ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
1117                                so->scan, so->neighbors,
1118                                cpi->sf.use_fast_coef_costing);
               // Give up on this mode as soon as it cannot beat the best.
1119           if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
1120             goto next;
1121           vp9_iwht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block), dst, dst_stride,
1122                           p->eobs[block]);
1123         } else {
               // Lossy path: transform type and scan order depend on the block.
1124           int64_t unused;
1125           const TX_TYPE tx_type = get_tx_type_4x4(PLANE_TYPE_Y, xd, block);
1126           const scan_order *so = &vp9_scan_orders[TX_4X4][tx_type];
1127           vp9_fht4x4(src_diff, coeff, 8, tx_type);
1128           vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
1129           ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
1130                              so->scan, so->neighbors,
1131                              cpi->sf.use_fast_coef_costing);
1132           distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block),
1133                                         16, &unused) >> 2;
               // Give up on this mode as soon as it cannot beat the best.
1134           if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
1135             goto next;
1136           vp9_iht4x4_add(tx_type, BLOCK_OFFSET(pd->dqcoeff, block),
1137                          dst, dst_stride, p->eobs[block]);
1138         }
1139       }
1140     }
1141
1142     rate += ratey;
1143     this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
1144
1145     if (this_rd < best_rd) {
1146       *bestrate = rate;
1147       *bestratey = ratey;
1148       *bestdistortion = distortion;
1149       best_rd = this_rd;
1150       *best_mode = mode;
1151       vpx_memcpy(a, tempa, sizeof(tempa));
1152       vpx_memcpy(l, templ, sizeof(templ));
           // Save the reconstruction; later modes overwrite the destination.
1153       for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy)
1154         vpx_memcpy(best_dst + idy * 8, dst_init + idy * dst_stride,
1155                    num_4x4_blocks_wide * 4);
1156     }
1157   next:
1158     {}
1159   }
1160
       // Nothing to restore if no mode won or if x->skip_encode is set.
1161   if (best_rd >= rd_thresh || x->skip_encode)
1162     return best_rd;
1163
       // Put the winning mode's reconstruction back into the destination.
1164   for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy)
1165     vpx_memcpy(dst_init + idy * dst_stride, best_dst + idy * 8,
1166                num_4x4_blocks_wide * 4);
1167
1168   return best_rd;
1169 }
1170
     // Picks an intra mode for every sub-block of an 8x8 block whose partition
     // size (mbmi.sb_type) is smaller than 8x8, and sums up the cost.
     //
     // Out (written only on success):
     //   *rate       - total rate including mode costs.
     //   *rate_y     - total coefficient rate.
     //   *distortion - total distortion.
     // The chosen modes are stored in mic->bmi[].as_mode and mic->mbmi.mode is
     // set to the mode of sub-block 3.
     // Returns the RD cost of the totals, or INT64_MAX as soon as the running
     // cost reaches best_rd.
1171 static int64_t rd_pick_intra_sub_8x8_y_mode(VP9_COMP *cpi, MACROBLOCK *mb,
1172                                             int *rate, int *rate_y,
1173                                             int64_t *distortion,
1174                                             int64_t best_rd) {
1175   int i, j;
1176   const MACROBLOCKD *const xd = &mb->e_mbd;
1177   MODE_INFO *const mic = xd->mi[0];
1178   const MODE_INFO *above_mi = xd->mi[-xd->mi_stride];
1179   const MODE_INFO *left_mi = xd->left_available ? xd->mi[-1] : NULL;
1180   const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
1181   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
1182   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
1183   int idx, idy;
1184   int cost = 0;
1185   int64_t total_distortion = 0;
1186   int tot_rate_y = 0;
1187   int64_t total_rd = 0;
       // Local copies of the plane-0 entropy contexts; the search updates
       // these, not the originals.
1188   ENTROPY_CONTEXT t_above[4], t_left[4];
1189   const int *bmode_costs = cpi->mbmode_cost;
1190
1191   vpx_memcpy(t_above, xd->plane[0].above_context, sizeof(t_above));
1192   vpx_memcpy(t_left, xd->plane[0].left_context, sizeof(t_left));
1193
1194   // Pick modes for each sub-block (of size 4x4, 4x8, or 8x4) in an 8x8 block.
1195   for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
1196     for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
1197       PREDICTION_MODE best_mode = DC_PRED;
1198       int r = INT_MAX, ry = INT_MAX;
1199       int64_t d = INT64_MAX, this_rd = INT64_MAX;
1200       i = idy * 2 + idx;
           // On key frames the mode cost table depends on the modes of the
           // above and left neighbours.
1201       if (cpi->common.frame_type == KEY_FRAME) {
1202         const PREDICTION_MODE A = vp9_above_block_mode(mic, above_mi, i);
1203         const PREDICTION_MODE L = vp9_left_block_mode(mic, left_mi, i);
1204
1205         bmode_costs  = cpi->y_mode_costs[A][L];
1206       }
1207
           // The sub-block search gets only the budget that is still left.
1208       this_rd = rd_pick_intra4x4block(cpi, mb, i, &best_mode, bmode_costs,
1209                                       t_above + idx, t_left + idy, &r, &ry, &d,
1210                                       bsize, best_rd - total_rd);
1211       if (this_rd >= best_rd - total_rd)
1212         return INT64_MAX;
1213
1214       total_rd += this_rd;
1215       cost += r;
1216       total_distortion += d;
1217       tot_rate_y += ry;
1218
           // Replicate the mode into the other 4x4 entries of this sub-block.
1219       mic->bmi[i].as_mode = best_mode;
1220       for (j = 1; j < num_4x4_blocks_high; ++j)
1221         mic->bmi[i + j * 2].as_mode = best_mode;
1222       for (j = 1; j < num_4x4_blocks_wide; ++j)
1223         mic->bmi[i + j].as_mode = best_mode;
1224
1225       if (total_rd >= best_rd)
1226         return INT64_MAX;
1227     }
1228   }
1229
1230   *rate = cost;
1231   *rate_y = tot_rate_y;
1232   *distortion = total_distortion;
1233   mic->mbmi.mode = mic->bmi[3].as_mode;
1234
1235   return RDCOST(mb->rdmult, mb->rddiv, cost, total_distortion);
1236 }
1237
     // Searches the intra prediction modes DC_PRED..TM_PRED for plane 0 of a
     // whole block and keeps the one with the lowest RD cost below best_rd.
     //
     // Out (written only when some mode beats the incoming best_rd):
     //   *rate           - coefficient rate plus mode cost.
     //   *rate_tokenonly - coefficient rate only.
     //   *distortion, *skippable - values of the winning mode.
     // tx_cache is written only when cpi->sf.tx_size_search_method is
     // USE_FULL_RD: it is reset to INT64_MAX and then lowered per transform
     // mode as modes are evaluated.
     // On return mic->mbmi.mode holds the selected mode (DC_PRED if none won)
     // and mic->mbmi.tx_size the matching transform size (TX_4X4 if none won).
     // Returns the best RD cost, which equals the incoming best_rd when no mode
     // improved on it.
1238 static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
1239                                       int *rate, int *rate_tokenonly,
1240                                       int64_t *distortion, int *skippable,
1241                                       BLOCK_SIZE bsize,
1242                                       int64_t tx_cache[TX_MODES],
1243                                       int64_t best_rd) {
1244   PREDICTION_MODE mode;
1245   PREDICTION_MODE mode_selected = DC_PRED;
1246   MACROBLOCKD *const xd = &x->e_mbd;
1247   MODE_INFO *const mic = xd->mi[0];
1248   int this_rate, this_rate_tokenonly, s;
1249   int64_t this_distortion, this_rd;
1250   TX_SIZE best_tx = TX_4X4;
1251   int i;
1252   int *bmode_costs = cpi->mbmode_cost;
1253
1254   if (cpi->sf.tx_size_search_method == USE_FULL_RD)
1255     for (i = 0; i < TX_MODES; i++)
1256       tx_cache[i] = INT64_MAX;
1257
1258   /* Y Search for intra prediction mode */
1259   for (mode = DC_PRED; mode <= TM_PRED; mode++) {
1260     int64_t local_tx_cache[TX_MODES];
1261     MODE_INFO *above_mi = xd->mi[-xd->mi_stride];
1262     MODE_INFO *left_mi = xd->left_available ? xd->mi[-1] : NULL;
1263
         // Modes disabled by the speed-feature mask are not tried.
1264     if (!(cpi->sf.intra_y_mode_mask[max_txsize_lookup[bsize]] & (1 << mode)))
1265       continue;
1266
         // On key frames the mode cost table depends on the modes of the
         // above and left neighbours.
1267     if (cpi->common.frame_type == KEY_FRAME) {
1268       const PREDICTION_MODE A = vp9_above_block_mode(mic, above_mi, 0);
1269       const PREDICTION_MODE L = vp9_left_block_mode(mic, left_mi, 0);
1270
1271       bmode_costs = cpi->y_mode_costs[A][L];
1272     }
1273     mic->mbmi.mode = mode;
1274
1275     intra_super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion,
1276         &s, NULL, bsize, local_tx_cache, best_rd);
1277
         // INT_MAX is the "no valid result" sentinel for the rate.
1278     if (this_rate_tokenonly == INT_MAX)
1279       continue;
1280
1281     this_rate = this_rate_tokenonly + bmode_costs[mode];
1282     this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
1283
1284     if (this_rd < best_rd) {
1285       mode_selected   = mode;
1286       best_rd         = this_rd;
1287       best_tx         = mic->mbmi.tx_size;
1288       *rate           = this_rate;
1289       *rate_tokenonly = this_rate_tokenonly;
1290       *distortion     = this_distortion;
1291       *skippable      = s;
1292     }
1293
         // Re-base this mode's cost onto each transform mode and keep the
         // minimum; the loop stops at the first INT64_MAX cache entry.
1294     if (cpi->sf.tx_size_search_method == USE_FULL_RD && this_rd < INT64_MAX) {
1295       for (i = 0; i < TX_MODES && local_tx_cache[i] < INT64_MAX; i++) {
1296         const int64_t adj_rd = this_rd + local_tx_cache[i] -
1297             local_tx_cache[cpi->common.tx_mode];
1298         if (adj_rd < tx_cache[i]) {
1299           tx_cache[i] = adj_rd;
1300         }
1301       }
1302     }
1303   }
1304
1305   mic->mbmi.mode = mode_selected;
1306   mic->mbmi.tx_size = best_tx;
1307
1308   return best_rd;
1309 }
1310
     // Sums rate, distortion, SSE and skip status over planes
     // 1..MAX_MB_PLANE-1, using the transform size from get_uv_tx_size().
     //
     // For inter blocks vp9_subtract_plane() is called for these planes first.
     // Out: *rate, *distortion, *sse are the sums over the planes and
     //      *skippable is the AND of the per-plane skip flags.
     // If ref_best_rd is negative, or any plane reports rate INT_MAX, the
     // outputs are the sentinels INT_MAX / INT64_MAX / INT64_MAX with
     // *skippable = 0.
1311 static void super_block_uvrd(const VP9_COMP *cpi, MACROBLOCK *x,
1312                              int *rate, int64_t *distortion, int *skippable,
1313                              int64_t *sse, BLOCK_SIZE bsize,
1314                              int64_t ref_best_rd) {
1315   MACROBLOCKD *const xd = &x->e_mbd;
1316   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
1317   TX_SIZE uv_txfm_size = get_uv_tx_size(mbmi);
1318   int plane;
1319   int pnrate = 0, pnskip = 1;
1320   int64_t pndist = 0, pnsse = 0;
1321
1322   if (ref_best_rd < 0)
1323     goto term;
1324
1325   if (is_inter_block(mbmi)) {
         // NOTE(review): this declaration shadows the function-level "plane".
1326     int plane;
1327     for (plane = 1; plane < MAX_MB_PLANE; ++plane)
1328       vp9_subtract_plane(x, bsize, plane);
1329   }
1330
1331   *rate = 0;
1332   *distortion = 0;
1333   *sse = 0;
1334   *skippable = 1;
1335
1336   for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
1337     txfm_rd_in_plane(x, &pnrate, &pndist, &pnskip, &pnsse,
1338                      ref_best_rd, plane, bsize, uv_txfm_size,
1339                      cpi->sf.use_fast_coef_costing);
         // A plane without a valid result invalidates the whole block.
1340     if (pnrate == INT_MAX)
1341       goto term;
1342     *rate += pnrate;
1343     *distortion += pndist;
1344     *sse += pnsse;
1345     *skippable &= pnskip;
1346   }
1347   return;
1348
       // Early-termination exit: report "no valid result" through sentinels.
1349   term:
1350   *rate = INT_MAX;
1351   *distortion = INT64_MAX;
1352   *sse = INT64_MAX;
1353   *skippable = 0;
1354   return;
1355 }
1356
     // Searches the intra prediction modes DC_PRED..TM_PRED for planes
     // 1..MAX_MB_PLANE-1 and keeps the one with the lowest RD cost.
     //
     // max_tx_size selects the row of cpi->sf.intra_uv_mode_mask used to filter
     // the candidate modes.
     // Out (written only when at least one mode yields a valid result):
     //   *rate           - coefficient rate plus mode cost.
     //   *rate_tokenonly - coefficient rate only.
     //   *distortion, *skippable - values of the winning mode.
     // On return mbmi.uv_mode holds the selected mode (DC_PRED if none won).
     // Returns the best RD cost, or INT64_MAX if no mode produced a result.
1357 static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
1358                                        PICK_MODE_CONTEXT *ctx,
1359                                        int *rate, int *rate_tokenonly,
1360                                        int64_t *distortion, int *skippable,
1361                                        BLOCK_SIZE bsize, TX_SIZE max_tx_size) {
1362   MACROBLOCKD *xd = &x->e_mbd;
1363   PREDICTION_MODE mode;
1364   PREDICTION_MODE mode_selected = DC_PRED;
1365   int64_t best_rd = INT64_MAX, this_rd;
1366   int this_rate_tokenonly, this_rate, s;
1367   int64_t this_distortion, this_sse;
1368
1369   for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
         // Modes disabled by the speed-feature mask are not tried.
1370     if (!(cpi->sf.intra_uv_mode_mask[max_tx_size] & (1 << mode)))
1371       continue;
1372
1373     xd->mi[0]->mbmi.uv_mode = mode;
1374
         // The best cost so far is passed down as the budget.
1375     super_block_uvrd(cpi, x, &this_rate_tokenonly,
1376                      &this_distortion, &s, &this_sse, bsize, best_rd);
1377     if (this_rate_tokenonly == INT_MAX)
1378       continue;
1379     this_rate = this_rate_tokenonly +
1380                 cpi->intra_uv_mode_cost[cpi->common.frame_type][mode];
1381     this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
1382
1383     if (this_rd < best_rd) {
1384       mode_selected   = mode;
1385       best_rd         = this_rd;
1386       *rate           = this_rate;
1387       *rate_tokenonly = this_rate_tokenonly;
1388       *distortion     = this_distortion;
1389       *skippable      = s;
1390       if (!x->select_txfm_size) {
1391         int i;
1392         struct macroblock_plane *const p = x->plane;
1393         struct macroblockd_plane *const pd = xd->plane;
             // For each of these planes: swap entries 0 and 2 of the ctx
             // buffer tables and point the plane at the former entry 2.
             // NOTE(review): presumably this keeps the winning mode's
             // coefficients from being overwritten by later modes -- confirm.
1394         for (i = 1; i < MAX_MB_PLANE; ++i) {
1395           p[i].coeff    = ctx->coeff_pbuf[i][2];
1396           p[i].qcoeff   = ctx->qcoeff_pbuf[i][2];
1397           pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][2];
1398           p[i].eobs    = ctx->eobs_pbuf[i][2];
1399
1400           ctx->coeff_pbuf[i][2]   = ctx->coeff_pbuf[i][0];
1401           ctx->qcoeff_pbuf[i][2]  = ctx->qcoeff_pbuf[i][0];
1402           ctx->dqcoeff_pbuf[i][2] = ctx->dqcoeff_pbuf[i][0];
1403           ctx->eobs_pbuf[i][2]    = ctx->eobs_pbuf[i][0];
1404
1405           ctx->coeff_pbuf[i][0]   = p[i].coeff;
1406           ctx->qcoeff_pbuf[i][0]  = p[i].qcoeff;
1407           ctx->dqcoeff_pbuf[i][0] = pd[i].dqcoeff;
1408           ctx->eobs_pbuf[i][0]    = p[i].eobs;
1409         }
1410       }
1411     }
1412   }
1413
1414   xd->mi[0]->mbmi.uv_mode = mode_selected;
1415   return best_rd;
1416 }
1417
     // Evaluates only DC_PRED for planes 1..MAX_MB_PLANE-1.
     //
     // Sets mbmi.uv_mode to DC_PRED, runs super_block_uvrd() with an INT64_MAX
     // budget and discards the SSE it reports.
     // Out: *rate_tokenonly, *distortion, *skippable come from
     //      super_block_uvrd(); *rate is *rate_tokenonly plus the DC_PRED mode
     //      cost for the current frame type.
     // Returns the RD cost of *rate and *distortion.
1418 static int64_t rd_sbuv_dcpred(const VP9_COMP *cpi, MACROBLOCK *x,
1419                               int *rate, int *rate_tokenonly,
1420                               int64_t *distortion, int *skippable,
1421                               BLOCK_SIZE bsize) {
1422   const VP9_COMMON *cm = &cpi->common;
1423   int64_t unused;
1424
1425   x->e_mbd.mi[0]->mbmi.uv_mode = DC_PRED;
1426   super_block_uvrd(cpi, x, rate_tokenonly, distortion,
1427                    skippable, &unused, bsize, INT64_MAX);
1428   *rate = *rate_tokenonly + cpi->intra_uv_mode_cost[cm->frame_type][DC_PRED];
1429   return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
1430 }
1431
     // Chooses the intra mode for planes 1..MAX_MB_PLANE-1 of the block in
     // cpi->mb, either by the DC_PRED estimate or by a full mode search,
     // depending on cpi->sf.use_uv_intra_rd_estimate.
     //
     // Block sizes below BLOCK_8X8 are evaluated as BLOCK_8X8.
     // Out: *rate_uv, *rate_uv_tokenonly, *dist_uv, *skip_uv as written by the
     //      selected helper; *mode_uv is the resulting mbmi.uv_mode.
1432 static void choose_intra_uv_mode(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
1433                                  BLOCK_SIZE bsize, TX_SIZE max_tx_size,
1434                                  int *rate_uv, int *rate_uv_tokenonly,
1435                                  int64_t *dist_uv, int *skip_uv,
1436                                  PREDICTION_MODE *mode_uv) {
1437   MACROBLOCK *const x = &cpi->mb;
1438
1439   // Use an estimated rd for uv_intra based on DC_PRED if the
1440   // appropriate speed flag is set.
1441   if (cpi->sf.use_uv_intra_rd_estimate) {
1442     rd_sbuv_dcpred(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv,
1443                    skip_uv, bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize);
1444   // Else do a proper rd search for each possible transform size that may
1445   // be considered in the main rd loop.
1446   } else {
1447     rd_pick_intra_sbuv_mode(cpi, x, ctx,
1448                             rate_uv, rate_uv_tokenonly, dist_uv, skip_uv,
1449                             bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize, max_tx_size);
1450   }
1451   *mode_uv = x->e_mbd.mi[0]->mbmi.uv_mode;
1452 }
1453
     // Returns the cost of signalling inter prediction mode `mode` in context
     // mode_context, looked up in cpi->inter_mode_cost.
     //
     // Returns 0 when the SEG_LVL_SKIP feature is active for the segment of
     // the current block. `mode` must be an inter mode (asserted) on the
     // non-skip path.
1454 static int cost_mv_ref(const VP9_COMP *cpi, PREDICTION_MODE mode,
1455                        int mode_context) {
1456   const MACROBLOCK *const x = &cpi->mb;
1457   const int segment_id = x->e_mbd.mi[0]->mbmi.segment_id;
1458
1459   // Don't account for mode here if segment skip is enabled.
1460   if (!vp9_segfeature_active(&cpi->common.seg, segment_id, SEG_LVL_SKIP)) {
1461     assert(is_inter_mode(mode));
1462     return cpi->inter_mode_cost[mode_context][INTER_OFFSET(mode)];
1463   } else {
1464     return 0;
1465   }
1466 }
1467
     // Forward declaration of a file-local (static) function, so that it can be
     // referenced before its definition.
1468 static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
1469                                 BLOCK_SIZE bsize,
1470                                 int_mv *frame_mv,
1471                                 int mi_row, int mi_col,
1472                                 int_mv single_newmv[MAX_REF_FRAMES],
1473                                 int *rate_mv);
1474
1475 static int set_and_cost_bmi_mvs(VP9_COMP *cpi, MACROBLOCKD *xd, int i,
1476                                 PREDICTION_MODE mode, int_mv this_mv[2],
1477                                 int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
1478                                 int_mv seg_mvs[MAX_REF_FRAMES],
1479                                 int_mv *best_ref_mv[2], const int *mvjcost,
1480                                 int *mvcost[2]) {
1481   MODE_INFO *const mic = xd->mi[0];
1482   const MB_MODE_INFO *const mbmi = &mic->mbmi;
1483   int thismvcost = 0;
1484   int idx, idy;
1485   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type];
1486   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type];
1487   const int is_compound = has_second_ref(mbmi);
1488
1489   switch (mode) {
1490     case NEWMV:
1491       this_mv[0].as_int = seg_mvs[mbmi->ref_frame[0]].as_int;
1492       thismvcost += vp9_mv_bit_cost(&this_mv[0].as_mv, &best_ref_mv[0]->as_mv,
1493                                     mvjcost, mvcost, MV_COST_WEIGHT_SUB);
1494       if (is_compound) {
1495         this_mv[1].as_int = seg_mvs[mbmi->ref_frame[1]].as_int;
1496         thismvcost += vp9_mv_bit_cost(&this_mv[1].as_mv, &best_ref_mv[1]->as_mv,
1497                                       mvjcost, mvcost, MV_COST_WEIGHT_SUB);
1498       }
1499       break;
1500     case NEARMV:
1501     case NEARESTMV:
1502       this_mv[0].as_int = frame_mv[mode][mbmi->ref_frame[0]].as_int;
1503       if (is_compound)
1504         this_mv[1].as_int = frame_mv[mode][mbmi->ref_frame[1]].as_int;
1505       break;
1506     case ZEROMV:
1507       this_mv[0].as_int = 0;
1508       if (is_compound)
1509         this_mv[1].as_int = 0;
1510       break;
1511     default:
1512       break;
1513   }
1514
1515   mic->bmi[i].as_mv[0].as_int = this_mv[0].as_int;
1516   if (is_compound)
1517     mic->bmi[i].as_mv[1].as_int = this_mv[1].as_int;
1518
1519   mic->bmi[i].as_mode = mode;
1520
1521   for (idy = 0; idy < num_4x4_blocks_high; ++idy)
1522     for (idx = 0; idx < num_4x4_blocks_wide; ++idx)
1523       vpx_memcpy(&mic->bmi[i + idy * 2 + idx],
1524                  &mic->bmi[i], sizeof(mic->bmi[i]));
1525
1526   return cost_mv_ref(cpi, mode, mbmi->mode_context[mbmi->ref_frame[0]]) +
1527             thismvcost;
1528 }
1529
1530 static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
1531                                        MACROBLOCK *x,
1532                                        int64_t best_yrd,
1533                                        int i,
1534                                        int *labelyrate,
1535                                        int64_t *distortion, int64_t *sse,
1536                                        ENTROPY_CONTEXT *ta,
1537                                        ENTROPY_CONTEXT *tl,
1538                                        int mi_row, int mi_col) {
1539   int k;
1540   MACROBLOCKD *xd = &x->e_mbd;
1541   struct macroblockd_plane *const pd = &xd->plane[0];
1542   struct macroblock_plane *const p = &x->plane[0];
1543   MODE_INFO *const mi = xd->mi[0];
1544   const BLOCK_SIZE plane_bsize = get_plane_block_size(mi->mbmi.sb_type, pd);
1545   const int width = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
1546   const int height = 4 * num_4x4_blocks_high_lookup[plane_bsize];
1547   int idx, idy;
1548
1549   const uint8_t *const src = &p->src.buf[raster_block_offset(BLOCK_8X8, i,
1550                                                              p->src.stride)];
1551   uint8_t *const dst = &pd->dst.buf[raster_block_offset(BLOCK_8X8, i,
1552                                                         pd->dst.stride)];
1553   int64_t thisdistortion = 0, thissse = 0;
1554   int thisrate = 0, ref;
1555   const scan_order *so = &vp9_default_scan_orders[TX_4X4];
1556   const int is_compound = has_second_ref(&mi->mbmi);
1557   const InterpKernel *kernel = vp9_get_interp_kernel(mi->mbmi.interp_filter);
1558
1559   for (ref = 0; ref < 1 + is_compound; ++ref) {
1560     const uint8_t *pre = &pd->pre[ref].buf[raster_block_offset(BLOCK_8X8, i,
1561                                                pd->pre[ref].stride)];
1562     vp9_build_inter_predictor(pre, pd->pre[ref].stride,
1563                               dst, pd->dst.stride,
1564                               &mi->bmi[i].as_mv[ref].as_mv,
1565                               &xd->block_refs[ref]->sf, width, height, ref,
1566                               kernel, MV_PRECISION_Q3,
1567                               mi_col * MI_SIZE + 4 * (i % 2),
1568                               mi_row * MI_SIZE + 4 * (i / 2));
1569   }
1570
1571   vp9_subtract_block(height, width,
1572                      raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8,
1573                      src, p->src.stride,
1574                      dst, pd->dst.stride);
1575
1576   k = i;
1577   for (idy = 0; idy < height / 4; ++idy) {
1578     for (idx = 0; idx < width / 4; ++idx) {
1579       int64_t ssz, rd, rd1, rd2;
1580       int16_t* coeff;
1581
1582       k += (idy * 2 + idx);
1583       coeff = BLOCK_OFFSET(p->coeff, k);
1584       x->fwd_txm4x4(raster_block_offset_int16(BLOCK_8X8, k, p->src_diff),
1585                     coeff, 8);
1586       vp9_regular_quantize_b_4x4(x, 0, k, so->scan, so->iscan);
1587       thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k),
1588                                         16, &ssz);
1589       thissse += ssz;
1590       thisrate += cost_coeffs(x, 0, k, ta + (k & 1), tl + (k >> 1), TX_4X4,
1591                               so->scan, so->neighbors,
1592                               cpi->sf.use_fast_coef_costing);
1593       rd1 = RDCOST(x->rdmult, x->rddiv, thisrate, thisdistortion >> 2);
1594       rd2 = RDCOST(x->rdmult, x->rddiv, 0, thissse >> 2);
1595       rd = MIN(rd1, rd2);
1596       if (rd >= best_yrd)
1597         return INT64_MAX;
1598     }
1599   }
1600
1601   *distortion = thisdistortion >> 2;
1602   *labelyrate = thisrate;
1603   *sse = thissse >> 2;
1604
1605   return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion);
1606 }
1607
// Rate-distortion statistics kept for one (4x4 label, inter mode) pair during
// the sub-8x8 mode search in rd_pick_best_sub8x8_mode().
1608 typedef struct {
1609   int eobs;         // end-of-block value copied from the plane's eobs[]
1610   int brate;        // mode + mv rate; coefficient rate is added once coded
1611   int byrate;       // coefficient (luma) rate from encode_inter_mb_segment()
1612   int64_t bdist;    // distortion from encode_inter_mb_segment()
1613   int64_t bsse;     // sse from encode_inter_mb_segment()
1614   int64_t brdcost;  // total RD cost; INT64_MAX when not evaluated/rejected
1615   int_mv mvs[2];    // motion vector per reference ([1] only if compound)
1616   ENTROPY_CONTEXT ta[2];  // above entropy context as used for this mode
1617   ENTROPY_CONTEXT tl[2];  // left entropy context as used for this mode
1618 } SEG_RDSTAT;
1619
// Working state and result of one sub-8x8 mode search (one instance per
// candidate interpolation filter, see rd_pick_best_sub8x8_mode()).
1620 typedef struct {
1621   int_mv *ref_mv[2];  // reference mvs supplied by the caller, per reference
1622   int_mv mvp;         // start point for the NEWMV full-pixel search
1623 
1624   int64_t segment_rd;  // in: RD budget; out: total RD cost or INT64_MAX
1625   int r;               // total rate over all labels
1626   int64_t d;           // total distortion over all labels
1627   int64_t sse;         // total sse over all labels
1628   int segment_yrate;   // total coefficient rate over all labels
1629   PREDICTION_MODE modes[4];            // selected mode per 4x4 block
1630   SEG_RDSTAT rdstat[4][INTER_MODES];   // stats per 4x4 block and inter mode
1631   int mvthresh;        // caller threshold; scaled down to gate NEWMV search
1632 } BEST_SEG_INFO;
1633
1634 static INLINE int mv_check_bounds(const MACROBLOCK *x, const MV *mv) {
1635   return (mv->row >> 3) < x->mv_row_min ||
1636          (mv->row >> 3) > x->mv_row_max ||
1637          (mv->col >> 3) < x->mv_col_min ||
1638          (mv->col >> 3) > x->mv_col_max;
1639 }
1640
1641 static INLINE void mi_buf_shift(MACROBLOCK *x, int i) {
1642   MB_MODE_INFO *const mbmi = &x->e_mbd.mi[0]->mbmi;
1643   struct macroblock_plane *const p = &x->plane[0];
1644   struct macroblockd_plane *const pd = &x->e_mbd.plane[0];
1645
1646   p->src.buf = &p->src.buf[raster_block_offset(BLOCK_8X8, i, p->src.stride)];
1647   assert(((intptr_t)pd->pre[0].buf & 0x7) == 0);
1648   pd->pre[0].buf = &pd->pre[0].buf[raster_block_offset(BLOCK_8X8, i,
1649                                                        pd->pre[0].stride)];
1650   if (has_second_ref(mbmi))
1651     pd->pre[1].buf = &pd->pre[1].buf[raster_block_offset(BLOCK_8X8, i,
1652                                                          pd->pre[1].stride)];
1653 }
1654
1655 static INLINE void mi_buf_restore(MACROBLOCK *x, struct buf_2d orig_src,
1656                                   struct buf_2d orig_pre[2]) {
1657   MB_MODE_INFO *mbmi = &x->e_mbd.mi[0]->mbmi;
1658   x->plane[0].src = orig_src;
1659   x->e_mbd.plane[0].pre[0] = orig_pre[0];
1660   if (has_second_ref(mbmi))
1661     x->e_mbd.plane[0].pre[1] = orig_pre[1];
1662 }
1663
1664 static INLINE int mv_has_subpel(const MV *mv) {
1665   return (mv->row & 0x0F) || (mv->col & 0x0F);
1666 }
1667
1668 // Check if NEARESTMV/NEARMV/ZEROMV is the cheapest way encode zero motion.
1669 // TODO(aconverse): Find out if this is still productive then clean up or remove
1670 static int check_best_zero_mv(
1671     const VP9_COMP *cpi, const uint8_t mode_context[MAX_REF_FRAMES],
1672     int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
1673     int disable_inter_mode_mask, int this_mode,
1674     const MV_REFERENCE_FRAME ref_frames[2]) {
1675   if (!(disable_inter_mode_mask & (1 << INTER_OFFSET(ZEROMV))) &&
1676       (this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) &&
1677       frame_mv[this_mode][ref_frames[0]].as_int == 0 &&
1678       (ref_frames[1] == NONE ||
1679        frame_mv[this_mode][ref_frames[1]].as_int == 0)) {
1680     int rfc = mode_context[ref_frames[0]];
1681     int c1 = cost_mv_ref(cpi, NEARMV, rfc);
1682     int c2 = cost_mv_ref(cpi, NEARESTMV, rfc);
1683     int c3 = cost_mv_ref(cpi, ZEROMV, rfc);
1684
1685     if (this_mode == NEARMV) {
1686       if (c1 > c3) return 0;
1687     } else if (this_mode == NEARESTMV) {
1688       if (c2 > c3) return 0;
1689     } else {
1690       assert(this_mode == ZEROMV);
1691       if (ref_frames[1] == NONE) {
1692         if ((c3 >= c2 && frame_mv[NEARESTMV][ref_frames[0]].as_int == 0) ||
1693             (c3 >= c1 && frame_mv[NEARMV][ref_frames[0]].as_int == 0))
1694           return 0;
1695       } else {
1696         if ((c3 >= c2 && frame_mv[NEARESTMV][ref_frames[0]].as_int == 0 &&
1697              frame_mv[NEARESTMV][ref_frames[1]].as_int == 0) ||
1698             (c3 >= c1 && frame_mv[NEARMV][ref_frames[0]].as_int == 0 &&
1699              frame_mv[NEARMV][ref_frames[1]].as_int == 0))
1700           return 0;
1701       }
1702     }
1703   }
1704   return 1;
1705 }
1706
1707 static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x,
1708                                         const TileInfo * const tile,
1709                                         int_mv *best_ref_mv,
1710                                         int_mv *second_best_ref_mv,
1711                                         int64_t best_rd, int *returntotrate,
1712                                         int *returnyrate,
1713                                         int64_t *returndistortion,
1714                                         int *skippable, int64_t *psse,
1715                                         int mvthresh,
1716                                         int_mv seg_mvs[4][MAX_REF_FRAMES],
1717                                         BEST_SEG_INFO *bsi_buf, int filter_idx,
1718                                         int mi_row, int mi_col) {
1719   int i;
1720   BEST_SEG_INFO *bsi = bsi_buf + filter_idx;
1721   MACROBLOCKD *xd = &x->e_mbd;
1722   MODE_INFO *mi = xd->mi[0];
1723   MB_MODE_INFO *mbmi = &mi->mbmi;
1724   int mode_idx;
1725   int k, br = 0, idx, idy;
1726   int64_t bd = 0, block_sse = 0;
1727   PREDICTION_MODE this_mode;
1728   VP9_COMMON *cm = &cpi->common;
1729   struct macroblock_plane *const p = &x->plane[0];
1730   struct macroblockd_plane *const pd = &xd->plane[0];
1731   const int label_count = 4;
1732   int64_t this_segment_rd = 0;
1733   int label_mv_thresh;
1734   int segmentyrate = 0;
1735   const BLOCK_SIZE bsize = mbmi->sb_type;
1736   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
1737   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
1738   ENTROPY_CONTEXT t_above[2], t_left[2];
1739   int subpelmv = 1, have_ref = 0;
1740   const int has_second_rf = has_second_ref(mbmi);
1741   const int disable_inter_mode_mask = cpi->sf.disable_inter_mode_mask[bsize];
1742
1743   vp9_zero(*bsi);
1744
1745   bsi->segment_rd = best_rd;
1746   bsi->ref_mv[0] = best_ref_mv;
1747   bsi->ref_mv[1] = second_best_ref_mv;
1748   bsi->mvp.as_int = best_ref_mv->as_int;
1749   bsi->mvthresh = mvthresh;
1750
1751   for (i = 0; i < 4; i++)
1752     bsi->modes[i] = ZEROMV;
1753
1754   vpx_memcpy(t_above, pd->above_context, sizeof(t_above));
1755   vpx_memcpy(t_left, pd->left_context, sizeof(t_left));
1756
1757   // 64 makes this threshold really big effectively
1758   // making it so that we very rarely check mvs on
1759   // segments.   setting this to 1 would make mv thresh
1760   // roughly equal to what it is for macroblocks
1761   label_mv_thresh = 1 * bsi->mvthresh / label_count;
1762
1763   // Segmentation method overheads
1764   for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
1765     for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
1766       // TODO(jingning,rbultje): rewrite the rate-distortion optimization
1767       // loop for 4x4/4x8/8x4 block coding. to be replaced with new rd loop
1768       int_mv mode_mv[MB_MODE_COUNT][2];
1769       int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
1770       PREDICTION_MODE mode_selected = ZEROMV;
1771       int64_t best_rd = INT64_MAX;
1772       const int i = idy * 2 + idx;
1773       int ref;
1774
1775       for (ref = 0; ref < 1 + has_second_rf; ++ref) {
1776         const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
1777         frame_mv[ZEROMV][frame].as_int = 0;
1778         vp9_append_sub8x8_mvs_for_idx(cm, xd, tile, i, ref, mi_row, mi_col,
1779                                       &frame_mv[NEARESTMV][frame],
1780                                       &frame_mv[NEARMV][frame]);
1781       }
1782
1783       // search for the best motion vector on this segment
1784       for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
1785         const struct buf_2d orig_src = x->plane[0].src;
1786         struct buf_2d orig_pre[2];
1787
1788         mode_idx = INTER_OFFSET(this_mode);
1789         bsi->rdstat[i][mode_idx].brdcost = INT64_MAX;
1790         if (disable_inter_mode_mask & (1 << mode_idx))
1791           continue;
1792
1793         if (!check_best_zero_mv(cpi, mbmi->mode_context, frame_mv,
1794                                 disable_inter_mode_mask,
1795                                 this_mode, mbmi->ref_frame))
1796           continue;
1797
1798         vpx_memcpy(orig_pre, pd->pre, sizeof(orig_pre));
1799         vpx_memcpy(bsi->rdstat[i][mode_idx].ta, t_above,
1800                    sizeof(bsi->rdstat[i][mode_idx].ta));
1801         vpx_memcpy(bsi->rdstat[i][mode_idx].tl, t_left,
1802                    sizeof(bsi->rdstat[i][mode_idx].tl));
1803
1804         // motion search for newmv (single predictor case only)
1805         if (!has_second_rf && this_mode == NEWMV &&
1806             seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV) {
1807           MV *const new_mv = &mode_mv[NEWMV][0].as_mv;
1808           int step_param = 0;
1809           int thissme, bestsme = INT_MAX;
1810           int sadpb = x->sadperbit4;
1811           MV mvp_full;
1812           int max_mv;
1813
1814           /* Is the best so far sufficiently good that we cant justify doing
1815            * and new motion search. */
1816           if (best_rd < label_mv_thresh)
1817             break;
1818
1819           if (!is_best_mode(cpi->oxcf.mode)) {
1820             // use previous block's result as next block's MV predictor.
1821             if (i > 0) {
1822               bsi->mvp.as_int = mi->bmi[i - 1].as_mv[0].as_int;
1823               if (i == 2)
1824                 bsi->mvp.as_int = mi->bmi[i - 2].as_mv[0].as_int;
1825             }
1826           }
1827           if (i == 0)
1828             max_mv = x->max_mv_context[mbmi->ref_frame[0]];
1829           else
1830             max_mv = MAX(abs(bsi->mvp.as_mv.row), abs(bsi->mvp.as_mv.col)) >> 3;
1831
1832           if (cpi->sf.auto_mv_step_size && cm->show_frame) {
1833             // Take wtd average of the step_params based on the last frame's
1834             // max mv magnitude and the best ref mvs of the current block for
1835             // the given reference.
1836             step_param = (vp9_init_search_range(&cpi->sf, max_mv) +
1837                               cpi->mv_step_param) / 2;
1838           } else {
1839             step_param = cpi->mv_step_param;
1840           }
1841
1842           mvp_full.row = bsi->mvp.as_mv.row >> 3;
1843           mvp_full.col = bsi->mvp.as_mv.col >> 3;
1844
1845           if (cpi->sf.adaptive_motion_search && cm->show_frame) {
1846             mvp_full.row = x->pred_mv[mbmi->ref_frame[0]].as_mv.row >> 3;
1847             mvp_full.col = x->pred_mv[mbmi->ref_frame[0]].as_mv.col >> 3;
1848             step_param = MAX(step_param, 8);
1849           }
1850
1851           // adjust src pointer for this block
1852           mi_buf_shift(x, i);
1853
1854           vp9_set_mv_search_range(x, &bsi->ref_mv[0]->as_mv);
1855
1856           bestsme = full_pixel_search(cpi, x, bsize, &mvp_full, step_param,
1857                                       sadpb, &bsi->ref_mv[0]->as_mv, new_mv,
1858                                       INT_MAX, 1);
1859
1860           // Should we do a full search (best quality only)
1861           if (is_best_mode(cpi->oxcf.mode)) {
1862             int_mv *const best_mv = &mi->bmi[i].as_mv[0];
1863             /* Check if mvp_full is within the range. */
1864             clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max,
1865                      x->mv_row_min, x->mv_row_max);
1866             thissme = cpi->full_search_sad(x, &mvp_full,
1867                                            sadpb, 16, &cpi->fn_ptr[bsize],
1868                                            &bsi->ref_mv[0]->as_mv,
1869                                            &best_mv->as_mv);
1870             if (thissme < bestsme) {
1871               bestsme = thissme;
1872               *new_mv = best_mv->as_mv;
1873             } else {
1874               // The full search result is actually worse so re-instate the
1875               // previous best vector
1876               best_mv->as_mv = *new_mv;
1877             }
1878           }
1879
1880           if (bestsme < INT_MAX) {
1881             int distortion;
1882             cpi->find_fractional_mv_step(x,
1883                                          new_mv,
1884                                          &bsi->ref_mv[0]->as_mv,
1885                                          cm->allow_high_precision_mv,
1886                                          x->errorperbit, &cpi->fn_ptr[bsize],
1887                                          cpi->sf.subpel_force_stop,
1888                                          cpi->sf.subpel_iters_per_step,
1889                                          x->nmvjointcost, x->mvcost,
1890                                          &distortion,
1891                                          &x->pred_sse[mbmi->ref_frame[0]]);
1892
1893             // save motion search result for use in compound prediction
1894             seg_mvs[i][mbmi->ref_frame[0]].as_mv = *new_mv;
1895           }
1896
1897           if (cpi->sf.adaptive_motion_search)
1898             x->pred_mv[mbmi->ref_frame[0]].as_mv = *new_mv;
1899
1900           // restore src pointers
1901           mi_buf_restore(x, orig_src, orig_pre);
1902         }
1903
1904         if (has_second_rf) {
1905           if (seg_mvs[i][mbmi->ref_frame[1]].as_int == INVALID_MV ||
1906               seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV)
1907             continue;
1908         }
1909
1910         if (has_second_rf && this_mode == NEWMV &&
1911             mbmi->interp_filter == EIGHTTAP) {
1912           // adjust src pointers
1913           mi_buf_shift(x, i);
1914           if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
1915             int rate_mv;
1916             joint_motion_search(cpi, x, bsize, frame_mv[this_mode],
1917                                 mi_row, mi_col, seg_mvs[i],
1918                                 &rate_mv);
1919             seg_mvs[i][mbmi->ref_frame[0]].as_int =
1920                 frame_mv[this_mode][mbmi->ref_frame[0]].as_int;
1921             seg_mvs[i][mbmi->ref_frame[1]].as_int =
1922                 frame_mv[this_mode][mbmi->ref_frame[1]].as_int;
1923           }
1924           // restore src pointers
1925           mi_buf_restore(x, orig_src, orig_pre);
1926         }
1927
1928         bsi->rdstat[i][mode_idx].brate =
1929             set_and_cost_bmi_mvs(cpi, xd, i, this_mode, mode_mv[this_mode],
1930                                  frame_mv, seg_mvs[i], bsi->ref_mv,
1931                                  x->nmvjointcost, x->mvcost);
1932
1933         for (ref = 0; ref < 1 + has_second_rf; ++ref) {
1934           bsi->rdstat[i][mode_idx].mvs[ref].as_int =
1935               mode_mv[this_mode][ref].as_int;
1936           if (num_4x4_blocks_wide > 1)
1937             bsi->rdstat[i + 1][mode_idx].mvs[ref].as_int =
1938                 mode_mv[this_mode][ref].as_int;
1939           if (num_4x4_blocks_high > 1)
1940             bsi->rdstat[i + 2][mode_idx].mvs[ref].as_int =
1941                 mode_mv[this_mode][ref].as_int;
1942         }
1943
1944         // Trap vectors that reach beyond the UMV borders
1945         if (mv_check_bounds(x, &mode_mv[this_mode][0].as_mv) ||
1946             (has_second_rf &&
1947              mv_check_bounds(x, &mode_mv[this_mode][1].as_mv)))
1948           continue;
1949
1950         if (filter_idx > 0) {
1951           BEST_SEG_INFO *ref_bsi = bsi_buf;
1952           subpelmv = 0;
1953           have_ref = 1;
1954
1955           for (ref = 0; ref < 1 + has_second_rf; ++ref) {
1956             subpelmv |= mv_has_subpel(&mode_mv[this_mode][ref].as_mv);
1957             have_ref &= mode_mv[this_mode][ref].as_int ==
1958                 ref_bsi->rdstat[i][mode_idx].mvs[ref].as_int;
1959           }
1960
1961           if (filter_idx > 1 && !subpelmv && !have_ref) {
1962             ref_bsi = bsi_buf + 1;
1963             have_ref = 1;
1964             for (ref = 0; ref < 1 + has_second_rf; ++ref)
1965               have_ref &= mode_mv[this_mode][ref].as_int ==
1966                   ref_bsi->rdstat[i][mode_idx].mvs[ref].as_int;
1967           }
1968
1969           if (!subpelmv && have_ref &&
1970               ref_bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) {
1971             vpx_memcpy(&bsi->rdstat[i][mode_idx], &ref_bsi->rdstat[i][mode_idx],
1972                        sizeof(SEG_RDSTAT));
1973             if (num_4x4_blocks_wide > 1)
1974               bsi->rdstat[i + 1][mode_idx].eobs =
1975                   ref_bsi->rdstat[i + 1][mode_idx].eobs;
1976             if (num_4x4_blocks_high > 1)
1977               bsi->rdstat[i + 2][mode_idx].eobs =
1978                   ref_bsi->rdstat[i + 2][mode_idx].eobs;
1979
1980             if (bsi->rdstat[i][mode_idx].brdcost < best_rd) {
1981               mode_selected = this_mode;
1982               best_rd = bsi->rdstat[i][mode_idx].brdcost;
1983             }
1984             continue;
1985           }
1986         }
1987
1988         bsi->rdstat[i][mode_idx].brdcost =
1989             encode_inter_mb_segment(cpi, x,
1990                                     bsi->segment_rd - this_segment_rd, i,
1991                                     &bsi->rdstat[i][mode_idx].byrate,
1992                                     &bsi->rdstat[i][mode_idx].bdist,
1993                                     &bsi->rdstat[i][mode_idx].bsse,
1994                                     bsi->rdstat[i][mode_idx].ta,
1995                                     bsi->rdstat[i][mode_idx].tl,
1996                                     mi_row, mi_col);
1997         if (bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) {
1998           bsi->rdstat[i][mode_idx].brdcost += RDCOST(x->rdmult, x->rddiv,
1999                                             bsi->rdstat[i][mode_idx].brate, 0);
2000           bsi->rdstat[i][mode_idx].brate += bsi->rdstat[i][mode_idx].byrate;
2001           bsi->rdstat[i][mode_idx].eobs = p->eobs[i];
2002           if (num_4x4_blocks_wide > 1)
2003             bsi->rdstat[i + 1][mode_idx].eobs = p->eobs[i + 1];
2004           if (num_4x4_blocks_high > 1)
2005             bsi->rdstat[i + 2][mode_idx].eobs = p->eobs[i + 2];
2006         }
2007
2008         if (bsi->rdstat[i][mode_idx].brdcost < best_rd) {
2009           mode_selected = this_mode;
2010           best_rd = bsi->rdstat[i][mode_idx].brdcost;
2011         }
2012       } /*for each 4x4 mode*/
2013
2014       if (best_rd == INT64_MAX) {
2015         int iy, midx;
2016         for (iy = i + 1; iy < 4; ++iy)
2017           for (midx = 0; midx < INTER_MODES; ++midx)
2018             bsi->rdstat[iy][midx].brdcost = INT64_MAX;
2019         bsi->segment_rd = INT64_MAX;
2020         return INT64_MAX;;
2021       }
2022
2023       mode_idx = INTER_OFFSET(mode_selected);
2024       vpx_memcpy(t_above, bsi->rdstat[i][mode_idx].ta, sizeof(t_above));
2025       vpx_memcpy(t_left, bsi->rdstat[i][mode_idx].tl, sizeof(t_left));
2026
2027       set_and_cost_bmi_mvs(cpi, xd, i, mode_selected, mode_mv[mode_selected],
2028                            frame_mv, seg_mvs[i], bsi->ref_mv, x->nmvjointcost,
2029                            x->mvcost);
2030
2031       br += bsi->rdstat[i][mode_idx].brate;
2032       bd += bsi->rdstat[i][mode_idx].bdist;
2033       block_sse += bsi->rdstat[i][mode_idx].bsse;
2034       segmentyrate += bsi->rdstat[i][mode_idx].byrate;
2035       this_segment_rd += bsi->rdstat[i][mode_idx].brdcost;
2036
2037       if (this_segment_rd > bsi->segment_rd) {
2038         int iy, midx;
2039         for (iy = i + 1; iy < 4; ++iy)
2040           for (midx = 0; midx < INTER_MODES; ++midx)
2041             bsi->rdstat[iy][midx].brdcost = INT64_MAX;
2042         bsi->segment_rd = INT64_MAX;
2043         return INT64_MAX;;
2044       }
2045     }
2046   } /* for each label */
2047
2048   bsi->r = br;
2049   bsi->d = bd;
2050   bsi->segment_yrate = segmentyrate;
2051   bsi->segment_rd = this_segment_rd;
2052   bsi->sse = block_sse;
2053
2054   // update the coding decisions
2055   for (k = 0; k < 4; ++k)
2056     bsi->modes[k] = mi->bmi[k].as_mode;
2057
2058   if (bsi->segment_rd > best_rd)
2059     return INT64_MAX;
2060   /* set it to the best */
2061   for (i = 0; i < 4; i++) {
2062     mode_idx = INTER_OFFSET(bsi->modes[i]);
2063     mi->bmi[i].as_mv[0].as_int = bsi->rdstat[i][mode_idx].mvs[0].as_int;
2064     if (has_second_ref(mbmi))
2065       mi->bmi[i].as_mv[1].as_int = bsi->rdstat[i][mode_idx].mvs[1].as_int;
2066     x->plane[0].eobs[i] = bsi->rdstat[i][mode_idx].eobs;
2067     mi->bmi[i].as_mode = bsi->modes[i];
2068   }
2069
2070   /*
2071    * used to set mbmi->mv.as_int
2072    */
2073   *returntotrate = bsi->r;
2074   *returndistortion = bsi->d;
2075   *returnyrate = bsi->segment_yrate;
2076   *skippable = vp9_is_skippable_in_plane(x, BLOCK_8X8, 0);
2077   *psse = bsi->sse;
2078   mbmi->mode = bsi->modes[3];
2079
2080   return bsi->segment_rd;
2081 }
2082
2083 static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x,
2084                     uint8_t *ref_y_buffer, int ref_y_stride,
2085                     int ref_frame, BLOCK_SIZE block_size ) {
2086   MACROBLOCKD *xd = &x->e_mbd;
2087   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
2088   int_mv this_mv;
2089   int i;
2090   int zero_seen = 0;
2091   int best_index = 0;
2092   int best_sad = INT_MAX;
2093   int this_sad = INT_MAX;
2094   int max_mv = 0;
2095
2096   uint8_t *src_y_ptr = x->plane[0].src.buf;
2097   uint8_t *ref_y_ptr;
2098   int row_offset, col_offset;
2099   int num_mv_refs = MAX_MV_REF_CANDIDATES +
2100                     (cpi->sf.adaptive_motion_search &&
2101                      cpi->common.show_frame &&
2102                      block_size < cpi->sf.max_partition_size);
2103
2104   int_mv pred_mv[3];
2105   pred_mv[0] = mbmi->ref_mvs[ref_frame][0];
2106   pred_mv[1] = mbmi->ref_mvs[ref_frame][1];
2107   pred_mv[2] = x->pred_mv[ref_frame];
2108
2109   // Get the sad for each candidate reference mv
2110   for (i = 0; i < num_mv_refs; i++) {
2111     this_mv.as_int = pred_mv[i].as_int;
2112
2113     max_mv = MAX(max_mv,
2114                  MAX(abs(this_mv.as_mv.row), abs(this_mv.as_mv.col)) >> 3);
2115     // only need to check zero mv once
2116     if (!this_mv.as_int && zero_seen)
2117       continue;
2118
2119     zero_seen = zero_seen || !this_mv.as_int;
2120
2121     row_offset = this_mv.as_mv.row >> 3;
2122     col_offset = this_mv.as_mv.col >> 3;
2123     ref_y_ptr = ref_y_buffer + (ref_y_stride * row_offset) + col_offset;
2124
2125     // Find sad for current vector.
2126     this_sad = cpi->fn_ptr[block_size].sdf(src_y_ptr, x->plane[0].src.stride,
2127                                            ref_y_ptr, ref_y_stride,
2128                                            0x7fffffff);
2129
2130     // Note if it is the best so far.
2131     if (this_sad < best_sad) {
2132       best_sad = this_sad;
2133       best_index = i;
2134     }
2135   }
2136
2137   // Note the index of the mv that worked best in the reference list.
2138   x->mv_best_ref_index[ref_frame] = best_index;
2139   x->max_mv_context[ref_frame] = max_mv;
2140   x->pred_mv_sad[ref_frame] = best_sad;
2141 }
2142
2143 static void estimate_ref_frame_costs(const VP9_COMMON *cm,
2144                                      const MACROBLOCKD *xd,
2145                                      int segment_id,
2146                                      unsigned int *ref_costs_single,
2147                                      unsigned int *ref_costs_comp,
2148                                      vp9_prob *comp_mode_p) {
2149   int seg_ref_active = vp9_segfeature_active(&cm->seg, segment_id,
2150                                              SEG_LVL_REF_FRAME);
2151   if (seg_ref_active) {
2152     vpx_memset(ref_costs_single, 0, MAX_REF_FRAMES * sizeof(*ref_costs_single));
2153     vpx_memset(ref_costs_comp,   0, MAX_REF_FRAMES * sizeof(*ref_costs_comp));
2154     *comp_mode_p = 128;
2155   } else {
2156     vp9_prob intra_inter_p = vp9_get_intra_inter_prob(cm, xd);
2157     vp9_prob comp_inter_p = 128;
2158
2159     if (cm->reference_mode == REFERENCE_MODE_SELECT) {
2160       comp_inter_p = vp9_get_reference_mode_prob(cm, xd);
2161       *comp_mode_p = comp_inter_p;
2162     } else {
2163       *comp_mode_p = 128;
2164     }
2165
2166     ref_costs_single[INTRA_FRAME] = vp9_cost_bit(intra_inter_p, 0);
2167
2168     if (cm->reference_mode != COMPOUND_REFERENCE) {
2169       vp9_prob ref_single_p1 = vp9_get_pred_prob_single_ref_p1(cm, xd);
2170       vp9_prob ref_single_p2 = vp9_get_pred_prob_single_ref_p2(cm, xd);
2171       unsigned int base_cost = vp9_cost_bit(intra_inter_p, 1);
2172
2173       if (cm->reference_mode == REFERENCE_MODE_SELECT)
2174         base_cost += vp9_cost_bit(comp_inter_p, 0);
2175
2176       ref_costs_single[LAST_FRAME] = ref_costs_single[GOLDEN_FRAME] =
2177           ref_costs_single[ALTREF_FRAME] = base_cost;
2178       ref_costs_single[LAST_FRAME]   += vp9_cost_bit(ref_single_p1, 0);
2179       ref_costs_single[GOLDEN_FRAME] += vp9_cost_bit(ref_single_p1, 1);
2180       ref_costs_single[ALTREF_FRAME] += vp9_cost_bit(ref_single_p1, 1);
2181       ref_costs_single[GOLDEN_FRAME] += vp9_cost_bit(ref_single_p2, 0);
2182       ref_costs_single[ALTREF_FRAME] += vp9_cost_bit(ref_single_p2, 1);
2183     } else {
2184       ref_costs_single[LAST_FRAME]   = 512;
2185       ref_costs_single[GOLDEN_FRAME] = 512;
2186       ref_costs_single[ALTREF_FRAME] = 512;
2187     }
2188     if (cm->reference_mode != SINGLE_REFERENCE) {
2189       vp9_prob ref_comp_p = vp9_get_pred_prob_comp_ref_p(cm, xd);
2190       unsigned int base_cost = vp9_cost_bit(intra_inter_p, 1);
2191
2192       if (cm->reference_mode == REFERENCE_MODE_SELECT)
2193         base_cost += vp9_cost_bit(comp_inter_p, 1);
2194
2195       ref_costs_comp[LAST_FRAME]   = base_cost + vp9_cost_bit(ref_comp_p, 0);
2196       ref_costs_comp[GOLDEN_FRAME] = base_cost + vp9_cost_bit(ref_comp_p, 1);
2197     } else {
2198       ref_costs_comp[LAST_FRAME]   = 512;
2199       ref_costs_comp[GOLDEN_FRAME] = 512;
2200     }
2201   }
2202 }
2203
2204 static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
2205                          int mode_index,
2206                          int_mv *ref_mv,
2207                          int_mv *second_ref_mv,
2208                          int64_t comp_pred_diff[REFERENCE_MODES],
2209                          const int64_t tx_size_diff[TX_MODES],
2210                          int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS]) {
2211   MACROBLOCKD *const xd = &x->e_mbd;
2212
2213   // Take a snapshot of the coding context so it can be
2214   // restored if we decide to encode this way
2215   ctx->skip = x->skip;
2216   ctx->best_mode_index = mode_index;
2217   ctx->mic = *xd->mi[0];
2218
2219   ctx->best_ref_mv[0].as_int = ref_mv->as_int;
2220   ctx->best_ref_mv[1].as_int = second_ref_mv->as_int;
2221
2222   ctx->single_pred_diff = (int)comp_pred_diff[SINGLE_REFERENCE];
2223   ctx->comp_pred_diff   = (int)comp_pred_diff[COMPOUND_REFERENCE];
2224   ctx->hybrid_pred_diff = (int)comp_pred_diff[REFERENCE_MODE_SELECT];
2225
2226   vpx_memcpy(ctx->tx_rd_diff, tx_size_diff, sizeof(ctx->tx_rd_diff));
2227   vpx_memcpy(ctx->best_filter_diff, best_filter_diff,
2228              sizeof(*best_filter_diff) * SWITCHABLE_FILTER_CONTEXTS);
2229 }
2230
2231 static void setup_pred_block(const MACROBLOCKD *xd,
2232                              struct buf_2d dst[MAX_MB_PLANE],
2233                              const YV12_BUFFER_CONFIG *src,
2234                              int mi_row, int mi_col,
2235                              const struct scale_factors *scale,
2236                              const struct scale_factors *scale_uv) {
2237   int i;
2238
2239   dst[0].buf = src->y_buffer;
2240   dst[0].stride = src->y_stride;
2241   dst[1].buf = src->u_buffer;
2242   dst[2].buf = src->v_buffer;
2243   dst[1].stride = dst[2].stride = src->uv_stride;
2244 #if CONFIG_ALPHA
2245   dst[3].buf = src->alpha_buffer;
2246   dst[3].stride = src->alpha_stride;
2247 #endif
2248
2249   // TODO(jkoleszar): Make scale factors per-plane data
2250   for (i = 0; i < MAX_MB_PLANE; i++) {
2251     setup_pred_plane(dst + i, dst[i].buf, dst[i].stride, mi_row, mi_col,
2252                      i ? scale_uv : scale,
2253                      xd->plane[i].subsampling_x, xd->plane[i].subsampling_y);
2254   }
2255 }
2256
2257 void vp9_setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
2258                             const TileInfo *const tile,
2259                             MV_REFERENCE_FRAME ref_frame,
2260                             BLOCK_SIZE block_size,
2261                             int mi_row, int mi_col,
2262                             int_mv frame_nearest_mv[MAX_REF_FRAMES],
2263                             int_mv frame_near_mv[MAX_REF_FRAMES],
2264                             struct buf_2d yv12_mb[4][MAX_MB_PLANE]) {
2265   const VP9_COMMON *cm = &cpi->common;
2266   const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
2267   MACROBLOCKD *const xd = &x->e_mbd;
2268   MODE_INFO *const mi = xd->mi[0];
2269   int_mv *const candidates = mi->mbmi.ref_mvs[ref_frame];
2270   const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf;
2271
2272   // TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this
2273   // use the UV scaling factors.
2274   setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf);
2275
2276   // Gets an initial list of candidate vectors from neighbours and orders them
2277   vp9_find_mv_refs(cm, xd, tile, mi, ref_frame, candidates, mi_row, mi_col);
2278
2279   // Candidate refinement carried out at encoder and decoder
2280   vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv, candidates,
2281                         &frame_nearest_mv[ref_frame],
2282                         &frame_near_mv[ref_frame]);
2283
2284   // Further refinement that is encode side only to test the top few candidates
2285   // in full and choose the best as the centre point for subsequent searches.
2286   // The current implementation doesn't support scaling.
2287   if (!vp9_is_scaled(sf) && block_size >= BLOCK_8X8)
2288     mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride,
2289             ref_frame, block_size);
2290 }
2291
2292 const YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const VP9_COMP *cpi,
2293                                                    int ref_frame) {
2294   const VP9_COMMON *const cm = &cpi->common;
2295   const int ref_idx = cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)];
2296   const int scaled_idx = cpi->scaled_ref_idx[ref_frame - 1];
2297   return (scaled_idx != ref_idx) ? &cm->frame_bufs[scaled_idx].buf : NULL;
2298 }
2299
2300 int vp9_get_switchable_rate(const VP9_COMP *cpi) {
2301   const MACROBLOCKD *const xd = &cpi->mb.e_mbd;
2302   const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
2303   const int ctx = vp9_get_pred_context_switchable_interp(xd);
2304   return SWITCHABLE_INTERP_RATE_FACTOR *
2305              cpi->switchable_interp_costs[ctx][mbmi->interp_filter];
2306 }
2307
2308 static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
2309                                  BLOCK_SIZE bsize,
2310                                  int mi_row, int mi_col,
2311                                  int_mv *tmp_mv, int *rate_mv) {
2312   MACROBLOCKD *xd = &x->e_mbd;
2313   const VP9_COMMON *cm = &cpi->common;
2314   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
2315   struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};
2316   int bestsme = INT_MAX;
2317   int step_param;
2318   int sadpb = x->sadperbit16;
2319   MV mvp_full;
2320   int ref = mbmi->ref_frame[0];
2321   MV ref_mv = mbmi->ref_mvs[ref][0].as_mv;
2322
2323   int tmp_col_min = x->mv_col_min;
2324   int tmp_col_max = x->mv_col_max;
2325   int tmp_row_min = x->mv_row_min;
2326   int tmp_row_max = x->mv_row_max;
2327
2328   const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi,
2329                                                                         ref);
2330
2331   MV pred_mv[3];
2332   pred_mv[0] = mbmi->ref_mvs[ref][0].as_mv;
2333   pred_mv[1] = mbmi->ref_mvs[ref][1].as_mv;
2334   pred_mv[2] = x->pred_mv[ref].as_mv;
2335
2336   if (scaled_ref_frame) {
2337     int i;
2338     // Swap out the reference frame for a version that's been scaled to
2339     // match the resolution of the current frame, allowing the existing
2340     // motion search code to be used without additional modifications.
2341     for (i = 0; i < MAX_MB_PLANE; i++)
2342       backup_yv12[i] = xd->plane[i].pre[0];
2343
2344     vp9_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL);
2345   }
2346
2347   vp9_set_mv_search_range(x, &ref_mv);
2348
2349   // Work out the size of the first step in the mv step search.
2350   // 0 here is maximum length first step. 1 is MAX >> 1 etc.
2351   if (cpi->sf.auto_mv_step_size && cm->show_frame) {
2352     // Take wtd average of the step_params based on the last frame's
2353     // max mv magnitude and that based on the best ref mvs of the current
2354     // block for the given reference.
2355     step_param = (vp9_init_search_range(&cpi->sf, x->max_mv_context[ref]) +
2356                     cpi->mv_step_param) / 2;
2357   } else {
2358     step_param = cpi->mv_step_param;
2359   }
2360
2361   if (cpi->sf.adaptive_motion_search && bsize < BLOCK_64X64 &&
2362       cm->show_frame) {
2363     int boffset = 2 * (b_width_log2(BLOCK_64X64) - MIN(b_height_log2(bsize),
2364                                                        b_width_log2(bsize)));
2365     step_param = MAX(step_param, boffset);
2366   }
2367
2368   if (cpi->sf.adaptive_motion_search) {
2369     int bwl = b_width_log2_lookup[bsize];
2370     int bhl = b_height_log2_lookup[bsize];
2371     int i;
2372     int tlevel = x->pred_mv_sad[ref] >> (bwl + bhl + 4);
2373
2374     if (tlevel < 5)
2375       step_param += 2;
2376
2377     for (i = LAST_FRAME; i <= ALTREF_FRAME && cm->show_frame; ++i) {
2378       if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) {
2379         x->pred_mv[ref].as_int = 0;
2380         tmp_mv->as_int = INVALID_MV;
2381
2382         if (scaled_ref_frame) {
2383           int i;
2384           for (i = 0; i < MAX_MB_PLANE; i++)
2385             xd->plane[i].pre[0] = backup_yv12[i];
2386         }
2387         return;
2388       }
2389     }
2390   }
2391
2392   mvp_full = pred_mv[x->mv_best_ref_index[ref]];
2393
2394   mvp_full.col >>= 3;
2395   mvp_full.row >>= 3;
2396
2397   bestsme = full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb,
2398                               &ref_mv, &tmp_mv->as_mv, INT_MAX, 1);
2399
2400   x->mv_col_min = tmp_col_min;
2401   x->mv_col_max = tmp_col_max;
2402   x->mv_row_min = tmp_row_min;
2403   x->mv_row_max = tmp_row_max;
2404
2405   if (bestsme < INT_MAX) {
2406     int dis;  /* TODO: use dis in distortion calculation later. */
2407     cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv,
2408                                  cm->allow_high_precision_mv,
2409                                  x->errorperbit,
2410                                  &cpi->fn_ptr[bsize],
2411                                  cpi->sf.subpel_force_stop,
2412                                  cpi->sf.subpel_iters_per_step,
2413                                  x->nmvjointcost, x->mvcost,
2414                                  &dis, &x->pred_sse[ref]);
2415   }
2416   *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv,
2417                              x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
2418
2419   if (cpi->sf.adaptive_motion_search && cm->show_frame)
2420     x->pred_mv[ref].as_int = tmp_mv->as_int;
2421
2422   if (scaled_ref_frame) {
2423     int i;
2424     for (i = 0; i < MAX_MB_PLANE; i++)
2425       xd->plane[i].pre[0] = backup_yv12[i];
2426   }
2427 }
2428
2429 static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
2430                                 BLOCK_SIZE bsize,
2431                                 int_mv *frame_mv,
2432                                 int mi_row, int mi_col,
2433                                 int_mv single_newmv[MAX_REF_FRAMES],
2434                                 int *rate_mv) {
2435   const int pw = 4 * num_4x4_blocks_wide_lookup[bsize];
2436   const int ph = 4 * num_4x4_blocks_high_lookup[bsize];
2437   MACROBLOCKD *xd = &x->e_mbd;
2438   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
2439   const int refs[2] = { mbmi->ref_frame[0],
2440                         mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1] };
2441   int_mv ref_mv[2];
2442   int ite, ref;
2443   // Prediction buffer from second frame.
2444   uint8_t *second_pred = vpx_memalign(16, pw * ph * sizeof(uint8_t));
2445   const InterpKernel *kernel = vp9_get_interp_kernel(mbmi->interp_filter);
2446
2447   // Do joint motion search in compound mode to get more accurate mv.
2448   struct buf_2d backup_yv12[2][MAX_MB_PLANE];
2449   struct buf_2d scaled_first_yv12 = xd->plane[0].pre[0];
2450   int last_besterr[2] = {INT_MAX, INT_MAX};
2451   const YV12_BUFFER_CONFIG *const scaled_ref_frame[2] = {
2452     vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[0]),
2453     vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[1])
2454   };
2455
2456   for (ref = 0; ref < 2; ++ref) {
2457     ref_mv[ref] = mbmi->ref_mvs[refs[ref]][0];
2458
2459     if (scaled_ref_frame[ref]) {
2460       int i;
2461       // Swap out the reference frame for a version that's been scaled to
2462       // match the resolution of the current frame, allowing the existing
2463       // motion search code to be used without additional modifications.
2464       for (i = 0; i < MAX_MB_PLANE; i++)
2465         backup_yv12[ref][i] = xd->plane[i].pre[ref];
2466       vp9_setup_pre_planes(xd, ref, scaled_ref_frame[ref], mi_row, mi_col,
2467                            NULL);
2468     }
2469
2470     frame_mv[refs[ref]].as_int = single_newmv[refs[ref]].as_int;
2471   }
2472
2473   // Allow joint search multiple times iteratively for each ref frame
2474   // and break out the search loop if it couldn't find better mv.
2475   for (ite = 0; ite < 4; ite++) {
2476     struct buf_2d ref_yv12[2];
2477     int bestsme = INT_MAX;
2478     int sadpb = x->sadperbit16;
2479     MV tmp_mv;
2480     int search_range = 3;
2481
2482     int tmp_col_min = x->mv_col_min;
2483     int tmp_col_max = x->mv_col_max;
2484     int tmp_row_min = x->mv_row_min;
2485     int tmp_row_max = x->mv_row_max;
2486     int id = ite % 2;
2487
2488     // Initialized here because of compiler problem in Visual Studio.
2489     ref_yv12[0] = xd->plane[0].pre[0];
2490     ref_yv12[1] = xd->plane[0].pre[1];
2491
2492     // Get pred block from second frame.
2493     vp9_build_inter_predictor(ref_yv12[!id].buf,
2494                               ref_yv12[!id].stride,
2495                               second_pred, pw,
2496                               &frame_mv[refs[!id]].as_mv,
2497                               &xd->block_refs[!id]->sf,
2498                               pw, ph, 0,
2499                               kernel, MV_PRECISION_Q3,
2500                               mi_col * MI_SIZE, mi_row * MI_SIZE);
2501
2502     // Compound motion search on first ref frame.
2503     if (id)
2504       xd->plane[0].pre[0] = ref_yv12[id];
2505     vp9_set_mv_search_range(x, &ref_mv[id].as_mv);
2506
2507     // Use mv result from single mode as mvp.
2508     tmp_mv = frame_mv[refs[id]].as_mv;
2509
2510     tmp_mv.col >>= 3;
2511     tmp_mv.row >>= 3;
2512
2513     // Small-range full-pixel motion search
2514     bestsme = vp9_refining_search_8p_c(x, &tmp_mv, sadpb,
2515                                        search_range,
2516                                        &cpi->fn_ptr[bsize],
2517                                        &ref_mv[id].as_mv, second_pred,
2518                                        pw, ph);
2519     if (bestsme < INT_MAX)
2520       bestsme = vp9_get_mvpred_av_var(x, &tmp_mv, &ref_mv[id].as_mv,
2521                                       second_pred, &cpi->fn_ptr[bsize], 1);
2522
2523     x->mv_col_min = tmp_col_min;
2524     x->mv_col_max = tmp_col_max;
2525     x->mv_row_min = tmp_row_min;
2526     x->mv_row_max = tmp_row_max;
2527
2528     if (bestsme < INT_MAX) {
2529       int dis; /* TODO: use dis in distortion calculation later. */
2530       unsigned int sse;
2531       bestsme = cpi->find_fractional_mv_step_comp(
2532           x, &tmp_mv,
2533           &ref_mv[id].as_mv,
2534           cpi->common.allow_high_precision_mv,
2535           x->errorperbit,
2536           &cpi->fn_ptr[bsize],
2537           0, cpi->sf.subpel_iters_per_step,
2538           x->nmvjointcost, x->mvcost,
2539           &dis, &sse, second_pred,
2540           pw, ph);
2541     }
2542
2543     if (id)
2544       xd->plane[0].pre[0] = scaled_first_yv12;
2545
2546     if (bestsme < last_besterr[id]) {
2547       frame_mv[refs[id]].as_mv = tmp_mv;
2548       last_besterr[id] = bestsme;
2549     } else {
2550       break;
2551     }
2552   }
2553
2554   *rate_mv = 0;
2555
2556   for (ref = 0; ref < 2; ++ref) {
2557     if (scaled_ref_frame[ref]) {
2558       // restore the predictor
2559       int i;
2560       for (i = 0; i < MAX_MB_PLANE; i++)
2561         xd->plane[i].pre[ref] = backup_yv12[ref][i];
2562     }
2563
2564     *rate_mv += vp9_mv_bit_cost(&frame_mv[refs[ref]].as_mv,
2565                                 &mbmi->ref_mvs[refs[ref]][0].as_mv,
2566                                 x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
2567   }
2568
2569   vpx_free(second_pred);
2570 }
2571
2572 static INLINE void restore_dst_buf(MACROBLOCKD *xd,
2573                                    uint8_t *orig_dst[MAX_MB_PLANE],
2574                                    int orig_dst_stride[MAX_MB_PLANE]) {
2575   int i;
2576   for (i = 0; i < MAX_MB_PLANE; i++) {
2577     xd->plane[i].dst.buf = orig_dst[i];
2578     xd->plane[i].dst.stride = orig_dst_stride[i];
2579   }
2580 }
2581
// Evaluates the inter prediction mode currently stored in mbmi->mode for the
// block at (mi_row, mi_col) and accumulates its rate and distortion.
//
// Steps, in order:
//  1. For NEWMV, run the motion search (joint for compound prediction when
//     the speed feature allows it, single otherwise) and add the MV cost to
//     *rate2.
//  2. Copy the mode's MVs into mbmi->mv[], clamping them for non-NEWMV
//     modes, and reject the mode if mv_check_bounds() flags any of them.
//  3. Add the mode cost and, unless already set, derive *mode_excluded from
//     the frame-level reference mode.
//  4. Unless the frame filter is BILINEAR or the source variance is below
//     the speed-feature threshold, try every switchable interpolation
//     filter using a modelled rd cost and record the best in *best_filter.
//  5. Optionally bail out when the modelled rd is more than twice
//     ref_best_rd.
//  6. For single prediction, apply the active-map / encode-breakout skip
//     tests, which may set x->skip.
//  7. If the block is not skipped, run the luma and chroma rd searches and
//     add their results to *rate2 / *distortion / *psse.
//
// The destination buffers of |xd| are restored before returning once they
// have been recorded (step 4 onward). Returns INT64_MAX when the mode is
// rejected; otherwise returns this_rd, which is 0 unless the encode-breakout
// path computed it (the caller recalculates it when it is 0).
//
// NOTE(review): *psse is dereferenced unconditionally in step 7 although the
// active-map path checks it for NULL; callers presumably always pass a valid
// pointer - confirm.
static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                 BLOCK_SIZE bsize,
                                 int64_t txfm_cache[],
                                 int *rate2, int64_t *distortion,
                                 int *skippable,
                                 int *rate_y, int64_t *distortion_y,
                                 int *rate_uv, int64_t *distortion_uv,
                                 int *mode_excluded, int *disable_skip,
                                 INTERP_FILTER *best_filter,
                                 int_mv (*mode_mv)[MAX_REF_FRAMES],
                                 int mi_row, int mi_col,
                                 int_mv single_newmv[MAX_REF_FRAMES],
                                 int64_t *psse,
                                 const int64_t ref_best_rd) {
  VP9_COMMON *cm = &cpi->common;
  RD_OPT *rd_opt = &cpi->rd;
  MACROBLOCKD *xd = &x->e_mbd;
  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
  const int is_comp_pred = has_second_ref(mbmi);
  const int num_refs = is_comp_pred ? 2 : 1;
  const int this_mode = mbmi->mode;
  int_mv *frame_mv = mode_mv[this_mode];
  int i;
  // A negative second reference is mapped to index 0 so that refs[1] is
  // always a valid array index.
  int refs[2] = { mbmi->ref_frame[0],
    (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
  int_mv cur_mv[2];
  int64_t this_rd = 0;
  // Scratch prediction buffer: one 64x64 region (stride 64) per plane.
  DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf, MAX_MB_PLANE * 64 * 64);
  int pred_exists = 0;
  int intpel_mv;
  int64_t rd, best_rd = INT64_MAX;
  int best_needs_copy = 0;
  uint8_t *orig_dst[MAX_MB_PLANE];
  int orig_dst_stride[MAX_MB_PLANE];
  int rs = 0;

  // A compound mode cannot be evaluated if either of its MVs is invalid.
  if (is_comp_pred) {
    if (frame_mv[refs[0]].as_int == INVALID_MV ||
        frame_mv[refs[1]].as_int == INVALID_MV)
      return INT64_MAX;
  }

  if (this_mode == NEWMV) {
    int rate_mv;
    if (is_comp_pred) {
      // Initialize mv using single prediction mode result.
      frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
      frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;

      if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
        joint_motion_search(cpi, x, bsize, frame_mv,
                            mi_row, mi_col, single_newmv, &rate_mv);
      } else {
        // No joint refinement: just cost the two single-prediction MVs.
        rate_mv  = vp9_mv_bit_cost(&frame_mv[refs[0]].as_mv,
                                   &mbmi->ref_mvs[refs[0]][0].as_mv,
                                   x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
        rate_mv += vp9_mv_bit_cost(&frame_mv[refs[1]].as_mv,
                                   &mbmi->ref_mvs[refs[1]][0].as_mv,
                                   x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
      }
      *rate2 += rate_mv;
    } else {
      int_mv tmp_mv;
      single_motion_search(cpi, x, bsize, mi_row, mi_col,
                           &tmp_mv, &rate_mv);
      // rate_mv is only meaningful when the search produced a valid MV.
      if (tmp_mv.as_int == INVALID_MV)
        return INT64_MAX;
      *rate2 += rate_mv;
      frame_mv[refs[0]].as_int =
          xd->mi[0]->bmi[0].as_mv[0].as_int = tmp_mv.as_int;
      // Remember the result so compound NEWMV can start from it.
      single_newmv[refs[0]].as_int = tmp_mv.as_int;
    }
  }

  for (i = 0; i < num_refs; ++i) {
    cur_mv[i] = frame_mv[refs[i]];
    // Clip "next_nearest" so that it does not extend too far out of image
    if (this_mode != NEWMV)
      clamp_mv2(&cur_mv[i].as_mv, xd);

    if (mv_check_bounds(x, &cur_mv[i].as_mv))
      return INT64_MAX;
    mbmi->mv[i].as_int = cur_mv[i].as_int;
  }

  // do first prediction into the destination buffer. Do the next
  // prediction into a temporary buffer. Then keep track of which one
  // of these currently holds the best predictor, and use the other
  // one for future predictions. In the end, copy from tmp_buf to
  // dst if necessary.
  for (i = 0; i < MAX_MB_PLANE; i++) {
    orig_dst[i] = xd->plane[i].dst.buf;
    orig_dst_stride[i] = xd->plane[i].dst.stride;
  }

  /* We don't include the cost of the second reference here, because there
   * are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other
   * words if you present them in that order, the second one is always known
   * if the first is known */
  *rate2 += cost_mv_ref(cpi, this_mode, mbmi->mode_context[refs[0]]);

  // A mode is excluded when its prediction type (single vs. compound) does
  // not match a fixed frame-level reference mode.
  if (!(*mode_excluded))
    *mode_excluded = is_comp_pred ? cm->reference_mode == SINGLE_REFERENCE
                                  : cm->reference_mode == COMPOUND_REFERENCE;

  pred_exists = 0;
  // Are all MVs integer pel for Y and UV
  intpel_mv = !mv_has_subpel(&mbmi->mv[0].as_mv);
  if (is_comp_pred)
    intpel_mv &= !mv_has_subpel(&mbmi->mv[1].as_mv);

  // Search for best switchable filter by checking the variance of
  // pred error irrespective of whether the filter will be used
  rd_opt->mask_filter = 0;
  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
    rd_opt->filter_cache[i] = INT64_MAX;

  if (cm->interp_filter != BILINEAR) {
    *best_filter = EIGHTTAP;
    if (x->source_variance < cpi->sf.disable_filter_search_var_thresh) {
      *best_filter = EIGHTTAP;
    } else {
      int newbest;
      // Modelled rate/distortion of filter 0, reused for the other filters
      // when all MVs are integer-pel.
      int tmp_rate_sum = 0;
      int64_t tmp_dist_sum = 0;

      for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
        int j;
        int64_t rs_rd;
        mbmi->interp_filter = i;
        rs = vp9_get_switchable_rate(cpi);
        // rd cost of signalling this filter on its own.
        rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);

        if (i > 0 && intpel_mv) {
          // Integer-pel MVs: reuse the numbers obtained for filter 0 instead
          // of building the predictor again.
          rd = RDCOST(x->rdmult, x->rddiv, tmp_rate_sum, tmp_dist_sum);
          rd_opt->filter_cache[i] = rd;
          rd_opt->filter_cache[SWITCHABLE_FILTERS] =
              MIN(rd_opt->filter_cache[SWITCHABLE_FILTERS], rd + rs_rd);
          if (cm->interp_filter == SWITCHABLE)
            rd += rs_rd;
          rd_opt->mask_filter = MAX(rd_opt->mask_filter, rd);
        } else {
          int rate_sum = 0;
          int64_t dist_sum = 0;
          // Choose whether this prediction is written to the real
          // destination or to tmp_buf (see the buffer-juggling note above).
          if ((cm->interp_filter == SWITCHABLE &&
               (!i || best_needs_copy)) ||
              (cm->interp_filter != SWITCHABLE &&
               (cm->interp_filter == mbmi->interp_filter ||
                (i == 0 && intpel_mv)))) {
            restore_dst_buf(xd, orig_dst, orig_dst_stride);
          } else {
            for (j = 0; j < MAX_MB_PLANE; j++) {
              xd->plane[j].dst.buf = tmp_buf + j * 64 * 64;
              xd->plane[j].dst.stride = 64;
            }
          }
          vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
          model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum);

          rd = RDCOST(x->rdmult, x->rddiv, rate_sum, dist_sum);
          rd_opt->filter_cache[i] = rd;
          rd_opt->filter_cache[SWITCHABLE_FILTERS] =
              MIN(rd_opt->filter_cache[SWITCHABLE_FILTERS], rd + rs_rd);
          if (cm->interp_filter == SWITCHABLE)
            rd += rs_rd;
          rd_opt->mask_filter = MAX(rd_opt->mask_filter, rd);

          if (i == 0 && intpel_mv) {
            tmp_rate_sum = rate_sum;
            tmp_dist_sum = dist_sum;
          }
        }

        // Early exit: even the first filter's modelled rd is more than
        // twice the best rd found so far.
        if (i == 0 && cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
          if (rd / 2 > ref_best_rd) {
            restore_dst_buf(xd, orig_dst, orig_dst_stride);
            return INT64_MAX;
          }
        }
        newbest = i == 0 || rd < best_rd;

        if (newbest) {
          best_rd = rd;
          *best_filter = mbmi->interp_filter;
          // The best predictor has moved to the other buffer.
          if (cm->interp_filter == SWITCHABLE && i && !intpel_mv)
            best_needs_copy = !best_needs_copy;
        }

        if ((cm->interp_filter == SWITCHABLE && newbest) ||
            (cm->interp_filter != SWITCHABLE &&
             cm->interp_filter == mbmi->interp_filter)) {
          pred_exists = 1;
        }
      }
      restore_dst_buf(xd, orig_dst, orig_dst_stride);
    }
  }
  // Set the appropriate filter
  mbmi->interp_filter = cm->interp_filter != SWITCHABLE ?
      cm->interp_filter : *best_filter;
  rs = cm->interp_filter == SWITCHABLE ? vp9_get_switchable_rate(cpi) : 0;

  if (pred_exists) {
    if (best_needs_copy) {
      // again temporarily set the buffers to local memory to prevent a memcpy
      for (i = 0; i < MAX_MB_PLANE; i++) {
        xd->plane[i].dst.buf = tmp_buf + i * 64 * 64;
        xd->plane[i].dst.stride = 64;
      }
    }
  } else {
    // Handles the special case when a filter that is not in the
    // switchable list (ex. bilinear, 6-tap) is indicated at the frame level
    vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
  }

  if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
    int tmp_rate;
    int64_t tmp_dist;
    model_rd_for_sb(cpi, bsize, x, xd, &tmp_rate, &tmp_dist);
    rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist);
    // if current pred_error modeled rd is substantially more than the best
    // so far, do not bother doing full rd
    if (rd / 2 > ref_best_rd) {
      restore_dst_buf(xd, orig_dst, orig_dst_stride);
      return INT64_MAX;
    }
  }

  if (cm->interp_filter == SWITCHABLE)
    *rate2 += vp9_get_switchable_rate(cpi);

  if (!is_comp_pred) {
    if (!x->in_active_map) {
      // Block is outside the active map: force a skip with zero distortion.
      if (psse)
        *psse = 0;
      *distortion = 0;
      x->skip = 1;
    } else if (cpi->allow_encode_breakout && x->encode_breakout) {
      const BLOCK_SIZE y_size = get_plane_block_size(bsize, &xd->plane[0]);
      const BLOCK_SIZE uv_size = get_plane_block_size(bsize, &xd->plane[1]);
      unsigned int var, sse;
      // Skipping threshold for ac.
      unsigned int thresh_ac;
      // Set a maximum for threshold to avoid big PSNR loss in low bitrate case.
      // Use extreme low threshold for static frames to limit skipping.
      const unsigned int max_thresh = (cpi->allow_encode_breakout ==
                                      ENCODE_BREAKOUT_LIMITED) ? 128 : 36000;
      // The encode_breakout input
      const unsigned int min_thresh =
          MIN(((unsigned int)x->encode_breakout << 4), max_thresh);

      // Calculate threshold according to dequant value.
      thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) / 9;
      thresh_ac = clamp(thresh_ac, min_thresh, max_thresh);

      var = cpi->fn_ptr[y_size].vf(x->plane[0].src.buf, x->plane[0].src.stride,
                                   xd->plane[0].dst.buf,
                                   xd->plane[0].dst.stride, &sse);

      // Adjust threshold according to partition size.
      thresh_ac >>= 8 - (b_width_log2_lookup[bsize] +
          b_height_log2_lookup[bsize]);

      // Y skipping condition checking
      if (sse < thresh_ac || sse == 0) {
        // Skipping threshold for dc
        unsigned int thresh_dc;

        thresh_dc = (xd->plane[0].dequant[0] * xd->plane[0].dequant[0] >> 6);

        // dc skipping checking
        if ((sse - var) < thresh_dc || sse == var) {
          unsigned int sse_u, sse_v;
          unsigned int var_u, var_v;

          var_u = cpi->fn_ptr[uv_size].vf(x->plane[1].src.buf,
                                          x->plane[1].src.stride,
                                          xd->plane[1].dst.buf,
                                          xd->plane[1].dst.stride, &sse_u);

          // U skipping condition checking
          if ((sse_u * 4 < thresh_ac || sse_u == 0) &&
              (sse_u - var_u < thresh_dc || sse_u == var_u)) {
            var_v = cpi->fn_ptr[uv_size].vf(x->plane[2].src.buf,
                                            x->plane[2].src.stride,
                                            xd->plane[2].dst.buf,
                                            xd->plane[2].dst.stride, &sse_v);

            // V skipping condition checking
            if ((sse_v * 4 < thresh_ac || sse_v == 0) &&
                (sse_v - var_v < thresh_dc || sse_v == var_v)) {
              x->skip = 1;

              // The cost of skip bit needs to be added.
              *rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);

              // Scaling factor for SSE from spatial domain to frequency domain
              // is 16. Adjust distortion accordingly.
              *distortion_uv = (sse_u + sse_v) << 4;
              *distortion = (sse << 4) + *distortion_uv;

              *disable_skip = 1;
              this_rd = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion);
            }
          }
        }
      }
    }
  }

  if (!x->skip) {
    int skippable_y, skippable_uv;
    int64_t sseuv = INT64_MAX;
    int64_t rdcosty = INT64_MAX;

    // Y cost and distortion
    inter_super_block_yrd(cpi, x, rate_y, distortion_y, &skippable_y, psse,
                          bsize, txfm_cache, ref_best_rd);

    // A rate of INT_MAX signals that the luma search gave up.
    if (*rate_y == INT_MAX) {
      *rate2 = INT_MAX;
      *distortion = INT64_MAX;
      restore_dst_buf(xd, orig_dst, orig_dst_stride);
      return INT64_MAX;
    }

    *rate2 += *rate_y;
    *distortion += *distortion_y;

    // Luma rd so far, capped by the cost of coding nothing (rate 0, full
    // SSE); the remainder of ref_best_rd is the budget for chroma.
    rdcosty = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion);
    rdcosty = MIN(rdcosty, RDCOST(x->rdmult, x->rddiv, 0, *psse));

    super_block_uvrd(cpi, x, rate_uv, distortion_uv, &skippable_uv, &sseuv,
                     bsize, ref_best_rd - rdcosty);
    if (*rate_uv == INT_MAX) {
      *rate2 = INT_MAX;
      *distortion = INT64_MAX;
      restore_dst_buf(xd, orig_dst, orig_dst_stride);
      return INT64_MAX;
    }

    *psse += sseuv;
    *rate2 += *rate_uv;
    *distortion += *distortion_uv;
    *skippable = skippable_y && skippable_uv;
  }

  restore_dst_buf(xd, orig_dst, orig_dst_stride);
  return this_rd;  // if 0, this will be re-calculated by caller
}
2933
2934 static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
2935                            int max_plane) {
2936   struct macroblock_plane *const p = x->plane;
2937   struct macroblockd_plane *const pd = x->e_mbd.plane;
2938   int i;
2939
2940   for (i = 0; i < max_plane; ++i) {
2941     p[i].coeff    = ctx->coeff_pbuf[i][1];
2942     p[i].qcoeff  = ctx->qcoeff_pbuf[i][1];
2943     pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1];
2944     p[i].eobs    = ctx->eobs_pbuf[i][1];
2945
2946     ctx->coeff_pbuf[i][1]   = ctx->coeff_pbuf[i][0];
2947     ctx->qcoeff_pbuf[i][1]  = ctx->qcoeff_pbuf[i][0];
2948     ctx->dqcoeff_pbuf[i][1] = ctx->dqcoeff_pbuf[i][0];
2949     ctx->eobs_pbuf[i][1]    = ctx->eobs_pbuf[i][0];
2950
2951     ctx->coeff_pbuf[i][0]   = p[i].coeff;
2952     ctx->qcoeff_pbuf[i][0]  = p[i].qcoeff;
2953     ctx->dqcoeff_pbuf[i][0] = pd[i].dqcoeff;
2954     ctx->eobs_pbuf[i][0]    = p[i].eobs;
2955   }
2956 }
2957
2958 void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
2959                                int *returnrate, int64_t *returndist,
2960                                BLOCK_SIZE bsize,
2961                                PICK_MODE_CONTEXT *ctx, int64_t best_rd) {
2962   VP9_COMMON *const cm = &cpi->common;
2963   MACROBLOCKD *const xd = &x->e_mbd;
2964   int rate_y = 0, rate_uv = 0, rate_y_tokenonly = 0, rate_uv_tokenonly = 0;
2965   int y_skip = 0, uv_skip = 0;
2966   int64_t dist_y = 0, dist_uv = 0, tx_cache[TX_MODES] = { 0 };
2967   TX_SIZE max_uv_tx_size;
2968   x->skip_encode = 0;
2969   ctx->skip = 0;
2970   xd->mi[0]->mbmi.ref_frame[0] = INTRA_FRAME;
2971
2972   if (bsize >= BLOCK_8X8) {
2973     if (rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,
2974                                &dist_y, &y_skip, bsize, tx_cache,
2975                                best_rd) >= best_rd) {
2976       *returnrate = INT_MAX;
2977       return;
2978     }
2979     max_uv_tx_size = get_uv_tx_size_impl(xd->mi[0]->mbmi.tx_size, bsize);
2980     rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv, &rate_uv_tokenonly,
2981                             &dist_uv, &uv_skip, bsize, max_uv_tx_size);
2982   } else {
2983     y_skip = 0;
2984     if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate_y, &rate_y_tokenonly,
2985                                      &dist_y, best_rd) >= best_rd) {
2986       *returnrate = INT_MAX;
2987       return;
2988     }
2989     max_uv_tx_size = get_uv_tx_size_impl(xd->mi[0]->mbmi.tx_size, bsize);
2990     rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv, &rate_uv_tokenonly,
2991                             &dist_uv, &uv_skip, BLOCK_8X8, max_uv_tx_size);
2992   }
2993
2994   if (y_skip && uv_skip) {
2995     *returnrate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly +
2996                   vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
2997     *returndist = dist_y + dist_uv;
2998     vp9_zero(ctx->tx_rd_diff);
2999   } else {
3000     int i;
3001     *returnrate = rate_y + rate_uv + vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
3002     *returndist = dist_y + dist_uv;
3003     if (cpi->sf.tx_size_search_method == USE_FULL_RD)
3004       for (i = 0; i < TX_MODES; i++) {
3005         if (tx_cache[i] < INT64_MAX && tx_cache[cm->tx_mode] < INT64_MAX)
3006           ctx->tx_rd_diff[i] = tx_cache[i] - tx_cache[cm->tx_mode];
3007         else
3008           ctx->tx_rd_diff[i] = 0;
3009       }
3010   }
3011
3012   ctx->mic = *xd->mi[0];
3013 }
3014
// Returns non-zero when |best_rd| is below the threshold scaled by
// thresh_fact / 32, or when |thresh| is INT_MAX. The product is formed in
// 64 bits.
static INLINE int rd_less_than_thresh(int64_t best_rd, int thresh,
                                      int thresh_fact) {
  const int64_t scaled_thresh = ((int64_t)thresh * thresh_fact) >> 5;

  if (thresh == INT_MAX)
    return 1;

  return best_rd < scaled_thresh;
}
3019
3020 // Updating rd_thresh_freq_fact[] here means that the different
3021 // partition/block sizes are handled independently based on the best
3022 // choice for the current partition. It may well be better to keep a scaled
3023 // best rd so far value and update rd_thresh_freq_fact based on the mode/size
3024 // combination that wins out.
3025 static void update_rd_thresh_fact(VP9_COMP *cpi, int bsize,
3026                                   int best_mode_index) {
3027   if (cpi->sf.adaptive_rd_thresh > 0) {
3028     const int top_mode = bsize < BLOCK_8X8 ? MAX_REFS : MAX_MODES;
3029     int mode;
3030     for (mode = 0; mode < top_mode; ++mode) {
3031       int *const fact = &cpi->rd.thresh_freq_fact[bsize][mode];
3032
3033       if (mode == best_mode_index) {
3034         *fact -= (*fact >> 3);
3035       } else {
3036         *fact = MIN(*fact + RD_THRESH_INC,
3037                     cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT);
3038       }
3039     }
3040   }
3041 }
3042
3043 int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
3044                                   const TileInfo *const tile,
3045                                   int mi_row, int mi_col,
3046                                   int *returnrate,
3047                                   int64_t *returndistortion,
3048                                   BLOCK_SIZE bsize,
3049                                   PICK_MODE_CONTEXT *ctx,
3050                                   int64_t best_rd_so_far) {
3051   VP9_COMMON *const cm = &cpi->common;
3052   RD_OPT *const rd_opt = &cpi->rd;
3053   MACROBLOCKD *const xd = &x->e_mbd;
3054   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
3055   const struct segmentation *const seg = &cm->seg;
3056   PREDICTION_MODE this_mode;
3057   MV_REFERENCE_FRAME ref_frame, second_ref_frame;
3058   unsigned char segment_id = mbmi->segment_id;
3059   int comp_pred, i;
3060   int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
3061   struct buf_2d yv12_mb[4][MAX_MB_PLANE];
3062   int_mv single_newmv[MAX_REF_FRAMES] = { { 0 } };
3063   static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
3064                                     VP9_ALT_FLAG };
3065   int64_t best_rd = best_rd_so_far;
3066   int64_t best_tx_rd[TX_MODES];
3067   int64_t best_tx_diff[TX_MODES];
3068   int64_t best_pred_diff[REFERENCE_MODES];
3069   int64_t best_pred_rd[REFERENCE_MODES];
3070   int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
3071   int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
3072   MB_MODE_INFO best_mbmode = { 0 };
3073   int mode_index, best_mode_index = -1;
3074   unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
3075   vp9_prob comp_mode_p;
3076   int64_t best_intra_rd = INT64_MAX;
3077   int64_t best_inter_rd = INT64_MAX;
3078   PREDICTION_MODE best_intra_mode = DC_PRED;
3079   MV_REFERENCE_FRAME best_inter_ref_frame = LAST_FRAME;
3080   INTERP_FILTER tmp_best_filter = SWITCHABLE;
3081   int rate_uv_intra[TX_SIZES], rate_uv_tokenonly[TX_SIZES];
3082   int64_t dist_uv[TX_SIZES];
3083   int skip_uv[TX_SIZES];
3084   PREDICTION_MODE mode_uv[TX_SIZES];
3085   int64_t mode_distortions[MB_MODE_COUNT] = {-1};
3086   int intra_cost_penalty = 20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q);
3087   const int bws = num_8x8_blocks_wide_lookup[bsize] / 2;
3088   const int bhs = num_8x8_blocks_high_lookup[bsize] / 2;
3089   int best_skip2 = 0;
3090   int mode_skip_mask = 0;
3091   int mode_skip_start = cpi->sf.mode_skip_start + 1;
3092   const int *const rd_threshes = rd_opt->threshes[segment_id][bsize];
3093   const int *const rd_thresh_freq_fact = rd_opt->thresh_freq_fact[bsize];
3094   const int mode_search_skip_flags = cpi->sf.mode_search_skip_flags;
3095   const int intra_y_mode_mask =
3096       cpi->sf.intra_y_mode_mask[max_txsize_lookup[bsize]];
3097   int disable_inter_mode_mask = cpi->sf.disable_inter_mode_mask[bsize];
3098
3099   x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
3100
3101   estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp,
3102                            &comp_mode_p);
3103
3104   for (i = 0; i < REFERENCE_MODES; ++i)
3105     best_pred_rd[i] = INT64_MAX;
3106   for (i = 0; i < TX_MODES; i++)
3107     best_tx_rd[i] = INT64_MAX;
3108   for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
3109     best_filter_rd[i] = INT64_MAX;
3110   for (i = 0; i < TX_SIZES; i++)
3111     rate_uv_intra[i] = INT_MAX;
3112   for (i = 0; i < MAX_REF_FRAMES; ++i)
3113     x->pred_sse[i] = INT_MAX;
3114
3115   *returnrate = INT_MAX;
3116
3117   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
3118     x->pred_mv_sad[ref_frame] = INT_MAX;
3119     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
3120       vp9_setup_buffer_inter(cpi, x, tile,
3121                              ref_frame, bsize, mi_row, mi_col,
3122                              frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb);
3123     }
3124     frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
3125     frame_mv[ZEROMV][ref_frame].as_int = 0;
3126   }
3127
3128   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
3129     // All modes from vp9_mode_order that use this frame as any ref
3130     static const int ref_frame_mask_all[] = {
3131         0x0, 0x123291, 0x25c444, 0x39b722
3132     };
3133     // Fixed mv modes (NEARESTMV, NEARMV, ZEROMV) from vp9_mode_order that use
3134     // this frame as their primary ref
3135     static const int ref_frame_mask_fixedmv[] = {
3136         0x0, 0x121281, 0x24c404, 0x080102
3137     };
3138     if (!(cpi->ref_frame_flags & flag_list[ref_frame])) {
3139       // Skip modes for missing references
3140       mode_skip_mask |= ref_frame_mask_all[ref_frame];
3141     } else if (cpi->sf.reference_masking) {
3142       for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
3143         // Skip fixed mv modes for poor references
3144         if ((x->pred_mv_sad[ref_frame] >> 2) > x->pred_mv_sad[i]) {
3145           mode_skip_mask |= ref_frame_mask_fixedmv[ref_frame];
3146           break;
3147         }
3148       }
3149     }
3150     // If the segment reference frame feature is enabled....
3151     // then do nothing if the current ref frame is not allowed..
3152     if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
3153         vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) {
3154       mode_skip_mask |= ref_frame_mask_all[ref_frame];
3155     }
3156   }
3157
3158   // If the segment skip feature is enabled....
3159   // then do nothing if the current mode is not allowed..
3160   if (vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)) {
3161     const int inter_non_zero_mode_mask = 0x1F7F7;
3162     mode_skip_mask |= inter_non_zero_mode_mask;
3163   }
3164
3165   // Disable this drop out case if the ref frame
3166   // segment level feature is enabled for this segment. This is to
3167   // prevent the possibility that we end up unable to pick any mode.
3168   if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
3169     // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
3170     // unless ARNR filtering is enabled in which case we want
3171     // an unfiltered alternative. We allow near/nearest as well
3172     // because they may result in zero-zero MVs but be cheaper.
3173     if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
3174       mode_skip_mask =
3175           ~((1 << THR_NEARESTA) | (1 << THR_NEARA) | (1 << THR_ZEROA));
3176       if (frame_mv[NEARMV][ALTREF_FRAME].as_int != 0)
3177         mode_skip_mask |= (1 << THR_NEARA);
3178       if (frame_mv[NEARESTMV][ALTREF_FRAME].as_int != 0)
3179         mode_skip_mask |= (1 << THR_NEARESTA);
3180     }
3181   }
3182
3183   // TODO(JBB): This is to make up for the fact that we don't have sad
3184   // functions that work when the block size reads outside the umv.  We
3185   // should fix this either by making the motion search just work on
3186   // a representative block in the boundary ( first ) and then implement a
3187   // function that does sads when inside the border..
3188   if ((mi_row + bhs) > cm->mi_rows || (mi_col + bws) > cm->mi_cols) {
3189     const int new_modes_mask =
3190         (1 << THR_NEWMV) | (1 << THR_NEWG) | (1 << THR_NEWA) |
3191         (1 << THR_COMP_NEWLA) | (1 << THR_COMP_NEWGA);
3192     mode_skip_mask |= new_modes_mask;
3193   }
3194
3195   if (bsize > cpi->sf.max_intra_bsize) {
3196     mode_skip_mask |= 0xFF30808;
3197   }
3198
3199   if (!x->in_active_map) {
3200     int mode_index;
3201     assert(cpi->ref_frame_flags & VP9_LAST_FLAG);
3202     if (frame_mv[NEARESTMV][LAST_FRAME].as_int == 0)
3203       mode_index = THR_NEARESTMV;
3204     else if (frame_mv[NEARMV][LAST_FRAME].as_int == 0)
3205       mode_index = THR_NEARMV;
3206     else
3207       mode_index = THR_ZEROMV;
3208     mode_skip_mask = ~(1 << mode_index);
3209     mode_skip_start = MAX_MODES;
3210     disable_inter_mode_mask = 0;
3211   }
3212
3213   for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
3214     int mode_excluded = 0;
3215     int64_t this_rd = INT64_MAX;
3216     int disable_skip = 0;
3217     int compmode_cost = 0;
3218     int rate2 = 0, rate_y = 0, rate_uv = 0;
3219     int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0;
3220     int skippable = 0;
3221     int64_t tx_cache[TX_MODES];
3222     int i;
3223     int this_skip2 = 0;
3224     int64_t total_sse = INT64_MAX;
3225     int early_term = 0;
3226
3227     // Look at the reference frame of the best mode so far and set the
3228     // skip mask to look at a subset of the remaining modes.
3229     if (mode_index == mode_skip_start && best_mode_index >= 0) {
3230       switch (vp9_mode_order[best_mode_index].ref_frame[0]) {
3231         case INTRA_FRAME:
3232           break;
3233         case LAST_FRAME:
3234           mode_skip_mask |= LAST_FRAME_MODE_MASK;
3235           break;
3236         case GOLDEN_FRAME:
3237           mode_skip_mask |= GOLDEN_FRAME_MODE_MASK;
3238           break;
3239         case ALTREF_FRAME:
3240           mode_skip_mask |= ALT_REF_MODE_MASK;
3241           break;
3242         case NONE:
3243         case MAX_REF_FRAMES:
3244           assert(0 && "Invalid Reference frame");
3245       }
3246     }
3247     if (mode_skip_mask & (1 << mode_index))
3248       continue;
3249
3250     // Test best rd so far against threshold for trying this mode.
3251     if (rd_less_than_thresh(best_rd, rd_threshes[mode_index],
3252         rd_thresh_freq_fact[mode_index]))
3253       continue;
3254
3255     this_mode = vp9_mode_order[mode_index].mode;
3256     ref_frame = vp9_mode_order[mode_index].ref_frame[0];
3257     if (ref_frame != INTRA_FRAME &&
3258         disable_inter_mode_mask & (1 << INTER_OFFSET(this_mode)))
3259       continue;
3260     second_ref_frame = vp9_mode_order[mode_index].ref_frame[1];
3261
3262     comp_pred = second_ref_frame > INTRA_FRAME;
3263     if (comp_pred) {
3264       if ((mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) &&
3265           best_mode_index >=0 &&
3266           vp9_mode_order[best_mode_index].ref_frame[0] == INTRA_FRAME)
3267         continue;
3268       if ((mode_search_skip_flags & FLAG_SKIP_COMP_REFMISMATCH) &&
3269           ref_frame != best_inter_ref_frame &&
3270           second_ref_frame != best_inter_ref_frame)
3271         continue;
3272       mode_excluded = cm->reference_mode == SINGLE_REFERENCE;
3273     } else {
3274       if (ref_frame != INTRA_FRAME)
3275         mode_excluded = cm->reference_mode == COMPOUND_REFERENCE;
3276     }
3277
3278     if (ref_frame == INTRA_FRAME) {
3279       if (!(intra_y_mode_mask & (1 << this_mode)))
3280         continue;
3281       if (this_mode != DC_PRED) {
3282         // Disable intra modes other than DC_PRED for blocks with low variance
3283         // Threshold for intra skipping based on source variance
3284         // TODO(debargha): Specialize the threshold for super block sizes
3285         const unsigned int skip_intra_var_thresh = 64;
3286         if ((mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) &&
3287             x->source_variance < skip_intra_var_thresh)
3288           continue;
3289         // Only search the oblique modes if the best so far is
3290         // one of the neighboring directional modes
3291         if ((mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
3292             (this_mode >= D45_PRED && this_mode <= TM_PRED)) {
3293           if (best_mode_index >= 0 &&
3294               vp9_mode_order[best_mode_index].ref_frame[0] > INTRA_FRAME)
3295             continue;
3296         }
3297         if (mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
3298           if (conditional_skipintra(this_mode, best_intra_mode))
3299               continue;
3300         }
3301       }
3302     } else {
3303       if (x->in_active_map &&
3304           !vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
3305         const MV_REFERENCE_FRAME ref_frames[2] = {ref_frame, second_ref_frame};
3306         if (!check_best_zero_mv(cpi, mbmi->mode_context, frame_mv,
3307                                 disable_inter_mode_mask, this_mode, ref_frames))
3308           continue;
3309       }
3310     }
3311
3312     mbmi->mode = this_mode;
3313     mbmi->uv_mode = x->in_active_map ? DC_PRED : this_mode;
3314     mbmi->ref_frame[0] = ref_frame;
3315     mbmi->ref_frame[1] = second_ref_frame;
3316     // Evaluate all sub-pel filters irrespective of whether we can use
3317     // them for this frame.
3318     mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP
3319                                                           : cm->interp_filter;
3320     x->skip = 0;
3321     set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
3322
3323     // Select prediction reference frames.
3324     for (i = 0; i < MAX_MB_PLANE; i++) {
3325       xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
3326       if (comp_pred)
3327         xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
3328     }
3329
3330     for (i = 0; i < TX_MODES; ++i)
3331       tx_cache[i] = INT64_MAX;
3332
3333     if (ref_frame == INTRA_FRAME) {
3334       TX_SIZE uv_tx;
3335       intra_super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL,
3336                             bsize, tx_cache, best_rd);
3337
3338       if (rate_y == INT_MAX)
3339         continue;
3340
3341       uv_tx = get_uv_tx_size_impl(mbmi->tx_size, bsize);
3342       if (rate_uv_intra[uv_tx] == INT_MAX) {
3343         choose_intra_uv_mode(cpi, ctx, bsize, uv_tx,
3344                              &rate_uv_intra[uv_tx], &rate_uv_tokenonly[uv_tx],
3345                              &dist_uv[uv_tx], &skip_uv[uv_tx], &mode_uv[uv_tx]);
3346       }
3347
3348       rate_uv = rate_uv_tokenonly[uv_tx];
3349       distortion_uv = dist_uv[uv_tx];
3350       skippable = skippable && skip_uv[uv_tx];
3351       mbmi->uv_mode = mode_uv[uv_tx];
3352
3353       rate2 = rate_y + cpi->mbmode_cost[mbmi->mode] + rate_uv_intra[uv_tx];
3354       if (this_mode != DC_PRED && this_mode != TM_PRED)
3355         rate2 += intra_cost_penalty;
3356       distortion2 = distortion_y + distortion_uv;
3357     } else {
3358       this_rd = handle_inter_mode(cpi, x, bsize,
3359                                   tx_cache,
3360                                   &rate2, &distortion2, &skippable,
3361                                   &rate_y, &distortion_y,
3362                                   &rate_uv, &distortion_uv,
3363                                   &mode_excluded, &disable_skip,
3364                                   &tmp_best_filter, frame_mv,
3365                                   mi_row, mi_col,
3366                                   single_newmv, &total_sse, best_rd);
3367       if (this_rd == INT64_MAX)
3368         continue;
3369
3370       compmode_cost = vp9_cost_bit(comp_mode_p, comp_pred);
3371
3372       if (cm->reference_mode == REFERENCE_MODE_SELECT)
3373         rate2 += compmode_cost;
3374     }
3375
3376     // Estimate the reference frame signaling cost and add it
3377     // to the rolling cost variable.
3378     if (comp_pred) {
3379       rate2 += ref_costs_comp[ref_frame];
3380     } else {
3381       rate2 += ref_costs_single[ref_frame];
3382     }
3383
3384     if (!disable_skip) {
3385       // Test for the condition where skip block will be activated
3386       // because there are no non zero coefficients and make any
3387       // necessary adjustment for rate. Ignore if skip is coded at
3388       // segment level as the cost wont have been added in.
3389       // Is Mb level skip allowed (i.e. not coded at segment level).
3390       const int mb_skip_allowed = !vp9_segfeature_active(seg, segment_id,
3391                                                          SEG_LVL_SKIP);
3392
3393       if (skippable) {
3394         // Back out the coefficient coding costs
3395         rate2 -= (rate_y + rate_uv);
3396         // for best yrd calculation
3397         rate_uv = 0;
3398
3399         if (mb_skip_allowed) {
3400           int prob_skip_cost;
3401
3402           // Cost the skip mb case
3403           vp9_prob skip_prob = vp9_get_skip_prob(cm, xd);
3404           if (skip_prob) {
3405             prob_skip_cost = vp9_cost_bit(skip_prob, 1);
3406             rate2 += prob_skip_cost;
3407           }
3408         }
3409       } else if (mb_skip_allowed && ref_frame != INTRA_FRAME && !xd->lossless) {
3410         if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
3411             RDCOST(x->rdmult, x->rddiv, 0, total_sse)) {
3412           // Add in the cost of the no skip flag.
3413           rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
3414         } else {
3415           // FIXME(rbultje) make this work for splitmv also
3416           rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
3417           distortion2 = total_sse;
3418           assert(total_sse >= 0);
3419           rate2 -= (rate_y + rate_uv);
3420           rate_y = 0;
3421           rate_uv = 0;
3422           this_skip2 = 1;
3423         }
3424       } else if (mb_skip_allowed) {
3425         // Add in the cost of the no skip flag.
3426         rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
3427       }
3428
3429       // Calculate the final RD estimate for this mode.
3430       this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
3431     }
3432
3433     if (ref_frame == INTRA_FRAME) {
3434     // Keep record of best intra rd
3435       if (this_rd < best_intra_rd) {
3436         best_intra_rd = this_rd;
3437         best_intra_mode = mbmi->mode;
3438       }
3439     } else {
3440       // Keep record of best inter rd with single reference
3441       if (!comp_pred && !mode_excluded && this_rd < best_inter_rd) {
3442         best_inter_rd = this_rd;
3443         best_inter_ref_frame = ref_frame;
3444       }
3445     }
3446
3447     if (!disable_skip && ref_frame == INTRA_FRAME) {
3448       for (i = 0; i < REFERENCE_MODES; ++i)
3449         best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
3450       for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
3451         best_filter_rd[i] = MIN(best_filter_rd[i], this_rd);
3452     }
3453
3454     // Store the respective mode distortions for later use.
3455     if (mode_distortions[this_mode] == -1
3456         || distortion2 < mode_distortions[this_mode]) {
3457       mode_distortions[this_mode] = distortion2;
3458     }
3459
3460     // Did this mode help.. i.e. is it the new best mode
3461     if (this_rd < best_rd || x->skip) {
3462       int max_plane = MAX_MB_PLANE;
3463       if (!mode_excluded) {
3464         // Note index of best mode so far
3465         best_mode_index = mode_index;
3466
3467         if (ref_frame == INTRA_FRAME) {
3468           /* required for left and above block mv */
3469           mbmi->mv[0].as_int = 0;
3470           max_plane = 1;
3471         }
3472
3473         *returnrate = rate2;
3474         *returndistortion = distortion2;
3475         best_rd = this_rd;
3476         best_mbmode = *mbmi;
3477         best_skip2 = this_skip2;
3478         if (!x->select_txfm_size)
3479           swap_block_ptr(x, ctx, max_plane);
3480         vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
3481                    sizeof(uint8_t) * ctx->num_4x4_blk);
3482
3483         // TODO(debargha): enhance this test with a better distortion prediction
3484         // based on qp, activity mask and history
3485         if ((mode_search_skip_flags & FLAG_EARLY_TERMINATE) &&
3486             (mode_index > MIN_EARLY_TERM_INDEX)) {
3487           const int qstep = xd->plane[0].dequant[1];
3488           // TODO(debargha): Enhance this by specializing for each mode_index
3489           int scale = 4;
3490           if (x->source_variance < UINT_MAX) {
3491             const int var_adjust = (x->source_variance < 16);
3492             scale -= var_adjust;
3493           }
3494           if (ref_frame > INTRA_FRAME &&
3495               distortion2 * scale < qstep * qstep) {
3496             early_term = 1;
3497           }
3498         }
3499       }
3500     }
3501
3502     /* keep record of best compound/single-only prediction */
3503     if (!disable_skip && ref_frame != INTRA_FRAME) {
3504       int64_t single_rd, hybrid_rd, single_rate, hybrid_rate;
3505
3506       if (cm->reference_mode == REFERENCE_MODE_SELECT) {
3507         single_rate = rate2 - compmode_cost;
3508         hybrid_rate = rate2;
3509       } else {
3510         single_rate = rate2;
3511         hybrid_rate = rate2 + compmode_cost;
3512       }
3513
3514       single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
3515       hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
3516
3517       if (!comp_pred) {
3518         if (single_rd < best_pred_rd[SINGLE_REFERENCE]) {
3519           best_pred_rd[SINGLE_REFERENCE] = single_rd;
3520         }
3521       } else {
3522         if (single_rd < best_pred_rd[COMPOUND_REFERENCE]) {
3523           best_pred_rd[COMPOUND_REFERENCE] = single_rd;
3524         }
3525       }
3526       if (hybrid_rd < best_pred_rd[REFERENCE_MODE_SELECT])
3527         best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
3528
3529       /* keep record of best filter type */
3530       if (!mode_excluded && cm->interp_filter != BILINEAR) {
3531         int64_t ref = rd_opt->filter_cache[cm->interp_filter == SWITCHABLE ?
3532                               SWITCHABLE_FILTERS : cm->interp_filter];
3533
3534         for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
3535           int64_t adj_rd;
3536           if (ref == INT64_MAX)
3537             adj_rd = 0;
3538           else if (rd_opt->filter_cache[i] == INT64_MAX)
3539             // when early termination is triggered, the encoder does not have
3540             // access to the rate-distortion cost. it only knows that the cost
3541             // should be above the maximum valid value. hence it takes the known
3542             // maximum plus an arbitrary constant as the rate-distortion cost.
3543             adj_rd = rd_opt->mask_filter - ref + 10;
3544           else
3545             adj_rd = rd_opt->filter_cache[i] - ref;
3546
3547           adj_rd += this_rd;
3548           best_filter_rd[i] = MIN(best_filter_rd[i], adj_rd);
3549         }
3550       }
3551     }
3552
3553     /* keep record of best txfm size */
3554     if (bsize < BLOCK_32X32) {
3555       if (bsize < BLOCK_16X16)
3556         tx_cache[ALLOW_16X16] = tx_cache[ALLOW_8X8];
3557
3558       tx_cache[ALLOW_32X32] = tx_cache[ALLOW_16X16];
3559     }
3560     if (!mode_excluded && this_rd != INT64_MAX) {
3561       for (i = 0; i < TX_MODES && tx_cache[i] < INT64_MAX; i++) {
3562         int64_t adj_rd = INT64_MAX;
3563         adj_rd = this_rd + tx_cache[i] - tx_cache[cm->tx_mode];
3564
3565         if (adj_rd < best_tx_rd[i])
3566           best_tx_rd[i] = adj_rd;
3567       }
3568     }
3569
3570     if (early_term)
3571       break;
3572
3573     if (x->skip && !comp_pred)
3574       break;
3575   }
3576
3577   if (best_mode_index < 0 || best_rd >= best_rd_so_far)
3578     return INT64_MAX;
3579
3580   // If we used an estimate for the uv intra rd in the loop above...
3581   if (cpi->sf.use_uv_intra_rd_estimate) {
3582     // Do Intra UV best rd mode selection if best mode choice above was intra.
3583     if (vp9_mode_order[best_mode_index].ref_frame[0] == INTRA_FRAME) {
3584       TX_SIZE uv_tx_size;
3585       *mbmi = best_mbmode;
3586       uv_tx_size = get_uv_tx_size(mbmi);
3587       rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv_intra[uv_tx_size],
3588                               &rate_uv_tokenonly[uv_tx_size],
3589                               &dist_uv[uv_tx_size],
3590                               &skip_uv[uv_tx_size],
3591                               bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize,
3592                               uv_tx_size);
3593     }
3594   }
3595
3596   assert((cm->interp_filter == SWITCHABLE) ||
3597          (cm->interp_filter == best_mbmode.interp_filter) ||
3598          !is_inter_block(&best_mbmode));
3599
3600   update_rd_thresh_fact(cpi, bsize, best_mode_index);
3601
3602   // macroblock modes
3603   *mbmi = best_mbmode;
3604   x->skip |= best_skip2;
3605
3606   for (i = 0; i < REFERENCE_MODES; ++i) {
3607     if (best_pred_rd[i] == INT64_MAX)
3608       best_pred_diff[i] = INT_MIN;
3609     else
3610       best_pred_diff[i] = best_rd - best_pred_rd[i];
3611   }
3612
3613   if (!x->skip) {
3614     for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
3615       if (best_filter_rd[i] == INT64_MAX)
3616         best_filter_diff[i] = 0;
3617       else
3618         best_filter_diff[i] = best_rd - best_filter_rd[i];
3619     }
3620     if (cm->interp_filter == SWITCHABLE)
3621       assert(best_filter_diff[SWITCHABLE_FILTERS] == 0);
3622     for (i = 0; i < TX_MODES; i++) {
3623       if (best_tx_rd[i] == INT64_MAX)
3624         best_tx_diff[i] = 0;
3625       else
3626         best_tx_diff[i] = best_rd - best_tx_rd[i];
3627     }
3628   } else {
3629     vp9_zero(best_filter_diff);
3630     vp9_zero(best_tx_diff);
3631   }
3632
3633   if (!x->in_active_map) {
3634     assert(mbmi->ref_frame[0] == LAST_FRAME);
3635     assert(mbmi->ref_frame[1] == NONE);
3636     assert(mbmi->mode == NEARESTMV ||
3637            mbmi->mode == NEARMV ||
3638            mbmi->mode == ZEROMV);
3639     assert(frame_mv[mbmi->mode][LAST_FRAME].as_int == 0);
3640     assert(mbmi->mode == mbmi->uv_mode);
3641   }
3642
3643   set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
3644   store_coding_context(x, ctx, best_mode_index,
3645                        &mbmi->ref_mvs[mbmi->ref_frame[0]][0],
3646                        &mbmi->ref_mvs[mbmi->ref_frame[1] < 0 ? 0 :
3647                                       mbmi->ref_frame[1]][0],
3648                        best_pred_diff, best_tx_diff, best_filter_diff);
3649
3650   return best_rd;
3651 }
3652
3653
3654 int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
3655                                       const TileInfo *const tile,
3656                                       int mi_row, int mi_col,
3657                                       int *returnrate,
3658                                       int64_t *returndistortion,
3659                                       BLOCK_SIZE bsize,
3660                                       PICK_MODE_CONTEXT *ctx,
3661                                       int64_t best_rd_so_far) {
3662   VP9_COMMON *const cm = &cpi->common;
3663   RD_OPT *const rd_opt = &cpi->rd;
3664   MACROBLOCKD *const xd = &x->e_mbd;
3665   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
3666   const struct segmentation *const seg = &cm->seg;
3667   MV_REFERENCE_FRAME ref_frame, second_ref_frame;
3668   unsigned char segment_id = mbmi->segment_id;
3669   int comp_pred, i;
3670   int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
3671   struct buf_2d yv12_mb[4][MAX_MB_PLANE];
3672   static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
3673                                     VP9_ALT_FLAG };
3674   int64_t best_rd = best_rd_so_far;
3675   int64_t best_yrd = best_rd_so_far;  // FIXME(rbultje) more precise
3676   static const int64_t best_tx_diff[TX_MODES] = { 0 };
3677   int64_t best_pred_diff[REFERENCE_MODES];
3678   int64_t best_pred_rd[REFERENCE_MODES];
3679   int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
3680   int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
3681   MB_MODE_INFO best_mbmode = { 0 };
3682   int ref_index, best_ref_index = 0;
3683   unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
3684   vp9_prob comp_mode_p;
3685   int64_t best_inter_rd = INT64_MAX;
3686   MV_REFERENCE_FRAME best_inter_ref_frame = LAST_FRAME;
3687   INTERP_FILTER tmp_best_filter = SWITCHABLE;
3688   int rate_uv_intra, rate_uv_tokenonly;
3689   int64_t dist_uv;
3690   int skip_uv;
3691   PREDICTION_MODE mode_uv = DC_PRED;
3692   int intra_cost_penalty = 20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q);
3693   int_mv seg_mvs[4][MAX_REF_FRAMES];
3694   b_mode_info best_bmodes[4];
3695   int best_skip2 = 0;
3696   int ref_frame_mask = 0;
3697   int mode_skip_mask = 0;
3698
3699   x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
3700   vpx_memset(x->zcoeff_blk[TX_4X4], 0, 4);
3701
3702   for (i = 0; i < 4; i++) {
3703     int j;
3704     for (j = 0; j < MAX_REF_FRAMES; j++)
3705       seg_mvs[i][j].as_int = INVALID_MV;
3706   }
3707
3708   estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp,
3709                            &comp_mode_p);
3710
3711   for (i = 0; i < REFERENCE_MODES; ++i)
3712     best_pred_rd[i] = INT64_MAX;
3713   for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
3714     best_filter_rd[i] = INT64_MAX;
3715   rate_uv_intra = INT_MAX;
3716
3717   *returnrate = INT_MAX;
3718
3719   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
3720     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
3721       vp9_setup_buffer_inter(cpi, x, tile,
3722                              ref_frame, bsize, mi_row, mi_col,
3723                              frame_mv[NEARESTMV], frame_mv[NEARMV],
3724                              yv12_mb);
3725     }
3726     frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
3727     frame_mv[ZEROMV][ref_frame].as_int = 0;
3728   }
3729
3730   for (ref_frame = LAST_FRAME;
3731        ref_frame <= ALTREF_FRAME && cpi->sf.reference_masking; ++ref_frame) {
3732     int i;
3733     for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
3734       if ((x->pred_mv_sad[ref_frame] >> 1) > x->pred_mv_sad[i]) {
3735         ref_frame_mask |= (1 << ref_frame);
3736         break;
3737       }
3738     }
3739   }
3740
3741   for (ref_index = 0; ref_index < MAX_REFS; ++ref_index) {
3742     int mode_excluded = 0;
3743     int64_t this_rd = INT64_MAX;
3744     int disable_skip = 0;
3745     int compmode_cost = 0;
3746     int rate2 = 0, rate_y = 0, rate_uv = 0;
3747     int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0;
3748     int skippable = 0;
3749     int i;
3750     int this_skip2 = 0;
3751     int64_t total_sse = INT_MAX;
3752     int early_term = 0;
3753
3754     ref_frame = vp9_ref_order[ref_index].ref_frame[0];
3755     second_ref_frame = vp9_ref_order[ref_index].ref_frame[1];
3756
3757     // Look at the reference frame of the best mode so far and set the
3758     // skip mask to look at a subset of the remaining modes.
3759     if (ref_index > 2 && cpi->sf.mode_skip_start < MAX_MODES) {
3760       if (ref_index == 3) {
3761         switch (vp9_ref_order[best_ref_index].ref_frame[0]) {
3762           case INTRA_FRAME:
3763             mode_skip_mask = 0;
3764             break;
3765           case LAST_FRAME:
3766             mode_skip_mask = 0x0010;
3767             break;
3768           case GOLDEN_FRAME:
3769             mode_skip_mask = 0x0008;
3770             break;
3771           case ALTREF_FRAME:
3772             mode_skip_mask = 0x0000;
3773             break;
3774           case NONE:
3775           case MAX_REF_FRAMES:
3776             assert(0 && "Invalid Reference frame");
3777         }
3778       }
3779       if (mode_skip_mask & (1 << ref_index))
3780         continue;
3781     }
3782
3783     // Test best rd so far against threshold for trying this mode.
3784     if (rd_less_than_thresh(best_rd,
3785                             rd_opt->threshes[segment_id][bsize][ref_index],
3786                             rd_opt->thresh_freq_fact[bsize][ref_index]))
3787       continue;
3788
3789     if (ref_frame > INTRA_FRAME &&
3790         !(cpi->ref_frame_flags & flag_list[ref_frame])) {
3791       continue;
3792     }
3793
3794     comp_pred = second_ref_frame > INTRA_FRAME;
3795     if (comp_pred) {
3796       if (!(cpi->ref_frame_flags & flag_list[second_ref_frame]))
3797         continue;
3798       // Do not allow compound prediction if the segment level reference frame
3799       // feature is in use as in this case there can only be one reference.
3800       if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
3801         continue;
3802       if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) &&
3803           vp9_ref_order[best_ref_index].ref_frame[0] == INTRA_FRAME)
3804         continue;
3805       if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_REFMISMATCH) &&
3806           ref_frame != best_inter_ref_frame &&
3807           second_ref_frame != best_inter_ref_frame)
3808         continue;
3809     }
3810
3811     // TODO(jingning, jkoleszar): scaling reference frame not supported for
3812     // sub8x8 blocks.
3813     if (ref_frame > INTRA_FRAME &&
3814         vp9_is_scaled(&cm->frame_refs[ref_frame - 1].sf))
3815       continue;
3816
3817     if (second_ref_frame > INTRA_FRAME &&
3818         vp9_is_scaled(&cm->frame_refs[second_ref_frame - 1].sf))
3819       continue;
3820
3821     if (comp_pred) {
3822       mode_excluded = mode_excluded ? mode_excluded
3823                                     : cm->reference_mode == SINGLE_REFERENCE;
3824     } else if (ref_frame != INTRA_FRAME) {
3825       mode_excluded = mode_excluded ? mode_excluded
3826                                     : cm->reference_mode == COMPOUND_REFERENCE;
3827     }
3828
3829     // If the segment reference frame feature is enabled....
3830     // then do nothing if the current ref frame is not allowed..
3831     if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
3832         vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) !=
3833             (int)ref_frame) {
3834       continue;
3835     // If the segment skip feature is enabled....
3836     // then do nothing if the current mode is not allowed..
3837     } else if (vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) &&
3838                ref_frame != INTRA_FRAME) {
3839       continue;
3840     // Disable this drop out case if the ref frame
3841     // segment level feature is enabled for this segment. This is to
3842     // prevent the possibility that we end up unable to pick any mode.
3843     } else if (!vp9_segfeature_active(seg, segment_id,
3844                                       SEG_LVL_REF_FRAME)) {
3845       // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
3846       // unless ARNR filtering is enabled in which case we want
3847       // an unfiltered alternative. We allow near/nearest as well
3848       // because they may result in zero-zero MVs but be cheaper.
3849       if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0))
3850         continue;
3851     }
3852
3853     mbmi->tx_size = TX_4X4;
3854     mbmi->uv_mode = DC_PRED;
3855     mbmi->ref_frame[0] = ref_frame;
3856     mbmi->ref_frame[1] = second_ref_frame;
3857     // Evaluate all sub-pel filters irrespective of whether we can use
3858     // them for this frame.
3859     mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP
3860                                                           : cm->interp_filter;
3861     x->skip = 0;
3862     set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
3863
3864     // Select prediction reference frames.
3865     for (i = 0; i < MAX_MB_PLANE; i++) {
3866       xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
3867       if (comp_pred)
3868         xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
3869     }
3870
3871     if (ref_frame == INTRA_FRAME) {
3872       int rate;
3873       if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate, &rate_y,
3874                                        &distortion_y, best_rd) >= best_rd)
3875         continue;
3876       rate2 += rate;
3877       rate2 += intra_cost_penalty;
3878       distortion2 += distortion_y;
3879
3880       if (rate_uv_intra == INT_MAX) {
3881         choose_intra_uv_mode(cpi, ctx, bsize, TX_4X4,
3882                              &rate_uv_intra,
3883                              &rate_uv_tokenonly,
3884                              &dist_uv, &skip_uv,
3885                              &mode_uv);
3886       }
3887       rate2 += rate_uv_intra;
3888       rate_uv = rate_uv_tokenonly;
3889       distortion2 += dist_uv;
3890       distortion_uv = dist_uv;
3891       mbmi->uv_mode = mode_uv;
3892     } else {
3893       int rate;
3894       int64_t distortion;
3895       int64_t this_rd_thresh;
3896       int64_t tmp_rd, tmp_best_rd = INT64_MAX, tmp_best_rdu = INT64_MAX;
3897       int tmp_best_rate = INT_MAX, tmp_best_ratey = INT_MAX;
3898       int64_t tmp_best_distortion = INT_MAX, tmp_best_sse, uv_sse;
3899       int tmp_best_skippable = 0;
3900       int switchable_filter_index;
3901       int_mv *second_ref = comp_pred ?
3902                              &mbmi->ref_mvs[second_ref_frame][0] : NULL;
3903       b_mode_info tmp_best_bmodes[16];
3904       MB_MODE_INFO tmp_best_mbmode;
3905       BEST_SEG_INFO bsi[SWITCHABLE_FILTERS];
3906       int pred_exists = 0;
3907       int uv_skippable;
3908
3909       this_rd_thresh = (ref_frame == LAST_FRAME) ?
3910           rd_opt->threshes[segment_id][bsize][THR_LAST] :
3911           rd_opt->threshes[segment_id][bsize][THR_ALTR];
3912       this_rd_thresh = (ref_frame == GOLDEN_FRAME) ?
3913       rd_opt->threshes[segment_id][bsize][THR_GOLD] : this_rd_thresh;
3914       rd_opt->mask_filter = 0;
3915       for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
3916         rd_opt->filter_cache[i] = INT64_MAX;
3917
3918       if (cm->interp_filter != BILINEAR) {
3919         tmp_best_filter = EIGHTTAP;
3920         if (x->source_variance < cpi->sf.disable_filter_search_var_thresh) {
3921           tmp_best_filter = EIGHTTAP;
3922         } else if (cpi->sf.adaptive_pred_interp_filter == 1 &&
3923                    ctx->pred_interp_filter < SWITCHABLE) {
3924           tmp_best_filter = ctx->pred_interp_filter;
3925         } else if (cpi->sf.adaptive_pred_interp_filter == 2) {
3926           tmp_best_filter = ctx->pred_interp_filter < SWITCHABLE ?
3927                               ctx->pred_interp_filter : 0;
3928         } else {
3929           for (switchable_filter_index = 0;
3930                switchable_filter_index < SWITCHABLE_FILTERS;
3931                ++switchable_filter_index) {
3932             int newbest, rs;
3933             int64_t rs_rd;
3934             mbmi->interp_filter = switchable_filter_index;
3935             tmp_rd = rd_pick_best_sub8x8_mode(cpi, x, tile,
3936                                               &mbmi->ref_mvs[ref_frame][0],
3937                                               second_ref, best_yrd, &rate,
3938                                               &rate_y, &distortion,
3939                                               &skippable, &total_sse,
3940                                               (int) this_rd_thresh, seg_mvs,
3941                                               bsi, switchable_filter_index,
3942                                               mi_row, mi_col);
3943
3944             if (tmp_rd == INT64_MAX)
3945               continue;
3946             rs = vp9_get_switchable_rate(cpi);
3947             rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
3948             rd_opt->filter_cache[switchable_filter_index] = tmp_rd;
3949             rd_opt->filter_cache[SWITCHABLE_FILTERS] =
3950                 MIN(rd_opt->filter_cache[SWITCHABLE_FILTERS],
3951                     tmp_rd + rs_rd);
3952             if (cm->interp_filter == SWITCHABLE)
3953               tmp_rd += rs_rd;
3954
3955             rd_opt->mask_filter = MAX(rd_opt->mask_filter, tmp_rd);
3956
3957             newbest = (tmp_rd < tmp_best_rd);
3958             if (newbest) {
3959               tmp_best_filter = mbmi->interp_filter;
3960               tmp_best_rd = tmp_rd;
3961             }
3962             if ((newbest && cm->interp_filter == SWITCHABLE) ||
3963                 (mbmi->interp_filter == cm->interp_filter &&
3964                  cm->interp_filter != SWITCHABLE)) {
3965               tmp_best_rdu = tmp_rd;
3966               tmp_best_rate = rate;
3967               tmp_best_ratey = rate_y;
3968               tmp_best_distortion = distortion;
3969               tmp_best_sse = total_sse;
3970               tmp_best_skippable = skippable;
3971               tmp_best_mbmode = *mbmi;
3972               for (i = 0; i < 4; i++) {
3973                 tmp_best_bmodes[i] = xd->mi[0]->bmi[i];
3974                 x->zcoeff_blk[TX_4X4][i] = !x->plane[0].eobs[i];
3975               }
3976               pred_exists = 1;
3977               if (switchable_filter_index == 0 &&
3978                   cpi->sf.use_rd_breakout &&
3979                   best_rd < INT64_MAX) {
3980                 if (tmp_best_rdu / 2 > best_rd) {
3981                   // skip searching the other filters if the first is
3982                   // already substantially larger than the best so far
3983                   tmp_best_filter = mbmi->interp_filter;
3984                   tmp_best_rdu = INT64_MAX;
3985                   break;
3986                 }
3987               }
3988             }
3989           }  // switchable_filter_index loop
3990         }
3991       }
3992
3993       if (tmp_best_rdu == INT64_MAX && pred_exists)
3994         continue;
3995
3996       mbmi->interp_filter = (cm->interp_filter == SWITCHABLE ?
3997                              tmp_best_filter : cm->interp_filter);
3998       if (!pred_exists) {
3999         // Handles the special case when a filter that is not in the
4000         // switchable list (bilinear, 6-tap) is indicated at the frame level
4001         tmp_rd = rd_pick_best_sub8x8_mode(cpi, x, tile,
4002                                           &mbmi->ref_mvs[ref_frame][0],
4003                                           second_ref, best_yrd, &rate, &rate_y,
4004                                           &distortion, &skippable, &total_sse,
4005                                           (int) this_rd_thresh, seg_mvs, bsi, 0,
4006                                           mi_row, mi_col);
4007         if (tmp_rd == INT64_MAX)
4008           continue;
4009       } else {
4010         total_sse = tmp_best_sse;
4011         rate = tmp_best_rate;
4012         rate_y = tmp_best_ratey;
4013         distortion = tmp_best_distortion;
4014         skippable = tmp_best_skippable;
4015         *mbmi = tmp_best_mbmode;
4016         for (i = 0; i < 4; i++)
4017           xd->mi[0]->bmi[i] = tmp_best_bmodes[i];
4018       }
4019
4020       rate2 += rate;
4021       distortion2 += distortion;
4022
4023       if (cm->interp_filter == SWITCHABLE)
4024         rate2 += vp9_get_switchable_rate(cpi);
4025
4026       if (!mode_excluded)
4027         mode_excluded = comp_pred ? cm->reference_mode == SINGLE_REFERENCE
4028                                   : cm->reference_mode == COMPOUND_REFERENCE;
4029
4030       compmode_cost = vp9_cost_bit(comp_mode_p, comp_pred);
4031
4032       tmp_best_rdu = best_rd -
4033           MIN(RDCOST(x->rdmult, x->rddiv, rate2, distortion2),
4034               RDCOST(x->rdmult, x->rddiv, 0, total_sse));
4035
4036       if (tmp_best_rdu > 0) {
4037         // If even the 'Y' rd value of split is higher than best so far
4038         // then dont bother looking at UV
4039         vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col,
4040                                         BLOCK_8X8);
4041         super_block_uvrd(cpi, x, &rate_uv, &distortion_uv, &uv_skippable,
4042                          &uv_sse, BLOCK_8X8, tmp_best_rdu);
4043         if (rate_uv == INT_MAX)
4044           continue;
4045         rate2 += rate_uv;
4046         distortion2 += distortion_uv;
4047         skippable = skippable && uv_skippable;
4048         total_sse += uv_sse;
4049       }
4050     }
4051
4052     if (cm->reference_mode == REFERENCE_MODE_SELECT)
4053       rate2 += compmode_cost;
4054
4055     // Estimate the reference frame signaling cost and add it
4056     // to the rolling cost variable.
4057     if (second_ref_frame > INTRA_FRAME) {
4058       rate2 += ref_costs_comp[ref_frame];
4059     } else {
4060       rate2 += ref_costs_single[ref_frame];
4061     }
4062
4063     if (!disable_skip) {
4064       // Test for the condition where skip block will be activated
4065       // because there are no non zero coefficients and make any
4066       // necessary adjustment for rate. Ignore if skip is coded at
4067       // segment level as the cost wont have been added in.
4068       // Is Mb level skip allowed (i.e. not coded at segment level).
4069       const int mb_skip_allowed = !vp9_segfeature_active(seg, segment_id,
4070                                                          SEG_LVL_SKIP);
4071
4072       if (mb_skip_allowed && ref_frame != INTRA_FRAME && !xd->lossless) {
4073         if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
4074             RDCOST(x->rdmult, x->rddiv, 0, total_sse)) {
4075           // Add in the cost of the no skip flag.
4076           rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
4077         } else {
4078           // FIXME(rbultje) make this work for splitmv also
4079           rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
4080           distortion2 = total_sse;
4081           assert(total_sse >= 0);
4082           rate2 -= (rate_y + rate_uv);
4083           rate_y = 0;
4084           rate_uv = 0;
4085           this_skip2 = 1;
4086         }
4087       } else if (mb_skip_allowed) {
4088         // Add in the cost of the no skip flag.
4089         rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
4090       }
4091
4092       // Calculate the final RD estimate for this mode.
4093       this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
4094     }
4095
4096     // Keep record of best inter rd with single reference
4097     if (is_inter_block(mbmi) &&
4098         !has_second_ref(mbmi) &&
4099         !mode_excluded &&
4100         this_rd < best_inter_rd) {
4101       best_inter_rd = this_rd;
4102       best_inter_ref_frame = ref_frame;
4103     }
4104
4105     if (!disable_skip && ref_frame == INTRA_FRAME) {
4106       for (i = 0; i < REFERENCE_MODES; ++i)
4107         best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
4108       for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
4109         best_filter_rd[i] = MIN(best_filter_rd[i], this_rd);
4110     }
4111
4112     // Did this mode help.. i.e. is it the new best mode
4113     if (this_rd < best_rd || x->skip) {
4114       if (!mode_excluded) {
4115         int max_plane = MAX_MB_PLANE;
4116         // Note index of best mode so far
4117         best_ref_index = ref_index;
4118
4119         if (ref_frame == INTRA_FRAME) {
4120           /* required for left and above block mv */
4121           mbmi->mv[0].as_int = 0;
4122           max_plane = 1;
4123         }
4124
4125         *returnrate = rate2;
4126         *returndistortion = distortion2;
4127         best_rd = this_rd;
4128         best_yrd = best_rd -
4129                    RDCOST(x->rdmult, x->rddiv, rate_uv, distortion_uv);
4130         best_mbmode = *mbmi;
4131         best_skip2 = this_skip2;
4132         if (!x->select_txfm_size)
4133           swap_block_ptr(x, ctx, max_plane);
4134         vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[TX_4X4],
4135                    sizeof(uint8_t) * ctx->num_4x4_blk);
4136
4137         for (i = 0; i < 4; i++)
4138           best_bmodes[i] = xd->mi[0]->bmi[i];
4139
4140         // TODO(debargha): enhance this test with a better distortion prediction
4141         // based on qp, activity mask and history
4142         if ((cpi->sf.mode_search_skip_flags & FLAG_EARLY_TERMINATE) &&
4143             (ref_index > MIN_EARLY_TERM_INDEX)) {
4144           const int qstep = xd->plane[0].dequant[1];
4145           // TODO(debargha): Enhance this by specializing for each mode_index
4146           int scale = 4;
4147           if (x->source_variance < UINT_MAX) {
4148             const int var_adjust = (x->source_variance < 16);
4149             scale -= var_adjust;
4150           }
4151           if (ref_frame > INTRA_FRAME &&
4152               distortion2 * scale < qstep * qstep) {
4153             early_term = 1;
4154           }
4155         }
4156       }
4157     }
4158
4159     /* keep record of best compound/single-only prediction */
4160     if (!disable_skip && ref_frame != INTRA_FRAME) {
4161       int64_t single_rd, hybrid_rd, single_rate, hybrid_rate;
4162
4163       if (cm->reference_mode == REFERENCE_MODE_SELECT) {
4164         single_rate = rate2 - compmode_cost;
4165         hybrid_rate = rate2;
4166       } else {
4167         single_rate = rate2;
4168         hybrid_rate = rate2 + compmode_cost;
4169       }
4170
4171       single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
4172       hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
4173
4174       if (!comp_pred && single_rd < best_pred_rd[SINGLE_REFERENCE]) {
4175         best_pred_rd[SINGLE_REFERENCE] = single_rd;
4176       } else if (comp_pred && single_rd < best_pred_rd[COMPOUND_REFERENCE]) {
4177         best_pred_rd[COMPOUND_REFERENCE] = single_rd;
4178       }
4179       if (hybrid_rd < best_pred_rd[REFERENCE_MODE_SELECT])
4180         best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
4181     }
4182
4183     /* keep record of best filter type */
4184     if (!mode_excluded && !disable_skip && ref_frame != INTRA_FRAME &&
4185         cm->interp_filter != BILINEAR) {
4186       int64_t ref = rd_opt->filter_cache[cm->interp_filter == SWITCHABLE ?
4187                               SWITCHABLE_FILTERS : cm->interp_filter];
4188       int64_t adj_rd;
4189       for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
4190         if (ref == INT64_MAX)
4191           adj_rd = 0;
4192         else if (rd_opt->filter_cache[i] == INT64_MAX)
4193           // when early termination is triggered, the encoder does not have
4194           // access to the rate-distortion cost. it only knows that the cost
4195           // should be above the maximum valid value. hence it takes the known
4196           // maximum plus an arbitrary constant as the rate-distortion cost.
4197           adj_rd = rd_opt->mask_filter - ref + 10;
4198         else
4199           adj_rd = rd_opt->filter_cache[i] - ref;
4200
4201         adj_rd += this_rd;
4202         best_filter_rd[i] = MIN(best_filter_rd[i], adj_rd);
4203       }
4204     }
4205
4206     if (early_term)
4207       break;
4208
4209     if (x->skip && !comp_pred)
4210       break;
4211   }
4212
4213   if (best_rd >= best_rd_so_far)
4214     return INT64_MAX;
4215
4216   // If we used an estimate for the uv intra rd in the loop above...
4217   if (cpi->sf.use_uv_intra_rd_estimate) {
4218     // Do Intra UV best rd mode selection if best mode choice above was intra.
4219     if (vp9_ref_order[best_ref_index].ref_frame[0] == INTRA_FRAME) {
4220       *mbmi = best_mbmode;
4221       rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv_intra,
4222                               &rate_uv_tokenonly,
4223                               &dist_uv,
4224                               &skip_uv,
4225                               BLOCK_8X8, TX_4X4);
4226     }
4227   }
4228
4229   if (best_rd == INT64_MAX) {
4230     *returnrate = INT_MAX;
4231     *returndistortion = INT64_MAX;
4232     return best_rd;
4233   }
4234
4235   assert((cm->interp_filter == SWITCHABLE) ||
4236          (cm->interp_filter == best_mbmode.interp_filter) ||
4237          !is_inter_block(&best_mbmode));
4238
4239   update_rd_thresh_fact(cpi, bsize, best_ref_index);
4240
4241   // macroblock modes
4242   *mbmi = best_mbmode;
4243   x->skip |= best_skip2;
4244   if (!is_inter_block(&best_mbmode)) {
4245     for (i = 0; i < 4; i++)
4246       xd->mi[0]->bmi[i].as_mode = best_bmodes[i].as_mode;
4247   } else {
4248     for (i = 0; i < 4; ++i)
4249       vpx_memcpy(&xd->mi[0]->bmi[i], &best_bmodes[i], sizeof(b_mode_info));
4250
4251     mbmi->mv[0].as_int = xd->mi[0]->bmi[3].as_mv[0].as_int;
4252     mbmi->mv[1].as_int = xd->mi[0]->bmi[3].as_mv[1].as_int;
4253   }
4254
4255   for (i = 0; i < REFERENCE_MODES; ++i) {
4256     if (best_pred_rd[i] == INT64_MAX)
4257       best_pred_diff[i] = INT_MIN;
4258     else
4259       best_pred_diff[i] = best_rd - best_pred_rd[i];
4260   }
4261
4262   if (!x->skip) {
4263     for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
4264       if (best_filter_rd[i] == INT64_MAX)
4265         best_filter_diff[i] = 0;
4266       else
4267         best_filter_diff[i] = best_rd - best_filter_rd[i];
4268     }
4269     if (cm->interp_filter == SWITCHABLE)
4270       assert(best_filter_diff[SWITCHABLE_FILTERS] == 0);
4271   } else {
4272     vp9_zero(best_filter_diff);
4273   }
4274
4275   set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
4276   store_coding_context(x, ctx, best_ref_index,
4277                        &mbmi->ref_mvs[mbmi->ref_frame[0]][0],
4278                        &mbmi->ref_mvs[mbmi->ref_frame[1] < 0 ? 0 :
4279                                       mbmi->ref_frame[1]][0],
4280                        best_pred_diff, best_tx_diff, best_filter_diff);
4281
4282   return best_rd;
4283 }
4284
4285 void vp9_set_rd_speed_thresholds(VP9_COMP *cpi) {
4286   int i;
4287   RD_OPT *const rd = &cpi->rd;
4288
4289   // Set baseline threshold values
4290   for (i = 0; i < MAX_MODES; ++i)
4291     rd->thresh_mult[i] = is_best_mode(cpi->oxcf.mode) ? -500 : 0;
4292
4293   rd->thresh_mult[THR_NEARESTMV] = 0;
4294   rd->thresh_mult[THR_NEARESTG] = 0;
4295   rd->thresh_mult[THR_NEARESTA] = 0;
4296
4297   rd->thresh_mult[THR_DC] += 1000;
4298
4299   rd->thresh_mult[THR_NEWMV] += 1000;
4300   rd->thresh_mult[THR_NEWA] += 1000;
4301   rd->thresh_mult[THR_NEWG] += 1000;
4302
4303   rd->thresh_mult[THR_NEARMV] += 1000;
4304   rd->thresh_mult[THR_NEARA] += 1000;
4305   rd->thresh_mult[THR_COMP_NEARESTLA] += 1000;
4306   rd->thresh_mult[THR_COMP_NEARESTGA] += 1000;
4307
4308   rd->thresh_mult[THR_TM] += 1000;
4309
4310   rd->thresh_mult[THR_COMP_NEARLA] += 1500;
4311   rd->thresh_mult[THR_COMP_NEWLA] += 2000;
4312   rd->thresh_mult[THR_NEARG] += 1000;
4313   rd->thresh_mult[THR_COMP_NEARGA] += 1500;
4314   rd->thresh_mult[THR_COMP_NEWGA] += 2000;
4315
4316   rd->thresh_mult[THR_ZEROMV] += 2000;
4317   rd->thresh_mult[THR_ZEROG] += 2000;
4318   rd->thresh_mult[THR_ZEROA] += 2000;
4319   rd->thresh_mult[THR_COMP_ZEROLA] += 2500;
4320   rd->thresh_mult[THR_COMP_ZEROGA] += 2500;
4321
4322   rd->thresh_mult[THR_H_PRED] += 2000;
4323   rd->thresh_mult[THR_V_PRED] += 2000;
4324   rd->thresh_mult[THR_D45_PRED ] += 2500;
4325   rd->thresh_mult[THR_D135_PRED] += 2500;
4326   rd->thresh_mult[THR_D117_PRED] += 2500;
4327   rd->thresh_mult[THR_D153_PRED] += 2500;
4328   rd->thresh_mult[THR_D207_PRED] += 2500;
4329   rd->thresh_mult[THR_D63_PRED] += 2500;
4330
4331   /* disable frame modes if flags not set */
4332   if (!(cpi->ref_frame_flags & VP9_LAST_FLAG)) {
4333     rd->thresh_mult[THR_NEWMV    ] = INT_MAX;
4334     rd->thresh_mult[THR_NEARESTMV] = INT_MAX;
4335     rd->thresh_mult[THR_ZEROMV   ] = INT_MAX;
4336     rd->thresh_mult[THR_NEARMV   ] = INT_MAX;
4337   }
4338   if (!(cpi->ref_frame_flags & VP9_GOLD_FLAG)) {
4339     rd->thresh_mult[THR_NEARESTG ] = INT_MAX;
4340     rd->thresh_mult[THR_ZEROG    ] = INT_MAX;
4341     rd->thresh_mult[THR_NEARG    ] = INT_MAX;
4342     rd->thresh_mult[THR_NEWG     ] = INT_MAX;
4343   }
4344   if (!(cpi->ref_frame_flags & VP9_ALT_FLAG)) {
4345     rd->thresh_mult[THR_NEARESTA ] = INT_MAX;
4346     rd->thresh_mult[THR_ZEROA    ] = INT_MAX;
4347     rd->thresh_mult[THR_NEARA    ] = INT_MAX;
4348     rd->thresh_mult[THR_NEWA     ] = INT_MAX;
4349   }
4350
4351   if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_ALT_FLAG)) !=
4352       (VP9_LAST_FLAG | VP9_ALT_FLAG)) {
4353     rd->thresh_mult[THR_COMP_ZEROLA   ] = INT_MAX;
4354     rd->thresh_mult[THR_COMP_NEARESTLA] = INT_MAX;
4355     rd->thresh_mult[THR_COMP_NEARLA   ] = INT_MAX;
4356     rd->thresh_mult[THR_COMP_NEWLA    ] = INT_MAX;
4357   }
4358   if ((cpi->ref_frame_flags & (VP9_GOLD_FLAG | VP9_ALT_FLAG)) !=
4359       (VP9_GOLD_FLAG | VP9_ALT_FLAG)) {
4360     rd->thresh_mult[THR_COMP_ZEROGA   ] = INT_MAX;
4361     rd->thresh_mult[THR_COMP_NEARESTGA] = INT_MAX;
4362     rd->thresh_mult[THR_COMP_NEARGA   ] = INT_MAX;
4363     rd->thresh_mult[THR_COMP_NEWGA    ] = INT_MAX;
4364   }
4365 }
4366
4367 void vp9_set_rd_speed_thresholds_sub8x8(VP9_COMP *cpi) {
4368   const SPEED_FEATURES *const sf = &cpi->sf;
4369   RD_OPT *const rd = &cpi->rd;
4370   int i;
4371
4372   for (i = 0; i < MAX_REFS; ++i)
4373     rd->thresh_mult_sub8x8[i] = is_best_mode(cpi->oxcf.mode)  ? -500 : 0;
4374
4375   rd->thresh_mult_sub8x8[THR_LAST] += 2500;
4376   rd->thresh_mult_sub8x8[THR_GOLD] += 2500;
4377   rd->thresh_mult_sub8x8[THR_ALTR] += 2500;
4378   rd->thresh_mult_sub8x8[THR_INTRA] += 2500;
4379   rd->thresh_mult_sub8x8[THR_COMP_LA] += 4500;
4380   rd->thresh_mult_sub8x8[THR_COMP_GA] += 4500;
4381
4382   // Check for masked out split cases.
4383   for (i = 0; i < MAX_REFS; i++)
4384     if (sf->disable_split_mask & (1 << i))
4385       rd->thresh_mult_sub8x8[i] = INT_MAX;
4386
4387   // disable mode test if frame flag is not set
4388   if (!(cpi->ref_frame_flags & VP9_LAST_FLAG))
4389     rd->thresh_mult_sub8x8[THR_LAST] = INT_MAX;
4390   if (!(cpi->ref_frame_flags & VP9_GOLD_FLAG))
4391     rd->thresh_mult_sub8x8[THR_GOLD] = INT_MAX;
4392   if (!(cpi->ref_frame_flags & VP9_ALT_FLAG))
4393     rd->thresh_mult_sub8x8[THR_ALTR] = INT_MAX;
4394   if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_ALT_FLAG)) !=
4395       (VP9_LAST_FLAG | VP9_ALT_FLAG))
4396     rd->thresh_mult_sub8x8[THR_COMP_LA] = INT_MAX;
4397   if ((cpi->ref_frame_flags & (VP9_GOLD_FLAG | VP9_ALT_FLAG)) !=
4398       (VP9_GOLD_FLAG | VP9_ALT_FLAG))
4399     rd->thresh_mult_sub8x8[THR_COMP_GA] = INT_MAX;
4400 }