src/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rdopt.c

   1 /*
   2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
   3  *
   4  *  Use of this source code is governed by a BSD-style license
   5  *  that can be found in the LICENSE file in the root of the source
   6  *  tree. An additional intellectual property rights grant can be found
   7  *  in the file PATENTS.  All contributing project authors may
   8  *  be found in the AUTHORS file in the root of the source tree.
   9  */
  10
  11 #include <stdio.h>
  12 #include <math.h>
  13 #include <limits.h>
  14 #include <assert.h>
  15
  16 #include "vp9/common/vp9_pragmas.h"
  17 #include "vp9/encoder/vp9_tokenize.h"
  18 #include "vp9/encoder/vp9_treewriter.h"
  19 #include "vp9/encoder/vp9_onyx_int.h"
  20 #include "vp9/common/vp9_entropymode.h"
  21 #include "vp9/common/vp9_reconinter.h"
  22 #include "vp9/common/vp9_reconintra.h"
  23 #include "vp9/common/vp9_quant_common.h"
  24 #include "vp9/encoder/vp9_encodemb.h"
  25 #include "vp9/encoder/vp9_quantize.h"
  26 #include "vp9/encoder/vp9_variance.h"
  27 #include "vp9/encoder/vp9_mcomp.h"
  28 #include "vp9/encoder/vp9_rdopt.h"
  29 #include "vp9/encoder/vp9_ratectrl.h"
  30 #include "vpx_mem/vpx_mem.h"
  31 #include "vp9/common/vp9_systemdependent.h"
  32 #include "vp9/encoder/vp9_encodemv.h"
  33 #include "vp9/common/vp9_seg_common.h"
  34 #include "vp9/common/vp9_pred_common.h"
  35 #include "vp9/common/vp9_entropy.h"
  36 #include "./vp9_rtcd.h"
  37 #include "vp9/common/vp9_mvref_common.h"
  38 #include "vp9/common/vp9_common.h"
  39
  40 /* Factor to weigh the rate for switchable interp filters */
  41 #define SWITCHABLE_INTERP_RATE_FACTOR 1
  42
  43 #define LAST_FRAME_MODE_MASK    0xFFEDCD60
  44 #define GOLDEN_FRAME_MODE_MASK  0xFFDA3BB0
  45 #define ALT_REF_MODE_MASK       0xFFC648D0
  46
  47 #define MIN_EARLY_TERM_INDEX    3
  48
  49 typedef struct {
  50   MB_PREDICTION_MODE mode;
  51   MV_REFERENCE_FRAME ref_frame[2];
  52 } MODE_DEFINITION;
  53
  54 typedef struct {
  55   MV_REFERENCE_FRAME ref_frame[2];
  56 } REF_DEFINITION;
  57
  58 struct rdcost_block_args {
  59   MACROBLOCK *x;
  60   ENTROPY_CONTEXT t_above[16];
  61   ENTROPY_CONTEXT t_left[16];
  62   int rate;
  63   int64_t dist;
  64   int64_t sse;
  65   int this_rate;
  66   int64_t this_dist;
  67   int64_t this_sse;
  68   int64_t this_rd;
  69   int64_t best_rd;
  70   int skip;
  71   const scan_order *so;
  72 };
  73
  74 const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
  75   {NEARESTMV, {LAST_FRAME,   NONE}},
  76   {NEARESTMV, {ALTREF_FRAME, NONE}},
  77   {NEARESTMV, {GOLDEN_FRAME, NONE}},
  78
  79   {DC_PRED,   {INTRA_FRAME,  NONE}},
  80
  81   {NEWMV,     {LAST_FRAME,   NONE}},
  82   {NEWMV,     {ALTREF_FRAME, NONE}},
  83   {NEWMV,     {GOLDEN_FRAME, NONE}},
  84
  85   {NEARMV,    {LAST_FRAME,   NONE}},
  86   {NEARMV,    {ALTREF_FRAME, NONE}},
  87   {NEARESTMV, {LAST_FRAME,   ALTREF_FRAME}},
  88   {NEARESTMV, {GOLDEN_FRAME, ALTREF_FRAME}},
  89
  90   {TM_PRED,   {INTRA_FRAME,  NONE}},
  91
  92   {NEARMV,    {LAST_FRAME,   ALTREF_FRAME}},
  93   {NEWMV,     {LAST_FRAME,   ALTREF_FRAME}},
  94   {NEARMV,    {GOLDEN_FRAME, NONE}},
  95   {NEARMV,    {GOLDEN_FRAME, ALTREF_FRAME}},
  96   {NEWMV,     {GOLDEN_FRAME, ALTREF_FRAME}},
  97
  98   {ZEROMV,    {LAST_FRAME,   NONE}},
  99   {ZEROMV,    {GOLDEN_FRAME, NONE}},
 100   {ZEROMV,    {ALTREF_FRAME, NONE}},
 101   {ZEROMV,    {LAST_FRAME,   ALTREF_FRAME}},
 102   {ZEROMV,    {GOLDEN_FRAME, ALTREF_FRAME}},
 103
 104   {H_PRED,    {INTRA_FRAME,  NONE}},
 105   {V_PRED,    {INTRA_FRAME,  NONE}},
 106   {D135_PRED, {INTRA_FRAME,  NONE}},
 107   {D207_PRED, {INTRA_FRAME,  NONE}},
 108   {D153_PRED, {INTRA_FRAME,  NONE}},
 109   {D63_PRED,  {INTRA_FRAME,  NONE}},
 110   {D117_PRED, {INTRA_FRAME,  NONE}},
 111   {D45_PRED,  {INTRA_FRAME,  NONE}},
 112 };
 113
 114 const REF_DEFINITION vp9_ref_order[MAX_REFS] = {
 115   {{LAST_FRAME,   NONE}},
 116   {{GOLDEN_FRAME, NONE}},
 117   {{ALTREF_FRAME, NONE}},
 118   {{LAST_FRAME,   ALTREF_FRAME}},
 119   {{GOLDEN_FRAME, ALTREF_FRAME}},
 120   {{INTRA_FRAME,  NONE}},
 121 };
 122
 123 // The baseline rd thresholds for breaking out of the rd loop for
 124 // certain modes are assumed to be based on 8x8 blocks.
 125 // This table is used to correct for blocks size.
 126 // The factors here are << 2 (2 = x0.5, 32 = x8 etc).
 127 static int rd_thresh_block_size_factor[BLOCK_SIZES] =
 128   {2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32};
 129
 130 static int raster_block_offset(BLOCK_SIZE plane_bsize,
 131                                int raster_block, int stride) {
 132   const int bw = b_width_log2(plane_bsize);
 133   const int y = 4 * (raster_block >> bw);
 134   const int x = 4 * (raster_block & ((1 << bw) - 1));
 135   return y * stride + x;
 136 }
 137 static int16_t* raster_block_offset_int16(BLOCK_SIZE plane_bsize,
 138                                           int raster_block, int16_t *base) {
 139   const int stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
 140   return base + raster_block_offset(plane_bsize, raster_block, stride);
 141 }
 142
 143 static void fill_mode_costs(VP9_COMP *cpi) {
 144   VP9_COMMON *const cm = &cpi->common;
 145   MACROBLOCK *const x = &cpi->mb;
 146   FRAME_CONTEXT *const fc = &cm->fc;
 147   int i, j;
 148
 149   for (i = 0; i < INTRA_MODES; i++)
 150     for (j = 0; j < INTRA_MODES; j++)
 151       vp9_cost_tokens((int *)x->y_mode_costs[i][j], vp9_kf_y_mode_prob[i][j],
 152                       vp9_intra_mode_tree);
 153
 154   // TODO(rbultje) separate tables for superblock costing?
 155   vp9_cost_tokens(x->mbmode_cost, fc->y_mode_prob[1], vp9_intra_mode_tree);
 156   vp9_cost_tokens(x->intra_uv_mode_cost[1],
 157                   fc->uv_mode_prob[INTRA_MODES - 1], vp9_intra_mode_tree);
 158   vp9_cost_tokens(x->intra_uv_mode_cost[0],
 159                   vp9_kf_uv_mode_prob[INTRA_MODES - 1], vp9_intra_mode_tree);
 160
 161   for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
 162     vp9_cost_tokens((int *)x->switchable_interp_costs[i],
 163                     fc->switchable_interp_prob[i],
 164                     vp9_switchable_interp_tree);
 165 }
 166
 167 static void fill_token_costs(vp9_coeff_cost *c,
 168                              vp9_coeff_probs_model (*p)[PLANE_TYPES]) {
 169   int i, j, k, l;
 170   TX_SIZE t;
 171   for (t = TX_4X4; t <= TX_32X32; ++t)
 172     for (i = 0; i < PLANE_TYPES; ++i)
 173       for (j = 0; j < REF_TYPES; ++j)
 174         for (k = 0; k < COEF_BANDS; ++k)
 175           for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
 176             vp9_prob probs[ENTROPY_NODES];
 177             vp9_model_to_full_probs(p[t][i][j][k][l], probs);
 178             vp9_cost_tokens((int *)c[t][i][j][k][0][l], probs,
 179                             vp9_coef_tree);
 180             vp9_cost_tokens_skip((int *)c[t][i][j][k][1][l], probs,
 181                                  vp9_coef_tree);
 182             assert(c[t][i][j][k][0][l][EOB_TOKEN] ==
 183                    c[t][i][j][k][1][l][EOB_TOKEN]);
 184           }
 185 }
 186
 187 static const int rd_iifactor[32] = {
 188   4, 4, 3, 2, 1, 0, 0, 0,
 189   0, 0, 0, 0, 0, 0, 0, 0,
 190   0, 0, 0, 0, 0, 0, 0, 0,
 191   0, 0, 0, 0, 0, 0, 0, 0,
 192 };
 193
 194 // 3* dc_qlookup[Q]*dc_qlookup[Q];
 195
 196 /* values are now correlated to quantizer */
 197 static int sad_per_bit16lut[QINDEX_RANGE];
 198 static int sad_per_bit4lut[QINDEX_RANGE];
 199
 200 void vp9_init_me_luts() {
 201   int i;
 202
 203   // Initialize the sad lut tables using a formulaic calculation for now
 204   // This is to make it easier to resolve the impact of experimental changes
 205   // to the quantizer tables.
 206   for (i = 0; i < QINDEX_RANGE; i++) {
 207     const double q = vp9_convert_qindex_to_q(i);
 208     sad_per_bit16lut[i] = (int)(0.0418 * q + 2.4107);
 209     sad_per_bit4lut[i] = (int)(0.063 * q + 2.742);
 210   }
 211 }
 212
 213 int vp9_compute_rd_mult(VP9_COMP *cpi, int qindex) {
 214   const int q = vp9_dc_quant(qindex, 0);
 215   // TODO(debargha): Adjust the function below
 216   int rdmult = 88 * q * q / 25;
 217   if (cpi->pass == 2 && (cpi->common.frame_type != KEY_FRAME)) {
 218     if (cpi->twopass.next_iiratio > 31)
 219       rdmult += (rdmult * rd_iifactor[31]) >> 4;
 220     else
 221       rdmult += (rdmult * rd_iifactor[cpi->twopass.next_iiratio]) >> 4;
 222   }
 223   return rdmult;
 224 }
 225
 226 static int compute_rd_thresh_factor(int qindex) {
 227   int q;
 228   // TODO(debargha): Adjust the function below
 229   q = (int)(pow(vp9_dc_quant(qindex, 0) / 4.0, RD_THRESH_POW) * 5.12);
 230   if (q < 8)
 231     q = 8;
 232   return q;
 233 }
 234
 235 void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex) {
 236   cpi->mb.sadperbit16 = sad_per_bit16lut[qindex];
 237   cpi->mb.sadperbit4 = sad_per_bit4lut[qindex];
 238 }
 239
 240 static void set_block_thresholds(VP9_COMP *cpi) {
 241   int i, bsize, segment_id;
 242   VP9_COMMON *cm = &cpi->common;
 243   SPEED_FEATURES *sf = &cpi->sf;
 244
 245   for (segment_id = 0; segment_id < MAX_SEGMENTS; ++segment_id) {
 246     const int qindex = clamp(vp9_get_qindex(&cm->seg, segment_id,
 247                                             cm->base_qindex) + cm->y_dc_delta_q,
 248                              0, MAXQ);
 249     const int q = compute_rd_thresh_factor(qindex);
 250
 251     for (bsize = 0; bsize < BLOCK_SIZES; ++bsize) {
 252       // Threshold here seems unnecessarily harsh but fine given actual
 253       // range of values used for cpi->sf.thresh_mult[].
 254       const int t = q * rd_thresh_block_size_factor[bsize];
 255       const int thresh_max = INT_MAX / t;
 256
 257       for (i = 0; i < MAX_MODES; ++i)
 258         cpi->rd_threshes[segment_id][bsize][i] =
 259             sf->thresh_mult[i] < thresh_max ? sf->thresh_mult[i] * t / 4
 260                                             : INT_MAX;
 261
 262       for (i = 0; i < MAX_REFS; ++i) {
 263         cpi->rd_thresh_sub8x8[segment_id][bsize][i] =
 264             sf->thresh_mult_sub8x8[i] < thresh_max
 265                 ? sf->thresh_mult_sub8x8[i] * t / 4
 266                 : INT_MAX;
 267       }
 268     }
 269   }
 270 }
 271
 272 void vp9_initialize_rd_consts(VP9_COMP *cpi) {
 273   VP9_COMMON *cm = &cpi->common;
 274   MACROBLOCK *x = &cpi->mb;
 275   int qindex, i;
 276
 277   vp9_clear_system_state();  // __asm emms;
 278
 279   // Further tests required to see if optimum is different
 280   // for key frames, golden frames and arf frames.
 281   // if (cpi->common.refresh_golden_frame ||
 282   //     cpi->common.refresh_alt_ref_frame)
 283   qindex = clamp(cm->base_qindex + cm->y_dc_delta_q, 0, MAXQ);
 284
 285   cpi->RDDIV = RDDIV_BITS;  // in bits (to multiply D by 128)
 286   cpi->RDMULT = vp9_compute_rd_mult(cpi, qindex);
 287
 288   x->errorperbit = cpi->RDMULT / RD_MULT_EPB_RATIO;
 289   x->errorperbit += (x->errorperbit == 0);
 290
 291   vp9_set_speed_features(cpi);
 292
 293   x->select_txfm_size = (cpi->sf.tx_size_search_method == USE_LARGESTALL &&
 294                          cm->frame_type != KEY_FRAME) ? 0 : 1;
 295
 296   set_block_thresholds(cpi);
 297
 298   fill_token_costs(x->token_costs, cm->fc.coef_probs);
 299
 300   if (!cpi->sf.use_pick_mode) {
 301     for (i = 0; i < PARTITION_CONTEXTS; i++)
 302       vp9_cost_tokens(x->partition_cost[i], get_partition_probs(cm, i),
 303                       vp9_partition_tree);
 304
 305     fill_mode_costs(cpi);
 306
 307     if (!frame_is_intra_only(cm)) {
 308       vp9_build_nmv_cost_table(x->nmvjointcost,
 309                                cm->allow_high_precision_mv ? x->nmvcost_hp
 310                                                            : x->nmvcost,
 311                                &cm->fc.nmvc,
 312                                cm->allow_high_precision_mv, 1, 1);
 313
 314       for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
 315         vp9_cost_tokens((int *)x->inter_mode_cost[i],
 316                         cm->fc.inter_mode_probs[i], vp9_inter_mode_tree);
 317     }
 318   }
 319 }
 320
 321 static const int MAX_XSQ_Q10 = 245727;
 322
 323 static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) {
 324   // NOTE: The tables below must be of the same size
 325
 326   // The functions described below are sampled at the four most significant
 327   // bits of x^2 + 8 / 256
 328
 329   // Normalized rate
 330   // This table models the rate for a Laplacian source
 331   // source with given variance when quantized with a uniform quantizer
 332   // with given stepsize. The closed form expression is:
 333   // Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)],
 334   // where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance),
 335   // and H(x) is the binary entropy function.
 336   static const int rate_tab_q10[] = {
 337     65536,  6086,  5574,  5275,  5063,  4899,  4764,  4651,
 338      4553,  4389,  4255,  4142,  4044,  3958,  3881,  3811,
 339      3748,  3635,  3538,  3453,  3376,  3307,  3244,  3186,
 340      3133,  3037,  2952,  2877,  2809,  2747,  2690,  2638,
 341      2589,  2501,  2423,  2353,  2290,  2232,  2179,  2130,
 342      2084,  2001,  1928,  1862,  1802,  1748,  1698,  1651,
 343      1608,  1530,  1460,  1398,  1342,  1290,  1243,  1199,
 344      1159,  1086,  1021,   963,   911,   864,   821,   781,
 345       745,   680,   623,   574,   530,   490,   455,   424,
 346       395,   345,   304,   269,   239,   213,   190,   171,
 347       154,   126,   104,    87,    73,    61,    52,    44,
 348        38,    28,    21,    16,    12,    10,     8,     6,
 349         5,     3,     2,     1,     1,     1,     0,     0,
 350   };
 351   // Normalized distortion
 352   // This table models the normalized distortion for a Laplacian source
 353   // source with given variance when quantized with a uniform quantizer
 354   // with given stepsize. The closed form expression is:
 355   // Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2))
 356   // where x = qpstep / sqrt(variance)
 357   // Note the actual distortion is Dn * variance.
 358   static const int dist_tab_q10[] = {
 359        0,     0,     1,     1,     1,     2,     2,     2,
 360        3,     3,     4,     5,     5,     6,     7,     7,
 361        8,     9,    11,    12,    13,    15,    16,    17,
 362       18,    21,    24,    26,    29,    31,    34,    36,
 363       39,    44,    49,    54,    59,    64,    69,    73,
 364       78,    88,    97,   106,   115,   124,   133,   142,
 365      151,   167,   184,   200,   215,   231,   245,   260,
 366      274,   301,   327,   351,   375,   397,   418,   439,
 367      458,   495,   528,   559,   587,   613,   637,   659,
 368      680,   717,   749,   777,   801,   823,   842,   859,
 369      874,   899,   919,   936,   949,   960,   969,   977,
 370      983,   994,  1001,  1006,  1010,  1013,  1015,  1017,
 371     1018,  1020,  1022,  1022,  1023,  1023,  1023,  1024,
 372   };
 373   static const int xsq_iq_q10[] = {
 374          0,      4,      8,     12,     16,     20,     24,     28,
 375         32,     40,     48,     56,     64,     72,     80,     88,
 376         96,    112,    128,    144,    160,    176,    192,    208,
 377        224,    256,    288,    320,    352,    384,    416,    448,
 378        480,    544,    608,    672,    736,    800,    864,    928,
 379        992,   1120,   1248,   1376,   1504,   1632,   1760,   1888,
 380       2016,   2272,   2528,   2784,   3040,   3296,   3552,   3808,
 381       4064,   4576,   5088,   5600,   6112,   6624,   7136,   7648,
 382       8160,   9184,  10208,  11232,  12256,  13280,  14304,  15328,
 383      16352,  18400,  20448,  22496,  24544,  26592,  28640,  30688,
 384      32736,  36832,  40928,  45024,  49120,  53216,  57312,  61408,
 385      65504,  73696,  81888,  90080,  98272, 106464, 114656, 122848,
 386     131040, 147424, 163808, 180192, 196576, 212960, 229344, 245728,
 387   };
 388   /*
 389   static const int tab_size = sizeof(rate_tab_q10) / sizeof(rate_tab_q10[0]);
 390   assert(sizeof(dist_tab_q10) / sizeof(dist_tab_q10[0]) == tab_size);
 391   assert(sizeof(xsq_iq_q10) / sizeof(xsq_iq_q10[0]) == tab_size);
 392   assert(MAX_XSQ_Q10 + 1 == xsq_iq_q10[tab_size - 1]);
 393   */
 394   int tmp = (xsq_q10 >> 2) + 8;
 395   int k = get_msb(tmp) - 3;
 396   int xq = (k << 3) + ((tmp >> k) & 0x7);
 397   const int one_q10 = 1 << 10;
 398   const int a_q10 = ((xsq_q10 - xsq_iq_q10[xq]) << 10) >> (2 + k);
 399   const int b_q10 = one_q10 - a_q10;
 400   *r_q10 = (rate_tab_q10[xq] * b_q10 + rate_tab_q10[xq + 1] * a_q10) >> 10;
 401   *d_q10 = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10;
 402 }
 403
 404 static void model_rd_from_var_lapndz(unsigned int var, unsigned int n,
 405                                      unsigned int qstep, int *rate,
 406                                      int64_t *dist) {
 407   // This function models the rate and distortion for a Laplacian
 408   // source with given variance when quantized with a uniform quantizer
 409   // with given stepsize. The closed form expressions are in:
 410   // Hang and Chen, "Source Model for transform video coder and its
 411   // application - Part I: Fundamental Theory", IEEE Trans. Circ.
 412   // Sys. for Video Tech., April 1997.
 413   if (var == 0) {
 414     *rate = 0;
 415     *dist = 0;
 416   } else {
 417     int d_q10, r_q10;
 418     uint64_t xsq_q10_64 =
 419         ((((uint64_t)qstep * qstep * n) << 10) + (var >> 1)) / var;
 420     int xsq_q10 = xsq_q10_64 > MAX_XSQ_Q10 ? MAX_XSQ_Q10 : xsq_q10_64;
 421     model_rd_norm(xsq_q10, &r_q10, &d_q10);
 422     *rate = (n * r_q10 + 2) >> 2;
 423     *dist = (var * (int64_t)d_q10 + 512) >> 10;
 424   }
 425 }
 426
 427 static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize,
 428                             MACROBLOCK *x, MACROBLOCKD *xd,
 429                             int *out_rate_sum, int64_t *out_dist_sum) {
 430   // Note our transform coeffs are 8 times an orthogonal transform.
 431   // Hence quantizer step is also 8 times. To get effective quantizer
 432   // we need to divide by 8 before sending to modeling function.
 433   int i, rate_sum = 0, dist_sum = 0;
 434   int ref = xd->mi_8x8[0]->mbmi.ref_frame[0];
 435   unsigned int sse;
 436
 437   for (i = 0; i < MAX_MB_PLANE; ++i) {
 438     struct macroblock_plane *const p = &x->plane[i];
 439     struct macroblockd_plane *const pd = &xd->plane[i];
 440     const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
 441
 442     (void) cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride,
 443                               pd->dst.buf, pd->dst.stride, &sse);
 444
 445     if (i == 0)
 446       x->pred_sse[ref] = sse;
 447     if (cpi->sf.use_pick_mode) {
 448       dist_sum += (int)sse;
 449     } else {
 450       int rate;
 451       int64_t dist;
 452       model_rd_from_var_lapndz(sse, 1 << num_pels_log2_lookup[bs],
 453                                pd->dequant[1] >> 3, &rate, &dist);
 454       rate_sum += rate;
 455       dist_sum += (int)dist;
 456     }
 457   }
 458
 459   *out_rate_sum = rate_sum;
 460   *out_dist_sum = (int64_t)dist_sum << 4;
 461 }
 462
 463 static void model_rd_for_sb_y_tx(VP9_COMP *cpi, BLOCK_SIZE bsize,
 464                                  TX_SIZE tx_size,
 465                                  MACROBLOCK *x, MACROBLOCKD *xd,
 466                                  int *out_rate_sum, int64_t *out_dist_sum,
 467                                  int *out_skip) {
 468   int j, k;
 469   BLOCK_SIZE bs;
 470   struct macroblock_plane *const p = &x->plane[0];
 471   struct macroblockd_plane *const pd = &xd->plane[0];
 472   const int width = 4 * num_4x4_blocks_wide_lookup[bsize];
 473   const int height = 4 * num_4x4_blocks_high_lookup[bsize];
 474   int rate_sum = 0;
 475   int64_t dist_sum = 0;
 476   const int t = 4 << tx_size;
 477
 478   if (tx_size == TX_4X4) {
 479     bs = BLOCK_4X4;
 480   } else if (tx_size == TX_8X8) {
 481     bs = BLOCK_8X8;
 482   } else if (tx_size == TX_16X16) {
 483     bs = BLOCK_16X16;
 484   } else if (tx_size == TX_32X32) {
 485     bs = BLOCK_32X32;
 486   } else {
 487     assert(0);
 488   }
 489
 490   *out_skip = 1;
 491   for (j = 0; j < height; j += t) {
 492     for (k = 0; k < width; k += t) {
 493       int rate;
 494       int64_t dist;
 495       unsigned int sse;
 496       cpi->fn_ptr[bs].vf(&p->src.buf[j * p->src.stride + k], p->src.stride,
 497                          &pd->dst.buf[j * pd->dst.stride + k], pd->dst.stride,
 498                          &sse);
 499       // sse works better than var, since there is no dc prediction used
 500       model_rd_from_var_lapndz(sse, t * t, pd->dequant[1] >> 3, &rate, &dist);
 501       rate_sum += rate;
 502       dist_sum += dist;
 503       *out_skip &= (rate < 1024);
 504     }
 505   }
 506
 507   *out_rate_sum = rate_sum;
 508   *out_dist_sum = dist_sum << 4;
 509 }
 510
 511 int64_t vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff,
 512                           intptr_t block_size, int64_t *ssz) {
 513   int i;
 514   int64_t error = 0, sqcoeff = 0;
 515
 516   for (i = 0; i < block_size; i++) {
 517     int this_diff = coeff[i] - dqcoeff[i];
 518     error += (unsigned)this_diff * this_diff;
 519     sqcoeff += (unsigned) coeff[i] * coeff[i];
 520   }
 521
 522   *ssz = sqcoeff;
 523   return error;
 524 }
 525
 526 /* The trailing '0' is a terminator which is used inside cost_coeffs() to
 527  * decide whether to include cost of a trailing EOB node or not (i.e. we
 528  * can skip this if the last coefficient in this transform block, e.g. the
 529  * 16th coefficient in a 4x4 block or the 64th coefficient in a 8x8 block,
 530  * were non-zero). */
 531 static const int16_t band_counts[TX_SIZES][8] = {
 532   { 1, 2, 3, 4,  3,   16 - 13, 0 },
 533   { 1, 2, 3, 4, 11,   64 - 21, 0 },
 534   { 1, 2, 3, 4, 11,  256 - 21, 0 },
 535   { 1, 2, 3, 4, 11, 1024 - 21, 0 },
 536 };
 537
 538 static INLINE int cost_coeffs(MACROBLOCK *x,
 539                               int plane, int block,
 540                               ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L,
 541                               TX_SIZE tx_size,
 542                               const int16_t *scan, const int16_t *nb) {
 543   MACROBLOCKD *const xd = &x->e_mbd;
 544   MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
 545   struct macroblock_plane *p = &x->plane[plane];
 546   struct macroblockd_plane *pd = &xd->plane[plane];
 547   const PLANE_TYPE type = pd->plane_type;
 548   const int16_t *band_count = &band_counts[tx_size][1];
 549   const int eob = p->eobs[block];
 550   const int16_t *const qcoeff_ptr = BLOCK_OFFSET(p->qcoeff, block);
 551   const int ref = mbmi->ref_frame[0] != INTRA_FRAME;
 552   unsigned int (*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
 553                    x->token_costs[tx_size][type][ref];
 554   const ENTROPY_CONTEXT above_ec = !!*A, left_ec = !!*L;
 555   uint8_t *p_tok = x->token_cache;
 556   int pt = combine_entropy_contexts(above_ec, left_ec);
 557   int c, cost;
 558
 559   // Check for consistency of tx_size with mode info
 560   assert(type == PLANE_TYPE_Y ? mbmi->tx_size == tx_size
 561                                       : get_uv_tx_size(mbmi) == tx_size);
 562
 563   if (eob == 0) {
 564     // single eob token
 565     cost = token_costs[0][0][pt][EOB_TOKEN];
 566     c = 0;
 567   } else {
 568     int band_left = *band_count++;
 569
 570     // dc token
 571     int v = qcoeff_ptr[0];
 572     int prev_t = vp9_dct_value_tokens_ptr[v].token;
 573     cost = (*token_costs)[0][pt][prev_t] + vp9_dct_value_cost_ptr[v];
 574     p_tok[0] = vp9_pt_energy_class[prev_t];
 575     ++token_costs;
 576
 577     // ac tokens
 578     for (c = 1; c < eob; c++) {
 579       const int rc = scan[c];
 580       int t;
 581
 582       v = qcoeff_ptr[rc];
 583       t = vp9_dct_value_tokens_ptr[v].token;
 584       pt = get_coef_context(nb, p_tok, c);
 585       cost += (*token_costs)[!prev_t][pt][t] + vp9_dct_value_cost_ptr[v];
 586       p_tok[rc] = vp9_pt_energy_class[t];
 587       prev_t = t;
 588       if (!--band_left) {
 589         band_left = *band_count++;
 590         ++token_costs;
 591       }
 592     }
 593
 594     // eob token
 595     if (band_left) {
 596       pt = get_coef_context(nb, p_tok, c);
 597       cost += (*token_costs)[0][pt][EOB_TOKEN];
 598     }
 599   }
 600
 601   // is eob first coefficient;
 602   *A = *L = (c > 0);
 603
 604   return cost;
 605 }
 606
 607 static void dist_block(int plane, int block, TX_SIZE tx_size,
 608                        struct rdcost_block_args* args) {
 609   const int ss_txfrm_size = tx_size << 1;
 610   MACROBLOCK* const x = args->x;
 611   MACROBLOCKD* const xd = &x->e_mbd;
 612   struct macroblock_plane *const p = &x->plane[plane];
 613   struct macroblockd_plane *const pd = &xd->plane[plane];
 614   int64_t this_sse;
 615   int shift = tx_size == TX_32X32 ? 0 : 2;
 616   int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);
 617   int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
 618   args->dist = vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
 619                                &this_sse) >> shift;
 620   args->sse  = this_sse >> shift;
 621
 622   if (x->skip_encode && !is_inter_block(&xd->mi_8x8[0]->mbmi)) {
 623     // TODO(jingning): tune the model to better capture the distortion.
 624     int64_t p = (pd->dequant[1] * pd->dequant[1] *
 625                     (1 << ss_txfrm_size)) >> (shift + 2);
 626     args->dist += (p >> 4);
 627     args->sse  += p;
 628   }
 629 }
 630
 631 static void rate_block(int plane, int block, BLOCK_SIZE plane_bsize,
 632                        TX_SIZE tx_size, struct rdcost_block_args* args) {
 633   int x_idx, y_idx;
 634   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x_idx, &y_idx);
 635
 636   args->rate = cost_coeffs(args->x, plane, block, args->t_above + x_idx,
 637                            args->t_left + y_idx, tx_size,
 638                            args->so->scan, args->so->neighbors);
 639 }
 640
 641 static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
 642                           TX_SIZE tx_size, void *arg) {
 643   struct rdcost_block_args *args = arg;
 644   MACROBLOCK *const x = args->x;
 645   MACROBLOCKD *const xd = &x->e_mbd;
 646   MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
 647   int64_t rd1, rd2, rd;
 648
 649   if (args->skip)
 650     return;
 651
 652   if (!is_inter_block(mbmi))
 653     vp9_encode_block_intra(x, plane, block, plane_bsize, tx_size, &mbmi->skip);
 654   else
 655     vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
 656
 657   dist_block(plane, block, tx_size, args);
 658   rate_block(plane, block, plane_bsize, tx_size, args);
 659   rd1 = RDCOST(x->rdmult, x->rddiv, args->rate, args->dist);
 660   rd2 = RDCOST(x->rdmult, x->rddiv, 0, args->sse);
 661
 662   // TODO(jingning): temporarily enabled only for luma component
 663   rd = MIN(rd1, rd2);
 664   if (plane == 0)
 665     x->zcoeff_blk[tx_size][block] = !x->plane[plane].eobs[block] ||
 666                                     (rd1 > rd2 && !xd->lossless);
 667
 668   args->this_rate += args->rate;
 669   args->this_dist += args->dist;
 670   args->this_sse  += args->sse;
 671   args->this_rd += rd;
 672
 673   if (args->this_rd > args->best_rd) {
 674     args->skip = 1;
 675     return;
 676   }
 677 }
 678
 679 void vp9_get_entropy_contexts(TX_SIZE tx_size,
 680     ENTROPY_CONTEXT t_above[16], ENTROPY_CONTEXT t_left[16],
 681     const ENTROPY_CONTEXT *above, const ENTROPY_CONTEXT *left,
 682     int num_4x4_w, int num_4x4_h) {
 683   int i;
 684   switch (tx_size) {
 685     case TX_4X4:
 686       vpx_memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w);
 687       vpx_memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h);
 688       break;
 689     case TX_8X8:
 690       for (i = 0; i < num_4x4_w; i += 2)
 691         t_above[i] = !!*(const uint16_t *)&above[i];
 692       for (i = 0; i < num_4x4_h; i += 2)
 693         t_left[i] = !!*(const uint16_t *)&left[i];
 694       break;
 695     case TX_16X16:
 696       for (i = 0; i < num_4x4_w; i += 4)
 697         t_above[i] = !!*(const uint32_t *)&above[i];
 698       for (i = 0; i < num_4x4_h; i += 4)
 699         t_left[i] = !!*(const uint32_t *)&left[i];
 700       break;
 701     case TX_32X32:
 702       for (i = 0; i < num_4x4_w; i += 8)
 703         t_above[i] = !!*(const uint64_t *)&above[i];
 704       for (i = 0; i < num_4x4_h; i += 8)
 705         t_left[i] = !!*(const uint64_t *)&left[i];
 706       break;
 707     default:
 708       assert(0 && "Invalid transform size.");
 709   }
 710 }
 711
 712 static void txfm_rd_in_plane(MACROBLOCK *x,
 713                              int *rate, int64_t *distortion,
 714                              int *skippable, int64_t *sse,
 715                              int64_t ref_best_rd, int plane,
 716                              BLOCK_SIZE bsize, TX_SIZE tx_size) {
 717   MACROBLOCKD *const xd = &x->e_mbd;
 718   struct macroblockd_plane *const pd = &xd->plane[plane];
 719   const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
 720   const int num_4x4_w = num_4x4_blocks_wide_lookup[bs];
 721   const int num_4x4_h = num_4x4_blocks_high_lookup[bs];
 722   struct rdcost_block_args args = { 0 };
 723   args.x = x;
 724   args.best_rd = ref_best_rd;
 725
 726   if (plane == 0)
 727     xd->mi_8x8[0]->mbmi.tx_size = tx_size;
 728
 729   vp9_get_entropy_contexts(tx_size, args.t_above, args.t_left,
 730                            pd->above_context, pd->left_context,
 731                            num_4x4_w, num_4x4_h);
 732
 733   args.so = get_scan(xd, tx_size, pd->plane_type, 0);
 734
 735   vp9_foreach_transformed_block_in_plane(xd, bsize, plane,
 736                                          block_rd_txfm, &args);
 737   if (args.skip) {
 738     *rate       = INT_MAX;
 739     *distortion = INT64_MAX;
 740     *sse        = INT64_MAX;
 741     *skippable  = 0;
 742   } else {
 743     *distortion = args.this_dist;
 744     *rate       = args.this_rate;
 745     *sse        = args.this_sse;
 746     *skippable  = vp9_is_skippable_in_plane(x, bsize, plane);
 747   }
 748 }
 749
 750 static void choose_largest_txfm_size(VP9_COMP *cpi, MACROBLOCK *x,
 751                                      int *rate, int64_t *distortion,
 752                                      int *skip, int64_t *sse,
 753                                      int64_t ref_best_rd,
 754                                      BLOCK_SIZE bs) {
 755   const TX_SIZE max_tx_size = max_txsize_lookup[bs];
 756   VP9_COMMON *const cm = &cpi->common;
 757   const TX_SIZE largest_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode];
 758   MACROBLOCKD *const xd = &x->e_mbd;
 759   MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
 760
 761   mbmi->tx_size = MIN(max_tx_size, largest_tx_size);
 762
 763   txfm_rd_in_plane(x, rate, distortion, skip,
 764                    &sse[mbmi->tx_size], ref_best_rd, 0, bs,
 765                    mbmi->tx_size);
 766   cpi->tx_stepdown_count[0]++;
 767 }
 768
 769 static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
 770                                      int (*r)[2], int *rate,
 771                                      int64_t *d, int64_t *distortion,
 772                                      int *s, int *skip,
 773                                      int64_t tx_cache[TX_MODES],
 774                                      BLOCK_SIZE bs) {
 775   const TX_SIZE max_tx_size = max_txsize_lookup[bs];
 776   VP9_COMMON *const cm = &cpi->common;
 777   MACROBLOCKD *const xd = &x->e_mbd;
 778   MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
 779   vp9_prob skip_prob = vp9_get_skip_prob(cm, xd);
 780   int64_t rd[TX_SIZES][2] = {{INT64_MAX, INT64_MAX},
 781                              {INT64_MAX, INT64_MAX},
 782                              {INT64_MAX, INT64_MAX},
 783                              {INT64_MAX, INT64_MAX}};
 784   int n, m;
 785   int s0, s1;
 786   const TX_SIZE max_mode_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode];
 787   int64_t best_rd = INT64_MAX;
 788   TX_SIZE best_tx = TX_4X4;
 789
 790   const vp9_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc.tx_probs);
 791   assert(skip_prob > 0);
 792   s0 = vp9_cost_bit(skip_prob, 0);
 793   s1 = vp9_cost_bit(skip_prob, 1);
 794
 795   for (n = TX_4X4; n <= max_tx_size; n++) {
 796     r[n][1] = r[n][0];
 797     if (r[n][0] < INT_MAX) {
 798       for (m = 0; m <= n - (n == max_tx_size); m++) {
 799         if (m == n)
 800           r[n][1] += vp9_cost_zero(tx_probs[m]);
 801         else
 802           r[n][1] += vp9_cost_one(tx_probs[m]);
 803       }
 804     }
 805     if (d[n] == INT64_MAX) {
 806       rd[n][0] = rd[n][1] = INT64_MAX;
 807     } else if (s[n]) {
 808       rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]);
 809     } else {
 810       rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]);
 811       rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]);
 812     }
 813
 814     if (rd[n][1] < best_rd) {
 815       best_tx = n;
 816       best_rd = rd[n][1];
 817     }
 818   }
 819   mbmi->tx_size = cm->tx_mode == TX_MODE_SELECT ?
 820                       best_tx : MIN(max_tx_size, max_mode_tx_size);
 821
 822
 823   *distortion = d[mbmi->tx_size];
 824   *rate       = r[mbmi->tx_size][cm->tx_mode == TX_MODE_SELECT];
 825   *skip       = s[mbmi->tx_size];
 826
 827   tx_cache[ONLY_4X4] = rd[TX_4X4][0];
 828   tx_cache[ALLOW_8X8] = rd[TX_8X8][0];
 829   tx_cache[ALLOW_16X16] = rd[MIN(max_tx_size, TX_16X16)][0];
 830   tx_cache[ALLOW_32X32] = rd[MIN(max_tx_size, TX_32X32)][0];
 831
 832   if (max_tx_size == TX_32X32 && best_tx == TX_32X32) {
 833     tx_cache[TX_MODE_SELECT] = rd[TX_32X32][1];
 834     cpi->tx_stepdown_count[0]++;
 835   } else if (max_tx_size >= TX_16X16 && best_tx == TX_16X16) {
 836     tx_cache[TX_MODE_SELECT] = rd[TX_16X16][1];
 837     cpi->tx_stepdown_count[max_tx_size - TX_16X16]++;
 838   } else if (rd[TX_8X8][1] < rd[TX_4X4][1]) {
 839     tx_cache[TX_MODE_SELECT] = rd[TX_8X8][1];
 840     cpi->tx_stepdown_count[max_tx_size - TX_8X8]++;
 841   } else {
 842     tx_cache[TX_MODE_SELECT] = rd[TX_4X4][1];
 843     cpi->tx_stepdown_count[max_tx_size - TX_4X4]++;
 844   }
 845 }
 846
 847 static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x,
 848                                           int (*r)[2], int *rate,
 849                                           int64_t *d, int64_t *distortion,
 850                                           int *s, int *skip, int64_t *sse,
 851                                           int64_t ref_best_rd,
 852                                           BLOCK_SIZE bs) {
 853   const TX_SIZE max_tx_size = max_txsize_lookup[bs];
 854   VP9_COMMON *const cm = &cpi->common;
 855   MACROBLOCKD *const xd = &x->e_mbd;
 856   MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
 857   vp9_prob skip_prob = vp9_get_skip_prob(cm, xd);
 858   int64_t rd[TX_SIZES][2] = {{INT64_MAX, INT64_MAX},
 859                              {INT64_MAX, INT64_MAX},
 860                              {INT64_MAX, INT64_MAX},
 861                              {INT64_MAX, INT64_MAX}};
 862   int n, m;
 863   int s0, s1;
 864   double scale_rd[TX_SIZES] = {1.73, 1.44, 1.20, 1.00};
 865   const TX_SIZE max_mode_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode];
 866   int64_t best_rd = INT64_MAX;
 867   TX_SIZE best_tx = TX_4X4;
 868
 869   const vp9_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc.tx_probs);
 870   assert(skip_prob > 0);
 871   s0 = vp9_cost_bit(skip_prob, 0);
 872   s1 = vp9_cost_bit(skip_prob, 1);
 873
 874   for (n = TX_4X4; n <= max_tx_size; n++) {
 875     double scale = scale_rd[n];
 876     r[n][1] = r[n][0];
 877     for (m = 0; m <= n - (n == max_tx_size); m++) {
 878       if (m == n)
 879         r[n][1] += vp9_cost_zero(tx_probs[m]);
 880       else
 881         r[n][1] += vp9_cost_one(tx_probs[m]);
 882     }
 883     if (s[n]) {
 884       rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]) * scale;
 885     } else {
 886       rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]) * scale;
 887       rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]) * scale;
 888     }
 889     if (rd[n][1] < best_rd) {
 890       best_rd = rd[n][1];
 891       best_tx = n;
 892     }
 893   }
 894
 895   mbmi->tx_size = cm->tx_mode == TX_MODE_SELECT ?
 896                       best_tx : MIN(max_tx_size, max_mode_tx_size);
 897
 898   // Actually encode using the chosen mode if a model was used, but do not
 899   // update the r, d costs
 900   txfm_rd_in_plane(x, rate, distortion, skip,
 901                    &sse[mbmi->tx_size], ref_best_rd, 0, bs, mbmi->tx_size);
 902
 903   if (max_tx_size == TX_32X32 && best_tx == TX_32X32) {
 904     cpi->tx_stepdown_count[0]++;
 905   } else if (max_tx_size >= TX_16X16 &&  best_tx == TX_16X16) {
 906     cpi->tx_stepdown_count[max_tx_size - TX_16X16]++;
 907   } else if (rd[TX_8X8][1] <= rd[TX_4X4][1]) {
 908     cpi->tx_stepdown_count[max_tx_size - TX_8X8]++;
 909   } else {
 910     cpi->tx_stepdown_count[max_tx_size - TX_4X4]++;
 911   }
 912 }
 913
 914 static void super_block_yrd(VP9_COMP *cpi,
 915                             MACROBLOCK *x, int *rate, int64_t *distortion,
 916                             int *skip, int64_t *psse, BLOCK_SIZE bs,
 917                             int64_t txfm_cache[TX_MODES],
 918                             int64_t ref_best_rd) {
 919   int r[TX_SIZES][2], s[TX_SIZES];
 920   int64_t d[TX_SIZES], sse[TX_SIZES];
 921   MACROBLOCKD *xd = &x->e_mbd;
 922   MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
 923   const int b_inter_mode = is_inter_block(mbmi);
 924   const TX_SIZE max_tx_size = max_txsize_lookup[bs];
 925   TX_SIZE tx_size;
 926
 927
 928   assert(bs == mbmi->sb_type);
 929   if (b_inter_mode)
 930     vp9_subtract_sby(x, bs);
 931
 932   if (cpi->sf.tx_size_search_method == USE_LARGESTALL ||
 933       (cpi->sf.tx_size_search_method != USE_FULL_RD &&
 934        !b_inter_mode)) {
 935     vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t));
 936     choose_largest_txfm_size(cpi, x, rate, distortion, skip, sse,
 937                              ref_best_rd, bs);
 938     if (psse)
 939       *psse = sse[mbmi->tx_size];
 940     return;
 941   }
 942
 943   if (cpi->sf.tx_size_search_method == USE_LARGESTINTRA_MODELINTER &&
 944       b_inter_mode) {
 945     for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
 946       model_rd_for_sb_y_tx(cpi, bs, tx_size, x, xd,
 947                            &r[tx_size][0], &d[tx_size], &s[tx_size]);
 948     choose_txfm_size_from_modelrd(cpi, x, r, rate, d, distortion, s,
 949                                   skip, sse, ref_best_rd, bs);
 950   } else {
 951     for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
 952       txfm_rd_in_plane(x, &r[tx_size][0], &d[tx_size],
 953                        &s[tx_size], &sse[tx_size],
 954                        ref_best_rd, 0, bs, tx_size);
 955     choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s,
 956                              skip, txfm_cache, bs);
 957   }
 958   if (psse)
 959     *psse = sse[mbmi->tx_size];
 960 }
 961
 962 static int conditional_skipintra(MB_PREDICTION_MODE mode,
 963                                  MB_PREDICTION_MODE best_intra_mode) {
 964   if (mode == D117_PRED &&
 965       best_intra_mode != V_PRED &&
 966       best_intra_mode != D135_PRED)
 967     return 1;
 968   if (mode == D63_PRED &&
 969       best_intra_mode != V_PRED &&
 970       best_intra_mode != D45_PRED)
 971     return 1;
 972   if (mode == D207_PRED &&
 973       best_intra_mode != H_PRED &&
 974       best_intra_mode != D45_PRED)
 975     return 1;
 976   if (mode == D153_PRED &&
 977       best_intra_mode != H_PRED &&
 978       best_intra_mode != D135_PRED)
 979     return 1;
 980   return 0;
 981 }
 982
 983 static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
 984                                      MB_PREDICTION_MODE *best_mode,
 985                                      int *bmode_costs,
 986                                      ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
 987                                      int *bestrate, int *bestratey,
 988                                      int64_t *bestdistortion,
 989                                      BLOCK_SIZE bsize, int64_t rd_thresh) {
 990   MB_PREDICTION_MODE mode;
 991   MACROBLOCKD *xd = &x->e_mbd;
 992   int64_t best_rd = rd_thresh;
 993   int rate = 0;
 994   int64_t distortion;
 995   struct macroblock_plane *p = &x->plane[0];
 996   struct macroblockd_plane *pd = &xd->plane[0];
 997   const int src_stride = p->src.stride;
 998   const int dst_stride = pd->dst.stride;
 999   const uint8_t *src_init = &p->src.buf[raster_block_offset(BLOCK_8X8, ib,
1000                                                             src_stride)];
1001   uint8_t *dst_init = &pd->dst.buf[raster_block_offset(BLOCK_8X8, ib,
1002                                                        dst_stride)];
1003   int16_t *src_diff, *coeff;
1004
1005   ENTROPY_CONTEXT ta[2], tempa[2];
1006   ENTROPY_CONTEXT tl[2], templ[2];
1007
1008   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
1009   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
1010   int idx, idy;
1011   uint8_t best_dst[8 * 8];
1012
1013   assert(ib < 4);
1014
1015   vpx_memcpy(ta, a, sizeof(ta));
1016   vpx_memcpy(tl, l, sizeof(tl));
1017   xd->mi_8x8[0]->mbmi.tx_size = TX_4X4;
1018
1019   for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
1020     int64_t this_rd;
1021     int ratey = 0;
1022
1023     if (!(cpi->sf.intra_y_mode_mask[TX_4X4] & (1 << mode)))
1024       continue;
1025
1026     // Only do the oblique modes if the best so far is
1027     // one of the neighboring directional modes
1028     if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
1029       if (conditional_skipintra(mode, *best_mode))
1030           continue;
1031     }
1032
1033     rate = bmode_costs[mode];
1034     distortion = 0;
1035
1036     vpx_memcpy(tempa, ta, sizeof(ta));
1037     vpx_memcpy(templ, tl, sizeof(tl));
1038
1039     for (idy = 0; idy < num_4x4_blocks_high; ++idy) {
1040       for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {
1041         int64_t ssz;
1042         const scan_order *so;
1043         const uint8_t *src = src_init + idx * 4 + idy * 4 * src_stride;
1044         uint8_t *dst = dst_init + idx * 4 + idy * 4 * dst_stride;
1045         const int block = ib + idy * 2 + idx;
1046         TX_TYPE tx_type;
1047         xd->mi_8x8[0]->bmi[block].as_mode = mode;
1048         src_diff = raster_block_offset_int16(BLOCK_8X8, block, p->src_diff);
1049         coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
1050         vp9_predict_intra_block(xd, block, 1,
1051                                 TX_4X4, mode,
1052                                 x->skip_encode ? src : dst,
1053                                 x->skip_encode ? src_stride : dst_stride,
1054                                 dst, dst_stride, idx, idy, 0);
1055         vp9_subtract_block(4, 4, src_diff, 8,
1056                            src, src_stride,
1057                            dst, dst_stride);
1058
1059         tx_type = get_tx_type_4x4(PLANE_TYPE_Y, xd, block);
1060         so = &vp9_scan_orders[TX_4X4][tx_type];
1061
1062         if (tx_type != DCT_DCT)
1063           vp9_fht4x4(src_diff, coeff, 8, tx_type);
1064         else
1065           x->fwd_txm4x4(src_diff, coeff, 8);
1066
1067         vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
1068
1069         ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
1070                              so->scan, so->neighbors);
1071         distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block),
1072                                       16, &ssz) >> 2;
1073         if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
1074           goto next;
1075
1076         if (tx_type != DCT_DCT)
1077           vp9_iht4x4_16_add(BLOCK_OFFSET(pd->dqcoeff, block),
1078                                dst, pd->dst.stride, tx_type);
1079         else
1080           xd->itxm_add(BLOCK_OFFSET(pd->dqcoeff, block), dst, pd->dst.stride,
1081                        16);
1082       }
1083     }
1084
1085     rate += ratey;
1086     this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
1087
1088     if (this_rd < best_rd) {
1089       *bestrate = rate;
1090       *bestratey = ratey;
1091       *bestdistortion = distortion;
1092       best_rd = this_rd;
1093       *best_mode = mode;
1094       vpx_memcpy(a, tempa, sizeof(tempa));
1095       vpx_memcpy(l, templ, sizeof(templ));
1096       for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy)
1097         vpx_memcpy(best_dst + idy * 8, dst_init + idy * dst_stride,
1098                    num_4x4_blocks_wide * 4);
1099     }
1100   next:
1101     {}
1102   }
1103
1104   if (best_rd >= rd_thresh || x->skip_encode)
1105     return best_rd;
1106
1107   for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy)
1108     vpx_memcpy(dst_init + idy * dst_stride, best_dst + idy * 8,
1109                num_4x4_blocks_wide * 4);
1110
1111   return best_rd;
1112 }
1113
1114 static int64_t rd_pick_intra_sub_8x8_y_mode(VP9_COMP * const cpi,
1115                                             MACROBLOCK * const mb,
1116                                             int * const rate,
1117                                             int * const rate_y,
1118                                             int64_t * const distortion,
1119                                             int64_t best_rd) {
1120   int i, j;
1121   MACROBLOCKD *const xd = &mb->e_mbd;
1122   MODE_INFO *const mic = xd->mi_8x8[0];
1123   const MODE_INFO *above_mi = xd->mi_8x8[-xd->mode_info_stride];
1124   const MODE_INFO *left_mi = xd->left_available ? xd->mi_8x8[-1] : NULL;
1125   const BLOCK_SIZE bsize = xd->mi_8x8[0]->mbmi.sb_type;
1126   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
1127   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
1128   int idx, idy;
1129   int cost = 0;
1130   int64_t total_distortion = 0;
1131   int tot_rate_y = 0;
1132   int64_t total_rd = 0;
1133   ENTROPY_CONTEXT t_above[4], t_left[4];
1134   int *bmode_costs;
1135
1136   vpx_memcpy(t_above, xd->plane[0].above_context, sizeof(t_above));
1137   vpx_memcpy(t_left, xd->plane[0].left_context, sizeof(t_left));
1138
1139   bmode_costs = mb->mbmode_cost;
1140
1141   // Pick modes for each sub-block (of size 4x4, 4x8, or 8x4) in an 8x8 block.
1142   for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
1143     for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
1144       MB_PREDICTION_MODE best_mode = DC_PRED;
1145       int r = INT_MAX, ry = INT_MAX;
1146       int64_t d = INT64_MAX, this_rd = INT64_MAX;
1147       i = idy * 2 + idx;
1148       if (cpi->common.frame_type == KEY_FRAME) {
1149         const MB_PREDICTION_MODE A = vp9_above_block_mode(mic, above_mi, i);
1150         const MB_PREDICTION_MODE L = vp9_left_block_mode(mic, left_mi, i);
1151
1152         bmode_costs  = mb->y_mode_costs[A][L];
1153       }
1154
1155       this_rd = rd_pick_intra4x4block(cpi, mb, i, &best_mode, bmode_costs,
1156                                       t_above + idx, t_left + idy, &r, &ry, &d,
1157                                       bsize, best_rd - total_rd);
1158       if (this_rd >= best_rd - total_rd)
1159         return INT64_MAX;
1160
1161       total_rd += this_rd;
1162       cost += r;
1163       total_distortion += d;
1164       tot_rate_y += ry;
1165
1166       mic->bmi[i].as_mode = best_mode;
1167       for (j = 1; j < num_4x4_blocks_high; ++j)
1168         mic->bmi[i + j * 2].as_mode = best_mode;
1169       for (j = 1; j < num_4x4_blocks_wide; ++j)
1170         mic->bmi[i + j].as_mode = best_mode;
1171
1172       if (total_rd >= best_rd)
1173         return INT64_MAX;
1174     }
1175   }
1176
1177   *rate = cost;
1178   *rate_y = tot_rate_y;
1179   *distortion = total_distortion;
1180   mic->mbmi.mode = mic->bmi[3].as_mode;
1181
1182   return RDCOST(mb->rdmult, mb->rddiv, cost, total_distortion);
1183 }
1184
1185 static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
1186                                       int *rate, int *rate_tokenonly,
1187                                       int64_t *distortion, int *skippable,
1188                                       BLOCK_SIZE bsize,
1189                                       int64_t tx_cache[TX_MODES],
1190                                       int64_t best_rd) {
1191   MB_PREDICTION_MODE mode;
1192   MB_PREDICTION_MODE mode_selected = DC_PRED;
1193   MACROBLOCKD *const xd = &x->e_mbd;
1194   MODE_INFO *const mic = xd->mi_8x8[0];
1195   int this_rate, this_rate_tokenonly, s;
1196   int64_t this_distortion, this_rd;
1197   TX_SIZE best_tx = TX_4X4;
1198   int i;
1199   int *bmode_costs = x->mbmode_cost;
1200
1201   if (cpi->sf.tx_size_search_method == USE_FULL_RD)
1202     for (i = 0; i < TX_MODES; i++)
1203       tx_cache[i] = INT64_MAX;
1204
1205   /* Y Search for intra prediction mode */
1206   for (mode = DC_PRED; mode <= TM_PRED; mode++) {
1207     int64_t local_tx_cache[TX_MODES];
1208     MODE_INFO *above_mi = xd->mi_8x8[-xd->mode_info_stride];
1209     MODE_INFO *left_mi = xd->left_available ? xd->mi_8x8[-1] : NULL;
1210
1211     if (!(cpi->sf.intra_y_mode_mask[max_txsize_lookup[bsize]] & (1 << mode)))
1212       continue;
1213
1214     if (cpi->common.frame_type == KEY_FRAME) {
1215       const MB_PREDICTION_MODE A = vp9_above_block_mode(mic, above_mi, 0);
1216       const MB_PREDICTION_MODE L = vp9_left_block_mode(mic, left_mi, 0);
1217
1218       bmode_costs = x->y_mode_costs[A][L];
1219     }
1220     mic->mbmi.mode = mode;
1221
1222     super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s, NULL,
1223                     bsize, local_tx_cache, best_rd);
1224
1225     if (this_rate_tokenonly == INT_MAX)
1226       continue;
1227
1228     this_rate = this_rate_tokenonly + bmode_costs[mode];
1229     this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
1230
1231     if (this_rd < best_rd) {
1232       mode_selected   = mode;
1233       best_rd         = this_rd;
1234       best_tx         = mic->mbmi.tx_size;
1235       *rate           = this_rate;
1236       *rate_tokenonly = this_rate_tokenonly;
1237       *distortion     = this_distortion;
1238       *skippable      = s;
1239     }
1240
1241     if (cpi->sf.tx_size_search_method == USE_FULL_RD && this_rd < INT64_MAX) {
1242       for (i = 0; i < TX_MODES && local_tx_cache[i] < INT64_MAX; i++) {
1243         const int64_t adj_rd = this_rd + local_tx_cache[i] -
1244             local_tx_cache[cpi->common.tx_mode];
1245         if (adj_rd < tx_cache[i]) {
1246           tx_cache[i] = adj_rd;
1247         }
1248       }
1249     }
1250   }
1251
1252   mic->mbmi.mode = mode_selected;
1253   mic->mbmi.tx_size = best_tx;
1254
1255   return best_rd;
1256 }
1257
1258 static void super_block_uvrd(VP9_COMP *const cpi, MACROBLOCK *x,
1259                              int *rate, int64_t *distortion, int *skippable,
1260                              int64_t *sse, BLOCK_SIZE bsize,
1261                              int64_t ref_best_rd) {
1262   MACROBLOCKD *const xd = &x->e_mbd;
1263   MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
1264   TX_SIZE uv_txfm_size = get_uv_tx_size(mbmi);
1265   int plane;
1266   int pnrate = 0, pnskip = 1;
1267   int64_t pndist = 0, pnsse = 0;
1268
1269   if (ref_best_rd < 0)
1270     goto term;
1271
1272   if (is_inter_block(mbmi))
1273     vp9_subtract_sbuv(x, bsize);
1274
1275   *rate = 0;
1276   *distortion = 0;
1277   *sse = 0;
1278   *skippable = 1;
1279
1280   for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
1281     txfm_rd_in_plane(x, &pnrate, &pndist, &pnskip, &pnsse,
1282                      ref_best_rd, plane, bsize, uv_txfm_size);
1283     if (pnrate == INT_MAX)
1284       goto term;
1285     *rate += pnrate;
1286     *distortion += pndist;
1287     *sse += pnsse;
1288     *skippable &= pnskip;
1289   }
1290   return;
1291
1292   term:
1293   *rate = INT_MAX;
1294   *distortion = INT64_MAX;
1295   *sse = INT64_MAX;
1296   *skippable = 0;
1297   return;
1298 }
1299
1300 static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
1301                                        PICK_MODE_CONTEXT *ctx,
1302                                        int *rate, int *rate_tokenonly,
1303                                        int64_t *distortion, int *skippable,
1304                                        BLOCK_SIZE bsize, TX_SIZE max_tx_size) {
1305   MB_PREDICTION_MODE mode;
1306   MB_PREDICTION_MODE mode_selected = DC_PRED;
1307   int64_t best_rd = INT64_MAX, this_rd;
1308   int this_rate_tokenonly, this_rate, s;
1309   int64_t this_distortion, this_sse;
1310
1311   for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
1312     if (!(cpi->sf.intra_uv_mode_mask[max_tx_size] & (1 << mode)))
1313       continue;
1314
1315     x->e_mbd.mi_8x8[0]->mbmi.uv_mode = mode;
1316
1317     super_block_uvrd(cpi, x, &this_rate_tokenonly,
1318                      &this_distortion, &s, &this_sse, bsize, best_rd);
1319     if (this_rate_tokenonly == INT_MAX)
1320       continue;
1321     this_rate = this_rate_tokenonly +
1322                 x->intra_uv_mode_cost[cpi->common.frame_type][mode];
1323     this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
1324
1325     if (this_rd < best_rd) {
1326       mode_selected   = mode;
1327       best_rd         = this_rd;
1328       *rate           = this_rate;
1329       *rate_tokenonly = this_rate_tokenonly;
1330       *distortion     = this_distortion;
1331       *skippable      = s;
1332       if (!x->select_txfm_size) {
1333         int i;
1334         struct macroblock_plane *const p = x->plane;
1335         struct macroblockd_plane *const pd = x->e_mbd.plane;
1336         for (i = 1; i < MAX_MB_PLANE; ++i) {
1337           p[i].coeff    = ctx->coeff_pbuf[i][2];
1338           p[i].qcoeff   = ctx->qcoeff_pbuf[i][2];
1339           pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][2];
1340           p[i].eobs    = ctx->eobs_pbuf[i][2];
1341
1342           ctx->coeff_pbuf[i][2]   = ctx->coeff_pbuf[i][0];
1343           ctx->qcoeff_pbuf[i][2]  = ctx->qcoeff_pbuf[i][0];
1344           ctx->dqcoeff_pbuf[i][2] = ctx->dqcoeff_pbuf[i][0];
1345           ctx->eobs_pbuf[i][2]    = ctx->eobs_pbuf[i][0];
1346
1347           ctx->coeff_pbuf[i][0]   = p[i].coeff;
1348           ctx->qcoeff_pbuf[i][0]  = p[i].qcoeff;
1349           ctx->dqcoeff_pbuf[i][0] = pd[i].dqcoeff;
1350           ctx->eobs_pbuf[i][0]    = p[i].eobs;
1351         }
1352       }
1353     }
1354   }
1355
1356   x->e_mbd.mi_8x8[0]->mbmi.uv_mode = mode_selected;
1357   return best_rd;
1358 }
1359
1360 static int64_t rd_sbuv_dcpred(VP9_COMP *cpi, MACROBLOCK *x,
1361                               int *rate, int *rate_tokenonly,
1362                               int64_t *distortion, int *skippable,
1363                               BLOCK_SIZE bsize) {
1364   int64_t this_rd;
1365   int64_t this_sse;
1366
1367   x->e_mbd.mi_8x8[0]->mbmi.uv_mode = DC_PRED;
1368   super_block_uvrd(cpi, x, rate_tokenonly, distortion,
1369                    skippable, &this_sse, bsize, INT64_MAX);
1370   *rate = *rate_tokenonly +
1371           x->intra_uv_mode_cost[cpi->common.frame_type][DC_PRED];
1372   this_rd = RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
1373
1374   return this_rd;
1375 }
1376
1377 static void choose_intra_uv_mode(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
1378                                  BLOCK_SIZE bsize, TX_SIZE max_tx_size,
1379                                  int *rate_uv, int *rate_uv_tokenonly,
1380                                  int64_t *dist_uv, int *skip_uv,
1381                                  MB_PREDICTION_MODE *mode_uv) {
1382   MACROBLOCK *const x = &cpi->mb;
1383
1384   // Use an estimated rd for uv_intra based on DC_PRED if the
1385   // appropriate speed flag is set.
1386   if (cpi->sf.use_uv_intra_rd_estimate) {
1387     rd_sbuv_dcpred(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv, skip_uv,
1388                    bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize);
1389   // Else do a proper rd search for each possible transform size that may
1390   // be considered in the main rd loop.
1391   } else {
1392     rd_pick_intra_sbuv_mode(cpi, x, ctx,
1393                             rate_uv, rate_uv_tokenonly, dist_uv, skip_uv,
1394                             bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize, max_tx_size);
1395   }
1396   *mode_uv = x->e_mbd.mi_8x8[0]->mbmi.uv_mode;
1397 }
1398
1399 static int cost_mv_ref(VP9_COMP *cpi, MB_PREDICTION_MODE mode,
1400                        int mode_context) {
1401   MACROBLOCK *const x = &cpi->mb;
1402   MACROBLOCKD *const xd = &x->e_mbd;
1403   const int segment_id = xd->mi_8x8[0]->mbmi.segment_id;
1404
1405   // Don't account for mode here if segment skip is enabled.
1406   if (!vp9_segfeature_active(&cpi->common.seg, segment_id, SEG_LVL_SKIP)) {
1407     assert(is_inter_mode(mode));
1408     return x->inter_mode_cost[mode_context][INTER_OFFSET(mode)];
1409   } else {
1410     return 0;
1411   }
1412 }
1413
1414 void vp9_set_mbmode_and_mvs(MACROBLOCKD *xd, MB_PREDICTION_MODE mode,
1415                             const MV *mv) {
1416   xd->mi_8x8[0]->mbmi.mode = mode;
1417   xd->mi_8x8[0]->mbmi.mv[0].as_mv = *mv;
1418 }
1419
// Forward declaration of a file-local function; its definition is not part
// of this section of the file.
// NOTE(review): judging by the name and parameters this performs a joint
// (compound-reference) motion search and reports the motion vector rate via
// *rate_mv -- confirm against the definition.
static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
                                BLOCK_SIZE bsize,
                                int_mv *frame_mv,
                                int mi_row, int mi_col,
                                int_mv single_newmv[MAX_REF_FRAMES],
                                int *rate_mv);
1426
1427 static int labels2mode(MACROBLOCK *x, int i,
1428                        MB_PREDICTION_MODE this_mode,
1429                        int_mv *this_mv, int_mv *this_second_mv,
1430                        int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
1431                        int_mv seg_mvs[MAX_REF_FRAMES],
1432                        int_mv *best_ref_mv,
1433                        int_mv *second_best_ref_mv,
1434                        int *mvjcost, int *mvcost[2], VP9_COMP *cpi) {
1435   MACROBLOCKD *const xd = &x->e_mbd;
1436   MODE_INFO *const mic = xd->mi_8x8[0];
1437   MB_MODE_INFO *mbmi = &mic->mbmi;
1438   int cost = 0, thismvcost = 0;
1439   int idx, idy;
1440   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type];
1441   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type];
1442   const int has_second_rf = has_second_ref(mbmi);
1443
1444   /* We have to be careful retrieving previously-encoded motion vectors.
1445    Ones from this macroblock have to be pulled from the BLOCKD array
1446    as they have not yet made it to the bmi array in our MB_MODE_INFO. */
1447   MB_PREDICTION_MODE m;
1448
1449   // the only time we should do costing for new motion vector or mode
1450   // is when we are on a new label  (jbb May 08, 2007)
1451   switch (m = this_mode) {
1452     case NEWMV:
1453       this_mv->as_int = seg_mvs[mbmi->ref_frame[0]].as_int;
1454       thismvcost  = vp9_mv_bit_cost(&this_mv->as_mv, &best_ref_mv->as_mv,
1455                                     mvjcost, mvcost, MV_COST_WEIGHT_SUB);
1456       if (has_second_rf) {
1457         this_second_mv->as_int = seg_mvs[mbmi->ref_frame[1]].as_int;
1458         thismvcost += vp9_mv_bit_cost(&this_second_mv->as_mv,
1459                                       &second_best_ref_mv->as_mv,
1460                                       mvjcost, mvcost, MV_COST_WEIGHT_SUB);
1461       }
1462       break;
1463     case NEARESTMV:
1464       this_mv->as_int = frame_mv[NEARESTMV][mbmi->ref_frame[0]].as_int;
1465       if (has_second_rf)
1466         this_second_mv->as_int =
1467             frame_mv[NEARESTMV][mbmi->ref_frame[1]].as_int;
1468       break;
1469     case NEARMV:
1470       this_mv->as_int = frame_mv[NEARMV][mbmi->ref_frame[0]].as_int;
1471       if (has_second_rf)
1472         this_second_mv->as_int =
1473             frame_mv[NEARMV][mbmi->ref_frame[1]].as_int;
1474       break;
1475     case ZEROMV:
1476       this_mv->as_int = 0;
1477       if (has_second_rf)
1478         this_second_mv->as_int = 0;
1479       break;
1480     default:
1481       break;
1482   }
1483
1484   cost = cost_mv_ref(cpi, this_mode,
1485                      mbmi->mode_context[mbmi->ref_frame[0]]);
1486
1487   mic->bmi[i].as_mv[0].as_int = this_mv->as_int;
1488   if (has_second_rf)
1489     mic->bmi[i].as_mv[1].as_int = this_second_mv->as_int;
1490
1491   mic->bmi[i].as_mode = m;
1492
1493   for (idy = 0; idy < num_4x4_blocks_high; ++idy)
1494     for (idx = 0; idx < num_4x4_blocks_wide; ++idx)
1495       vpx_memcpy(&mic->bmi[i + idy * 2 + idx],
1496                  &mic->bmi[i], sizeof(mic->bmi[i]));
1497
1498   cost += thismvcost;
1499   return cost;
1500 }
1501
1502 static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
1503                                        MACROBLOCK *x,
1504                                        int64_t best_yrd,
1505                                        int i,
1506                                        int *labelyrate,
1507                                        int64_t *distortion, int64_t *sse,
1508                                        ENTROPY_CONTEXT *ta,
1509                                        ENTROPY_CONTEXT *tl,
1510                                        int mi_row, int mi_col) {
1511   int k;
1512   MACROBLOCKD *xd = &x->e_mbd;
1513   struct macroblockd_plane *const pd = &xd->plane[0];
1514   struct macroblock_plane *const p = &x->plane[0];
1515   MODE_INFO *const mi = xd->mi_8x8[0];
1516   const BLOCK_SIZE plane_bsize = get_plane_block_size(mi->mbmi.sb_type, pd);
1517   const int width = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
1518   const int height = 4 * num_4x4_blocks_high_lookup[plane_bsize];
1519   int idx, idy;
1520
1521   const uint8_t *const src = &p->src.buf[raster_block_offset(BLOCK_8X8, i,
1522                                                              p->src.stride)];
1523   uint8_t *const dst = &pd->dst.buf[raster_block_offset(BLOCK_8X8, i,
1524                                                         pd->dst.stride)];
1525   int64_t thisdistortion = 0, thissse = 0;
1526   int thisrate = 0, ref;
1527   const scan_order *so = &vp9_default_scan_orders[TX_4X4];
1528   const int is_compound = has_second_ref(&mi->mbmi);
1529   for (ref = 0; ref < 1 + is_compound; ++ref) {
1530     const uint8_t *pre = &pd->pre[ref].buf[raster_block_offset(BLOCK_8X8, i,
1531                                                pd->pre[ref].stride)];
1532     vp9_build_inter_predictor(pre, pd->pre[ref].stride,
1533                               dst, pd->dst.stride,
1534                               &mi->bmi[i].as_mv[ref].as_mv,
1535                               &xd->block_refs[ref]->sf, width, height, ref,
1536                               xd->interp_kernel, MV_PRECISION_Q3,
1537                               mi_col * MI_SIZE + 4 * (i % 2),
1538                               mi_row * MI_SIZE + 4 * (i / 2));
1539   }
1540
1541   vp9_subtract_block(height, width,
1542                      raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8,
1543                      src, p->src.stride,
1544                      dst, pd->dst.stride);
1545
1546   k = i;
1547   for (idy = 0; idy < height / 4; ++idy) {
1548     for (idx = 0; idx < width / 4; ++idx) {
1549       int64_t ssz, rd, rd1, rd2;
1550       int16_t* coeff;
1551
1552       k += (idy * 2 + idx);
1553       coeff = BLOCK_OFFSET(p->coeff, k);
1554       x->fwd_txm4x4(raster_block_offset_int16(BLOCK_8X8, k, p->src_diff),
1555                     coeff, 8);
1556       vp9_regular_quantize_b_4x4(x, 0, k, so->scan, so->iscan);
1557       thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k),
1558                                         16, &ssz);
1559       thissse += ssz;
1560       thisrate += cost_coeffs(x, 0, k, ta + (k & 1), tl + (k >> 1), TX_4X4,
1561                               so->scan, so->neighbors);
1562       rd1 = RDCOST(x->rdmult, x->rddiv, thisrate, thisdistortion >> 2);
1563       rd2 = RDCOST(x->rdmult, x->rddiv, 0, thissse >> 2);
1564       rd = MIN(rd1, rd2);
1565       if (rd >= best_yrd)
1566         return INT64_MAX;
1567     }
1568   }
1569
1570   *distortion = thisdistortion >> 2;
1571   *labelyrate = thisrate;
1572   *sse = thissse >> 2;
1573
1574   return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion);
1575 }
1576
// Rate-distortion statistics recorded for one sub-block of an 8x8 block under
// one candidate inter mode (see BEST_SEG_INFO::rdstat).
typedef struct {
  int eobs;          // end-of-block value copied from the luma plane's eobs[]
  int brate;         // mode/MV rate from labels2mode(); byrate is added to it
                     // once the block has been encoded successfully
  int byrate;        // luma rate reported by encode_inter_mb_segment()
  int64_t bdist;     // distortion reported by encode_inter_mb_segment()
  int64_t bsse;      // SSE reported by encode_inter_mb_segment()
  int64_t brdcost;   // RD cost of this mode; INT64_MAX marks "not usable"
  int_mv mvs[2];     // motion vectors for the first / second reference
  ENTROPY_CONTEXT ta[2];  // above entropy context handed to the encode step
  ENTROPY_CONTEXT tl[2];  // left entropy context handed to the encode step
} SEG_RDSTAT;
1588
// Working state and result of the sub-8x8 inter mode search for one
// interpolation-filter pass (callers index an array of these by filter_idx).
typedef struct {
  // Reference MVs used for motion search and MV costing (first / second ref).
  int_mv *ref_mv, *second_ref_mv;
  // Starting predictor for the NEWMV motion search.
  int_mv mvp;

  // On entry: RD budget to beat. On exit: accumulated RD cost of the chosen
  // modes, or INT64_MAX if the search was abandoned.
  int64_t segment_rd;
  int r;               // accumulated rate of the chosen modes
  int64_t d;           // accumulated distortion of the chosen modes
  int64_t sse;         // accumulated SSE of the chosen modes
  int segment_yrate;   // accumulated luma rate of the chosen modes
  MB_PREDICTION_MODE modes[4];           // chosen mode per 4x4 position
  SEG_RDSTAT rdstat[4][INTER_MODES];     // stats per position and inter mode
  int mvthresh;        // threshold used to decide whether NEWMV is searched
} BEST_SEG_INFO;
1602
1603 static INLINE int mv_check_bounds(const MACROBLOCK *x, const MV *mv) {
1604   return (mv->row >> 3) < x->mv_row_min ||
1605          (mv->row >> 3) > x->mv_row_max ||
1606          (mv->col >> 3) < x->mv_col_min ||
1607          (mv->col >> 3) > x->mv_col_max;
1608 }
1609
1610 static INLINE void mi_buf_shift(MACROBLOCK *x, int i) {
1611   MB_MODE_INFO *const mbmi = &x->e_mbd.mi_8x8[0]->mbmi;
1612   struct macroblock_plane *const p = &x->plane[0];
1613   struct macroblockd_plane *const pd = &x->e_mbd.plane[0];
1614
1615   p->src.buf = &p->src.buf[raster_block_offset(BLOCK_8X8, i, p->src.stride)];
1616   assert(((intptr_t)pd->pre[0].buf & 0x7) == 0);
1617   pd->pre[0].buf = &pd->pre[0].buf[raster_block_offset(BLOCK_8X8, i,
1618                                                        pd->pre[0].stride)];
1619   if (has_second_ref(mbmi))
1620     pd->pre[1].buf = &pd->pre[1].buf[raster_block_offset(BLOCK_8X8, i,
1621                                                          pd->pre[1].stride)];
1622 }
1623
1624 static INLINE void mi_buf_restore(MACROBLOCK *x, struct buf_2d orig_src,
1625                                   struct buf_2d orig_pre[2]) {
1626   MB_MODE_INFO *mbmi = &x->e_mbd.mi_8x8[0]->mbmi;
1627   x->plane[0].src = orig_src;
1628   x->e_mbd.plane[0].pre[0] = orig_pre[0];
1629   if (has_second_ref(mbmi))
1630     x->e_mbd.plane[0].pre[1] = orig_pre[1];
1631 }
1632
1633 static INLINE int mv_has_subpel(const MV *mv) {
1634   return (mv->row & 0x0F) || (mv->col & 0x0F);
1635 }
1636
// Runs the rate-distortion search over the sub-blocks of the current 8x8
// block for the filter pass selected by |filter_idx|.
//
// For every sub-block position i (visited in raster order, stepping by the
// block's width/height in 4x4 units) each inter mode from NEARESTMV to NEWMV
// is evaluated and its statistics are stored in bsi->rdstat[i][mode]. The
// cheapest mode is then applied through labels2mode() and its costs are
// accumulated.
//
//   bsi_buf     array of BEST_SEG_INFO; entry |filter_idx| is the one being
//               filled. Entries 0 and 1 may be reused as cached results when
//               the candidate MV matches and needs no sub-pel interpolation.
//   seg_mvs     per-position, per-reference cache of NEWMV search results;
//               INVALID_MV means "not searched yet".
//
// On success bsi->r, d, sse, segment_yrate, segment_rd and modes[] are
// written. If no mode is usable for a sub-block, or the running cost exceeds
// the incoming bsi->segment_rd, the remaining rdstat entries are invalidated,
// bsi->segment_rd is set to INT64_MAX and the function returns early.
static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
                                    const TileInfo *const tile,
                                    BEST_SEG_INFO *bsi_buf, int filter_idx,
                                    int_mv seg_mvs[4][MAX_REF_FRAMES],
                                    int mi_row, int mi_col) {
  int k, br = 0, idx, idy;
  int64_t bd = 0, block_sse = 0;
  MB_PREDICTION_MODE this_mode;
  MACROBLOCKD *xd = &x->e_mbd;
  VP9_COMMON *cm = &cpi->common;
  MODE_INFO *mi = xd->mi_8x8[0];
  MB_MODE_INFO *const mbmi = &mi->mbmi;
  struct macroblock_plane *const p = &x->plane[0];
  struct macroblockd_plane *const pd = &xd->plane[0];
  const int label_count = 4;
  int64_t this_segment_rd = 0;
  int label_mv_thresh;
  int segmentyrate = 0;
  const BLOCK_SIZE bsize = mbmi->sb_type;
  const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
  const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
  vp9_variance_fn_ptr_t *v_fn_ptr = &cpi->fn_ptr[bsize];
  ENTROPY_CONTEXT t_above[2], t_left[2];
  BEST_SEG_INFO *bsi = bsi_buf + filter_idx;
  int mode_idx;
  int subpelmv = 1, have_ref = 0;
  const int has_second_rf = has_second_ref(mbmi);

  // Work on local copies of the entropy contexts; they are advanced after
  // each sub-block with the contexts of the mode that won.
  vpx_memcpy(t_above, pd->above_context, sizeof(t_above));
  vpx_memcpy(t_left, pd->left_context, sizeof(t_left));

  // 64 makes this threshold really big effectively
  // making it so that we very rarely check mvs on
  // segments.   setting this to 1 would make mv thresh
  // roughly equal to what it is for macroblocks
  // (The multiplier actually used below is 1.)
  label_mv_thresh = 1 * bsi->mvthresh / label_count;

  // Segmentation method overheads
  for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
    for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
      // TODO(jingning,rbultje): rewrite the rate-distortion optimization
      // loop for 4x4/4x8/8x4 block coding. to be replaced with new rd loop
      int_mv mode_mv[MB_MODE_COUNT], second_mode_mv[MB_MODE_COUNT];
      int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
      MB_PREDICTION_MODE mode_selected = ZEROMV;
      int64_t best_rd = INT64_MAX;
      const int i = idy * 2 + idx;  // raster index of this sub-block (0..3)
      int ref;

      // Collect the ZEROMV / NEARESTMV / NEARMV candidates for each reference
      // frame in use.
      for (ref = 0; ref < 1 + has_second_rf; ++ref) {
        const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
        frame_mv[ZEROMV][frame].as_int = 0;
        vp9_append_sub8x8_mvs_for_idx(cm, xd, tile, i, ref, mi_row, mi_col,
                                      &frame_mv[NEARESTMV][frame],
                                      &frame_mv[NEARMV][frame]);
      }

      // search for the best motion vector on this segment
      for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
        const struct buf_2d orig_src = x->plane[0].src;
        struct buf_2d orig_pre[2];

        mode_idx = INTER_OFFSET(this_mode);
        // Mark the mode as unusable until it has been evaluated.
        bsi->rdstat[i][mode_idx].brdcost = INT64_MAX;

        // if we're near/nearest and mv == 0,0, compare to zeromv
        // Several modes can describe the same all-zero vector; only the one
        // that is cheapest to signal is kept.
        if ((this_mode == NEARMV || this_mode == NEARESTMV ||
             this_mode == ZEROMV) &&
            frame_mv[this_mode][mbmi->ref_frame[0]].as_int == 0 &&
            (!has_second_rf ||
             frame_mv[this_mode][mbmi->ref_frame[1]].as_int == 0)) {
          int rfc = mbmi->mode_context[mbmi->ref_frame[0]];
          int c1 = cost_mv_ref(cpi, NEARMV, rfc);
          int c2 = cost_mv_ref(cpi, NEARESTMV, rfc);
          int c3 = cost_mv_ref(cpi, ZEROMV, rfc);

          if (this_mode == NEARMV) {
            if (c1 > c3)
              continue;
          } else if (this_mode == NEARESTMV) {
            if (c2 > c3)
              continue;
          } else {
            assert(this_mode == ZEROMV);
            if (!has_second_rf) {
              if ((c3 >= c2 &&
                   frame_mv[NEARESTMV][mbmi->ref_frame[0]].as_int == 0) ||
                  (c3 >= c1 &&
                   frame_mv[NEARMV][mbmi->ref_frame[0]].as_int == 0))
                continue;
            } else {
              if ((c3 >= c2 &&
                   frame_mv[NEARESTMV][mbmi->ref_frame[0]].as_int == 0 &&
                   frame_mv[NEARESTMV][mbmi->ref_frame[1]].as_int == 0) ||
                  (c3 >= c1 &&
                   frame_mv[NEARMV][mbmi->ref_frame[0]].as_int == 0 &&
                   frame_mv[NEARMV][mbmi->ref_frame[1]].as_int == 0))
                continue;
            }
          }
        }

        // Save the prediction buffers (restored after any buffer shift) and
        // start this mode from the current entropy contexts.
        vpx_memcpy(orig_pre, pd->pre, sizeof(orig_pre));
        vpx_memcpy(bsi->rdstat[i][mode_idx].ta, t_above,
                   sizeof(bsi->rdstat[i][mode_idx].ta));
        vpx_memcpy(bsi->rdstat[i][mode_idx].tl, t_left,
                   sizeof(bsi->rdstat[i][mode_idx].tl));

        // motion search for newmv (single predictor case only)
        // Skipped when a result for this position/reference is already cached
        // in seg_mvs.
        if (!has_second_rf && this_mode == NEWMV &&
            seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV) {
          int_mv *const new_mv = &mode_mv[NEWMV];
          int step_param = 0;
          int further_steps;
          int thissme, bestsme = INT_MAX;
          int sadpb = x->sadperbit4;
          MV mvp_full;
          int max_mv;

          /* Is the best so far sufficiently good that we can't justify doing
           * a new motion search. */
          if (best_rd < label_mv_thresh)
            break;

          if (cpi->oxcf.mode != MODE_SECONDPASS_BEST &&
              cpi->oxcf.mode != MODE_BESTQUALITY) {
            // use previous block's result as next block's MV predictor.
            if (i > 0) {
              bsi->mvp.as_int = mi->bmi[i - 1].as_mv[0].as_int;
              if (i == 2)
                bsi->mvp.as_int = mi->bmi[i - 2].as_mv[0].as_int;
            }
          }
          if (i == 0)
            max_mv = x->max_mv_context[mbmi->ref_frame[0]];
          else
            max_mv = MAX(abs(bsi->mvp.as_mv.row), abs(bsi->mvp.as_mv.col)) >> 3;

          if (cpi->sf.auto_mv_step_size && cm->show_frame) {
            // Take wtd average of the step_params based on the last frame's
            // max mv magnitude and the best ref mvs of the current block for
            // the given reference.
            step_param = (vp9_init_search_range(cpi, max_mv) +
                          cpi->mv_step_param) >> 1;
          } else {
            step_param = cpi->mv_step_param;
          }

          // Full-pel starting point for the search.
          mvp_full.row = bsi->mvp.as_mv.row >> 3;
          mvp_full.col = bsi->mvp.as_mv.col >> 3;

          if (cpi->sf.adaptive_motion_search && cm->show_frame) {
            mvp_full.row = x->pred_mv[mbmi->ref_frame[0]].as_mv.row >> 3;
            mvp_full.col = x->pred_mv[mbmi->ref_frame[0]].as_mv.col >> 3;
            step_param = MAX(step_param, 8);
          }

          further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
          // adjust src pointer for this block
          mi_buf_shift(x, i);

          vp9_set_mv_search_range(x, &bsi->ref_mv->as_mv);

          if (cpi->sf.search_method == HEX) {
            bestsme = vp9_hex_search(x, &mvp_full,
                                     step_param,
                                     sadpb, 1, v_fn_ptr, 1,
                                     &bsi->ref_mv->as_mv,
                                     &new_mv->as_mv);
          } else if (cpi->sf.search_method == SQUARE) {
            bestsme = vp9_square_search(x, &mvp_full,
                                        step_param,
                                        sadpb, 1, v_fn_ptr, 1,
                                        &bsi->ref_mv->as_mv,
                                        &new_mv->as_mv);
          } else if (cpi->sf.search_method == BIGDIA) {
            bestsme = vp9_bigdia_search(x, &mvp_full,
                                        step_param,
                                        sadpb, 1, v_fn_ptr, 1,
                                        &bsi->ref_mv->as_mv,
                                        &new_mv->as_mv);
          } else {
            bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
                                             sadpb, further_steps, 0, v_fn_ptr,
                                             &bsi->ref_mv->as_mv,
                                             &new_mv->as_mv);
          }

          // Should we do a full search (best quality only)
          if (cpi->oxcf.mode == MODE_BESTQUALITY ||
              cpi->oxcf.mode == MODE_SECONDPASS_BEST) {
            /* Check if mvp_full is within the range. */
            clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max,
                     x->mv_row_min, x->mv_row_max);

            thissme = cpi->full_search_sad(x, &mvp_full,
                                           sadpb, 16, v_fn_ptr,
                                           x->nmvjointcost, x->mvcost,
                                           &bsi->ref_mv->as_mv, i);

            if (thissme < bestsme) {
              bestsme = thissme;
              new_mv->as_int = mi->bmi[i].as_mv[0].as_int;
            } else {
              /* The full search result is actually worse so re-instate the
               * previous best vector */
              mi->bmi[i].as_mv[0].as_int = new_mv->as_int;
            }
          }

          // Refine to sub-pel precision only if a full-pel result was found.
          if (bestsme < INT_MAX) {
            int distortion;
            cpi->find_fractional_mv_step(x,
                                         &new_mv->as_mv,
                                         &bsi->ref_mv->as_mv,
                                         cm->allow_high_precision_mv,
                                         x->errorperbit, v_fn_ptr,
                                         cpi->sf.subpel_force_stop,
                                         cpi->sf.subpel_iters_per_step,
                                         x->nmvjointcost, x->mvcost,
                                         &distortion,
                                         &x->pred_sse[mbmi->ref_frame[0]]);

            // save motion search result for use in compound prediction
            seg_mvs[i][mbmi->ref_frame[0]].as_int = new_mv->as_int;
          }

          if (cpi->sf.adaptive_motion_search)
            x->pred_mv[mbmi->ref_frame[0]].as_int = new_mv->as_int;

          // restore src pointers
          mi_buf_restore(x, orig_src, orig_pre);
        }

        // Compound prediction needs a cached single-reference search result
        // for both references.
        if (has_second_rf) {
          if (seg_mvs[i][mbmi->ref_frame[1]].as_int == INVALID_MV ||
              seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV)
            continue;
        }

        if (has_second_rf && this_mode == NEWMV &&
            mbmi->interp_filter == EIGHTTAP) {
          // adjust src pointers
          mi_buf_shift(x, i);
          if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
            int rate_mv;
            joint_motion_search(cpi, x, bsize, frame_mv[this_mode],
                                mi_row, mi_col, seg_mvs[i],
                                &rate_mv);
            seg_mvs[i][mbmi->ref_frame[0]].as_int =
                frame_mv[this_mode][mbmi->ref_frame[0]].as_int;
            seg_mvs[i][mbmi->ref_frame[1]].as_int =
                frame_mv[this_mode][mbmi->ref_frame[1]].as_int;
          }
          // restore src pointers
          mi_buf_restore(x, orig_src, orig_pre);
        }

        // Apply the mode to the block info; the return value is stored as the
        // mode's rate and mode_mv/second_mode_mv receive the vectors used.
        bsi->rdstat[i][mode_idx].brate =
            labels2mode(x, i, this_mode, &mode_mv[this_mode],
                        &second_mode_mv[this_mode], frame_mv, seg_mvs[i],
                        bsi->ref_mv, bsi->second_ref_mv, x->nmvjointcost,
                        x->mvcost, cpi);


        // Record the vectors, replicating them to the other 4x4 positions
        // covered by a sub-block that is wider/taller than 4 pixels.
        bsi->rdstat[i][mode_idx].mvs[0].as_int = mode_mv[this_mode].as_int;
        if (num_4x4_blocks_wide > 1)
          bsi->rdstat[i + 1][mode_idx].mvs[0].as_int =
              mode_mv[this_mode].as_int;
        if (num_4x4_blocks_high > 1)
          bsi->rdstat[i + 2][mode_idx].mvs[0].as_int =
              mode_mv[this_mode].as_int;
        if (has_second_rf) {
          bsi->rdstat[i][mode_idx].mvs[1].as_int =
              second_mode_mv[this_mode].as_int;
          if (num_4x4_blocks_wide > 1)
            bsi->rdstat[i + 1][mode_idx].mvs[1].as_int =
                second_mode_mv[this_mode].as_int;
          if (num_4x4_blocks_high > 1)
            bsi->rdstat[i + 2][mode_idx].mvs[1].as_int =
                second_mode_mv[this_mode].as_int;
        }

        // Trap vectors that reach beyond the UMV borders
        if (mv_check_bounds(x, &mode_mv[this_mode].as_mv) ||
            (has_second_rf &&
             mv_check_bounds(x, &second_mode_mv[this_mode].as_mv)))
          continue;

        // For later filter passes, reuse the statistics of an earlier pass
        // when the vectors are identical and need no sub-pel interpolation.
        if (filter_idx > 0) {
          BEST_SEG_INFO *ref_bsi = bsi_buf;
          subpelmv = mv_has_subpel(&mode_mv[this_mode].as_mv);
          have_ref = mode_mv[this_mode].as_int ==
                         ref_bsi->rdstat[i][mode_idx].mvs[0].as_int;
          if (has_second_rf) {
            subpelmv |= mv_has_subpel(&second_mode_mv[this_mode].as_mv);
            have_ref &= second_mode_mv[this_mode].as_int ==
                            ref_bsi->rdstat[i][mode_idx].mvs[1].as_int;
          }

          // No match in pass 0: try the result of pass 1.
          if (filter_idx > 1 && !subpelmv && !have_ref) {
            ref_bsi = bsi_buf + 1;
            have_ref = mode_mv[this_mode].as_int ==
                       ref_bsi->rdstat[i][mode_idx].mvs[0].as_int;
            if (has_second_rf) {
              have_ref  &= second_mode_mv[this_mode].as_int ==
                           ref_bsi->rdstat[i][mode_idx].mvs[1].as_int;
            }
          }

          if (!subpelmv && have_ref &&
              ref_bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) {
            vpx_memcpy(&bsi->rdstat[i][mode_idx], &ref_bsi->rdstat[i][mode_idx],
                       sizeof(SEG_RDSTAT));
            if (num_4x4_blocks_wide > 1)
              bsi->rdstat[i + 1][mode_idx].eobs =
                  ref_bsi->rdstat[i + 1][mode_idx].eobs;
            if (num_4x4_blocks_high > 1)
              bsi->rdstat[i + 2][mode_idx].eobs =
                  ref_bsi->rdstat[i + 2][mode_idx].eobs;

            if (bsi->rdstat[i][mode_idx].brdcost < best_rd) {
              mode_selected = this_mode;
              best_rd = bsi->rdstat[i][mode_idx].brdcost;
            }
            continue;
          }
        }

        // Encode the sub-block with this mode. The budget passed in is what
        // is left of the incoming segment_rd after the sub-blocks already
        // chosen; INT64_MAX comes back when the budget is exceeded.
        bsi->rdstat[i][mode_idx].brdcost =
            encode_inter_mb_segment(cpi, x,
                                    bsi->segment_rd - this_segment_rd, i,
                                    &bsi->rdstat[i][mode_idx].byrate,
                                    &bsi->rdstat[i][mode_idx].bdist,
                                    &bsi->rdstat[i][mode_idx].bsse,
                                    bsi->rdstat[i][mode_idx].ta,
                                    bsi->rdstat[i][mode_idx].tl,
                                    mi_row, mi_col);
        if (bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) {
          // Add the cost of the mode/MV rate, then fold the luma rate into
          // the total rate of this mode.
          bsi->rdstat[i][mode_idx].brdcost += RDCOST(x->rdmult, x->rddiv,
                                            bsi->rdstat[i][mode_idx].brate, 0);
          bsi->rdstat[i][mode_idx].brate += bsi->rdstat[i][mode_idx].byrate;
          bsi->rdstat[i][mode_idx].eobs = p->eobs[i];
          if (num_4x4_blocks_wide > 1)
            bsi->rdstat[i + 1][mode_idx].eobs = p->eobs[i + 1];
          if (num_4x4_blocks_high > 1)
            bsi->rdstat[i + 2][mode_idx].eobs = p->eobs[i + 2];
        }

        if (bsi->rdstat[i][mode_idx].brdcost < best_rd) {
          mode_selected = this_mode;
          best_rd = bsi->rdstat[i][mode_idx].brdcost;
        }
      } /*for each 4x4 mode*/

      // No usable mode for this sub-block: invalidate everything after it and
      // give up on this partitioning.
      if (best_rd == INT64_MAX) {
        int iy, midx;
        for (iy = i + 1; iy < 4; ++iy)
          for (midx = 0; midx < INTER_MODES; ++midx)
            bsi->rdstat[iy][midx].brdcost = INT64_MAX;
        bsi->segment_rd = INT64_MAX;
        return;
      }

      // Carry the winning mode's entropy contexts over to the next sub-block.
      mode_idx = INTER_OFFSET(mode_selected);
      vpx_memcpy(t_above, bsi->rdstat[i][mode_idx].ta, sizeof(t_above));
      vpx_memcpy(t_left, bsi->rdstat[i][mode_idx].tl, sizeof(t_left));

      // Re-apply the winning mode, since later candidates may have
      // overwritten the block info.
      labels2mode(x, i, mode_selected, &mode_mv[mode_selected],
                  &second_mode_mv[mode_selected], frame_mv, seg_mvs[i],
                  bsi->ref_mv, bsi->second_ref_mv, x->nmvjointcost,
                  x->mvcost, cpi);

      br += bsi->rdstat[i][mode_idx].brate;
      bd += bsi->rdstat[i][mode_idx].bdist;
      block_sse += bsi->rdstat[i][mode_idx].bsse;
      segmentyrate += bsi->rdstat[i][mode_idx].byrate;
      this_segment_rd += bsi->rdstat[i][mode_idx].brdcost;

      // Running total already worse than the budget: abandon the search.
      if (this_segment_rd > bsi->segment_rd) {
        int iy, midx;
        for (iy = i + 1; iy < 4; ++iy)
          for (midx = 0; midx < INTER_MODES; ++midx)
            bsi->rdstat[iy][midx].brdcost = INT64_MAX;
        bsi->segment_rd = INT64_MAX;
        return;
      }
    }
  } /* for each label */

  bsi->r = br;
  bsi->d = bd;
  bsi->segment_yrate = segmentyrate;
  bsi->segment_rd = this_segment_rd;
  bsi->sse = block_sse;

  // update the coding decisions
  for (k = 0; k < 4; ++k)
    bsi->modes[k] = mi->bmi[k].as_mode;
}
2037
2038 static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
2039                                            const TileInfo *const tile,
2040                                            int_mv *best_ref_mv,
2041                                            int_mv *second_best_ref_mv,
2042                                            int64_t best_rd,
2043                                            int *returntotrate,
2044                                            int *returnyrate,
2045                                            int64_t *returndistortion,
2046                                            int *skippable, int64_t *psse,
2047                                            int mvthresh,
2048                                            int_mv seg_mvs[4][MAX_REF_FRAMES],
2049                                            BEST_SEG_INFO *bsi_buf,
2050                                            int filter_idx,
2051                                            int mi_row, int mi_col) {
2052   int i;
2053   BEST_SEG_INFO *bsi = bsi_buf + filter_idx;
2054   MACROBLOCKD *xd = &x->e_mbd;
2055   MODE_INFO *mi = xd->mi_8x8[0];
2056   MB_MODE_INFO *mbmi = &mi->mbmi;
2057   int mode_idx;
2058
2059   vp9_zero(*bsi);
2060
2061   bsi->segment_rd = best_rd;
2062   bsi->ref_mv = best_ref_mv;
2063   bsi->second_ref_mv = second_best_ref_mv;
2064   bsi->mvp.as_int = best_ref_mv->as_int;
2065   bsi->mvthresh = mvthresh;
2066
2067   for (i = 0; i < 4; i++)
2068     bsi->modes[i] = ZEROMV;
2069
2070   rd_check_segment_txsize(cpi, x, tile, bsi_buf, filter_idx, seg_mvs,
2071                           mi_row, mi_col);
2072
2073   if (bsi->segment_rd > best_rd)
2074     return INT64_MAX;
2075   /* set it to the best */
2076   for (i = 0; i < 4; i++) {
2077     mode_idx = INTER_OFFSET(bsi->modes[i]);
2078     mi->bmi[i].as_mv[0].as_int = bsi->rdstat[i][mode_idx].mvs[0].as_int;
2079     if (has_second_ref(mbmi))
2080       mi->bmi[i].as_mv[1].as_int = bsi->rdstat[i][mode_idx].mvs[1].as_int;
2081     x->plane[0].eobs[i] = bsi->rdstat[i][mode_idx].eobs;
2082     mi->bmi[i].as_mode = bsi->modes[i];
2083   }
2084
2085   /*
2086    * used to set mbmi->mv.as_int
2087    */
2088   *returntotrate = bsi->r;
2089   *returndistortion = bsi->d;
2090   *returnyrate = bsi->segment_yrate;
2091   *skippable = vp9_is_skippable_in_plane(x, BLOCK_8X8, 0);
2092   *psse = bsi->sse;
2093   mbmi->mode = bsi->modes[3];
2094
2095   return bsi->segment_rd;
2096 }
2097
2098 static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x,
2099                     uint8_t *ref_y_buffer, int ref_y_stride,
2100                     int ref_frame, BLOCK_SIZE block_size ) {
2101   MACROBLOCKD *xd = &x->e_mbd;
2102   MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
2103   int_mv this_mv;
2104   int i;
2105   int zero_seen = 0;
2106   int best_index = 0;
2107   int best_sad = INT_MAX;
2108   int this_sad = INT_MAX;
2109   int max_mv = 0;
2110
2111   uint8_t *src_y_ptr = x->plane[0].src.buf;
2112   uint8_t *ref_y_ptr;
2113   int row_offset, col_offset;
2114   int num_mv_refs = MAX_MV_REF_CANDIDATES +
2115                     (cpi->sf.adaptive_motion_search &&
2116                      cpi->common.show_frame &&
2117                      block_size < cpi->sf.max_partition_size);
2118
2119   int_mv pred_mv[3];
2120   pred_mv[0] = mbmi->ref_mvs[ref_frame][0];
2121   pred_mv[1] = mbmi->ref_mvs[ref_frame][1];
2122   pred_mv[2] = x->pred_mv[ref_frame];
2123
2124   // Get the sad for each candidate reference mv
2125   for (i = 0; i < num_mv_refs; i++) {
2126     this_mv.as_int = pred_mv[i].as_int;
2127
2128     max_mv = MAX(max_mv,
2129                  MAX(abs(this_mv.as_mv.row), abs(this_mv.as_mv.col)) >> 3);
2130     // only need to check zero mv once
2131     if (!this_mv.as_int && zero_seen) {
2132       x->mode_sad[ref_frame][i] = x->mode_sad[ref_frame][INTER_OFFSET(ZEROMV)];
2133       continue;
2134     }
2135     zero_seen = zero_seen || !this_mv.as_int;
2136
2137     row_offset = this_mv.as_mv.row >> 3;
2138     col_offset = this_mv.as_mv.col >> 3;
2139     ref_y_ptr = ref_y_buffer + (ref_y_stride * row_offset) + col_offset;
2140
2141     // Find sad for current vector.
2142     this_sad = cpi->fn_ptr[block_size].sdf(src_y_ptr, x->plane[0].src.stride,
2143                                            ref_y_ptr, ref_y_stride,
2144                                            0x7fffffff);
2145     x->mode_sad[ref_frame][i] = this_sad;
2146     if (this_mv.as_int == 0)
2147       x->mode_sad[ref_frame][INTER_OFFSET(ZEROMV)] = this_sad;
2148
2149     // Note if it is the best so far.
2150     if (this_sad < best_sad) {
2151       best_sad = this_sad;
2152       best_index = i;
2153     }
2154   }
2155
2156   if (!zero_seen)
2157     x->mode_sad[ref_frame][INTER_OFFSET(ZEROMV)] =
2158         cpi->fn_ptr[block_size].sdf(src_y_ptr, x->plane[0].src.stride,
2159                                     ref_y_buffer, ref_y_stride,
2160                                     0x7fffffff);
2161
2162   // Note the index of the mv that worked best in the reference list.
2163   x->mv_best_ref_index[ref_frame] = best_index;
2164   x->max_mv_context[ref_frame] = max_mv;
2165   x->pred_mv_sad[ref_frame] = best_sad;
2166 }
2167
2168 static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id,
2169                                      unsigned int *ref_costs_single,
2170                                      unsigned int *ref_costs_comp,
2171                                      vp9_prob *comp_mode_p) {
2172   VP9_COMMON *const cm = &cpi->common;
2173   MACROBLOCKD *const xd = &cpi->mb.e_mbd;
2174   int seg_ref_active = vp9_segfeature_active(&cm->seg, segment_id,
2175                                              SEG_LVL_REF_FRAME);
2176   if (seg_ref_active) {
2177     vpx_memset(ref_costs_single, 0, MAX_REF_FRAMES * sizeof(*ref_costs_single));
2178     vpx_memset(ref_costs_comp,   0, MAX_REF_FRAMES * sizeof(*ref_costs_comp));
2179     *comp_mode_p = 128;
2180   } else {
2181     vp9_prob intra_inter_p = vp9_get_intra_inter_prob(cm, xd);
2182     vp9_prob comp_inter_p = 128;
2183
2184     if (cm->reference_mode == REFERENCE_MODE_SELECT) {
2185       comp_inter_p = vp9_get_reference_mode_prob(cm, xd);
2186       *comp_mode_p = comp_inter_p;
2187     } else {
2188       *comp_mode_p = 128;
2189     }
2190
2191     ref_costs_single[INTRA_FRAME] = vp9_cost_bit(intra_inter_p, 0);
2192
2193     if (cm->reference_mode != COMPOUND_REFERENCE) {
2194       vp9_prob ref_single_p1 = vp9_get_pred_prob_single_ref_p1(cm, xd);
2195       vp9_prob ref_single_p2 = vp9_get_pred_prob_single_ref_p2(cm, xd);
2196       unsigned int base_cost = vp9_cost_bit(intra_inter_p, 1);
2197
2198       if (cm->reference_mode == REFERENCE_MODE_SELECT)
2199         base_cost += vp9_cost_bit(comp_inter_p, 0);
2200
2201       ref_costs_single[LAST_FRAME] = ref_costs_single[GOLDEN_FRAME] =
2202           ref_costs_single[ALTREF_FRAME] = base_cost;
2203       ref_costs_single[LAST_FRAME]   += vp9_cost_bit(ref_single_p1, 0);
2204       ref_costs_single[GOLDEN_FRAME] += vp9_cost_bit(ref_single_p1, 1);
2205       ref_costs_single[ALTREF_FRAME] += vp9_cost_bit(ref_single_p1, 1);
2206       ref_costs_single[GOLDEN_FRAME] += vp9_cost_bit(ref_single_p2, 0);
2207       ref_costs_single[ALTREF_FRAME] += vp9_cost_bit(ref_single_p2, 1);
2208     } else {
2209       ref_costs_single[LAST_FRAME]   = 512;
2210       ref_costs_single[GOLDEN_FRAME] = 512;
2211       ref_costs_single[ALTREF_FRAME] = 512;
2212     }
2213     if (cm->reference_mode != SINGLE_REFERENCE) {
2214       vp9_prob ref_comp_p = vp9_get_pred_prob_comp_ref_p(cm, xd);
2215       unsigned int base_cost = vp9_cost_bit(intra_inter_p, 1);
2216
2217       if (cm->reference_mode == REFERENCE_MODE_SELECT)
2218         base_cost += vp9_cost_bit(comp_inter_p, 1);
2219
2220       ref_costs_comp[LAST_FRAME]   = base_cost + vp9_cost_bit(ref_comp_p, 0);
2221       ref_costs_comp[GOLDEN_FRAME] = base_cost + vp9_cost_bit(ref_comp_p, 1);
2222     } else {
2223       ref_costs_comp[LAST_FRAME]   = 512;
2224       ref_costs_comp[GOLDEN_FRAME] = 512;
2225     }
2226   }
2227 }
2228
2229 static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
2230                          int mode_index,
2231                          int_mv *ref_mv,
2232                          int_mv *second_ref_mv,
2233                          int64_t comp_pred_diff[REFERENCE_MODES],
2234                          int64_t tx_size_diff[TX_MODES],
2235                          int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS]) {
2236   MACROBLOCKD *const xd = &x->e_mbd;
2237
2238   // Take a snapshot of the coding context so it can be
2239   // restored if we decide to encode this way
2240   ctx->skip = x->skip;
2241   ctx->best_mode_index = mode_index;
2242   ctx->mic = *xd->mi_8x8[0];
2243
2244   ctx->best_ref_mv[0].as_int = ref_mv->as_int;
2245   ctx->best_ref_mv[1].as_int = second_ref_mv->as_int;
2246
2247   ctx->single_pred_diff = (int)comp_pred_diff[SINGLE_REFERENCE];
2248   ctx->comp_pred_diff   = (int)comp_pred_diff[COMPOUND_REFERENCE];
2249   ctx->hybrid_pred_diff = (int)comp_pred_diff[REFERENCE_MODE_SELECT];
2250
2251   vpx_memcpy(ctx->tx_rd_diff, tx_size_diff, sizeof(ctx->tx_rd_diff));
2252   vpx_memcpy(ctx->best_filter_diff, best_filter_diff,
2253              sizeof(*best_filter_diff) * SWITCHABLE_FILTER_CONTEXTS);
2254 }
2255
2256 static void setup_pred_block(const MACROBLOCKD *xd,
2257                              struct buf_2d dst[MAX_MB_PLANE],
2258                              const YV12_BUFFER_CONFIG *src,
2259                              int mi_row, int mi_col,
2260                              const struct scale_factors *scale,
2261                              const struct scale_factors *scale_uv) {
2262   int i;
2263
2264   dst[0].buf = src->y_buffer;
2265   dst[0].stride = src->y_stride;
2266   dst[1].buf = src->u_buffer;
2267   dst[2].buf = src->v_buffer;
2268   dst[1].stride = dst[2].stride = src->uv_stride;
2269 #if CONFIG_ALPHA
2270   dst[3].buf = src->alpha_buffer;
2271   dst[3].stride = src->alpha_stride;
2272 #endif
2273
2274   // TODO(jkoleszar): Make scale factors per-plane data
2275   for (i = 0; i < MAX_MB_PLANE; i++) {
2276     setup_pred_plane(dst + i, dst[i].buf, dst[i].stride, mi_row, mi_col,
2277                      i ? scale_uv : scale,
2278                      xd->plane[i].subsampling_x, xd->plane[i].subsampling_y);
2279   }
2280 }
2281
2282 void vp9_setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
2283                             const TileInfo *const tile,
2284                             MV_REFERENCE_FRAME ref_frame,
2285                             BLOCK_SIZE block_size,
2286                             int mi_row, int mi_col,
2287                             int_mv frame_nearest_mv[MAX_REF_FRAMES],
2288                             int_mv frame_near_mv[MAX_REF_FRAMES],
2289                             struct buf_2d yv12_mb[4][MAX_MB_PLANE]) {
2290   const VP9_COMMON *cm = &cpi->common;
2291   const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
2292   MACROBLOCKD *const xd = &x->e_mbd;
2293   MODE_INFO *const mi = xd->mi_8x8[0];
2294   int_mv *const candidates = mi->mbmi.ref_mvs[ref_frame];
2295   const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf;
2296
2297   // TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this
2298   // use the UV scaling factors.
2299   setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf);
2300
2301   // Gets an initial list of candidate vectors from neighbours and orders them
2302   vp9_find_mv_refs(cm, xd, tile, mi, xd->last_mi, ref_frame, candidates,
2303                    mi_row, mi_col);
2304
2305   // Candidate refinement carried out at encoder and decoder
2306   vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv, candidates,
2307                         &frame_nearest_mv[ref_frame],
2308                         &frame_near_mv[ref_frame]);
2309
2310   // Further refinement that is encode side only to test the top few candidates
2311   // in full and choose the best as the centre point for subsequent searches.
2312   // The current implementation doesn't support scaling.
2313   if (!vp9_is_scaled(sf) && block_size >= BLOCK_8X8)
2314     mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride,
2315             ref_frame, block_size);
2316 }
2317
2318 const YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const VP9_COMP *cpi,
2319                                                    int ref_frame) {
2320   const VP9_COMMON *const cm = &cpi->common;
2321   const int ref_idx = cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)];
2322   const int scaled_idx = cpi->scaled_ref_idx[ref_frame - 1];
2323   return (scaled_idx != ref_idx) ? &cm->frame_bufs[scaled_idx].buf : NULL;
2324 }
2325
2326 static INLINE int get_switchable_rate(const MACROBLOCK *x) {
2327   const MACROBLOCKD *const xd = &x->e_mbd;
2328   const MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
2329   const int ctx = vp9_get_pred_context_switchable_interp(xd);
2330   return SWITCHABLE_INTERP_RATE_FACTOR *
2331              x->switchable_interp_costs[ctx][mbmi->interp_filter];
2332 }
2333
2334 static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
2335                                  const TileInfo *const tile,
2336                                  BLOCK_SIZE bsize,
2337                                  int mi_row, int mi_col,
2338                                  int_mv *tmp_mv, int *rate_mv) {
2339   MACROBLOCKD *xd = &x->e_mbd;
2340   VP9_COMMON *cm = &cpi->common;
2341   MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
2342   struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};
2343   int bestsme = INT_MAX;
2344   int further_steps, step_param;
2345   int sadpb = x->sadperbit16;
2346   MV mvp_full;
2347   int ref = mbmi->ref_frame[0];
2348   int_mv ref_mv = mbmi->ref_mvs[ref][0];
2349
2350   int tmp_col_min = x->mv_col_min;
2351   int tmp_col_max = x->mv_col_max;
2352   int tmp_row_min = x->mv_row_min;
2353   int tmp_row_max = x->mv_row_max;
2354
2355   const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi,
2356                                                                         ref);
2357
2358   int_mv pred_mv[3];
2359   pred_mv[0] = mbmi->ref_mvs[ref][0];
2360   pred_mv[1] = mbmi->ref_mvs[ref][1];
2361   pred_mv[2] = x->pred_mv[ref];
2362
2363   if (scaled_ref_frame) {
2364     int i;
2365     // Swap out the reference frame for a version that's been scaled to
2366     // match the resolution of the current frame, allowing the existing
2367     // motion search code to be used without additional modifications.
2368     for (i = 0; i < MAX_MB_PLANE; i++)
2369       backup_yv12[i] = xd->plane[i].pre[0];
2370
2371     setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL);
2372   }
2373
2374   vp9_set_mv_search_range(x, &ref_mv.as_mv);
2375
2376   // Work out the size of the first step in the mv step search.
2377   // 0 here is maximum length first step. 1 is MAX >> 1 etc.
2378   if (cpi->sf.auto_mv_step_size && cpi->common.show_frame) {
2379     // Take wtd average of the step_params based on the last frame's
2380     // max mv magnitude and that based on the best ref mvs of the current
2381     // block for the given reference.
2382     step_param = (vp9_init_search_range(cpi, x->max_mv_context[ref]) +
2383                   cpi->mv_step_param) >> 1;
2384   } else {
2385     step_param = cpi->mv_step_param;
2386   }
2387
2388   if (cpi->sf.adaptive_motion_search && bsize < BLOCK_64X64 &&
2389       cpi->common.show_frame) {
2390     int boffset = 2 * (b_width_log2(BLOCK_64X64) - MIN(b_height_log2(bsize),
2391                                                        b_width_log2(bsize)));
2392     step_param = MAX(step_param, boffset);
2393   }
2394
2395   if (cpi->sf.adaptive_motion_search) {
2396     int bwl = b_width_log2_lookup[bsize];
2397     int bhl = b_height_log2_lookup[bsize];
2398     int i;
2399     int tlevel = x->pred_mv_sad[ref] >> (bwl + bhl + 4);
2400
2401     if (tlevel < 5)
2402       step_param += 2;
2403
2404     for (i = LAST_FRAME; i <= ALTREF_FRAME && cpi->common.show_frame; ++i) {
2405       if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) {
2406         x->pred_mv[ref].as_int = 0;
2407         tmp_mv->as_int = INVALID_MV;
2408
2409         if (scaled_ref_frame) {
2410           int i;
2411           for (i = 0; i < MAX_MB_PLANE; i++)
2412             xd->plane[i].pre[0] = backup_yv12[i];
2413         }
2414         return;
2415       }
2416     }
2417   }
2418
2419   mvp_full = pred_mv[x->mv_best_ref_index[ref]].as_mv;
2420
2421   mvp_full.col >>= 3;
2422   mvp_full.row >>= 3;
2423
2424   // Further step/diamond searches as necessary
2425   further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
2426
2427   if (cpi->sf.search_method == HEX) {
2428     bestsme = vp9_hex_search(x, &mvp_full, step_param, sadpb, 1,
2429                              &cpi->fn_ptr[bsize], 1,
2430                              &ref_mv.as_mv, &tmp_mv->as_mv);
2431   } else if (cpi->sf.search_method == SQUARE) {
2432     bestsme = vp9_square_search(x, &mvp_full, step_param, sadpb, 1,
2433                                 &cpi->fn_ptr[bsize], 1,
2434                                 &ref_mv.as_mv, &tmp_mv->as_mv);
2435   } else if (cpi->sf.search_method == BIGDIA) {
2436     bestsme = vp9_bigdia_search(x, &mvp_full, step_param, sadpb, 1,
2437                                 &cpi->fn_ptr[bsize], 1,
2438                                 &ref_mv.as_mv, &tmp_mv->as_mv);
2439   } else {
2440     bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
2441                                      sadpb, further_steps, 1,
2442                                      &cpi->fn_ptr[bsize],
2443                                      &ref_mv.as_mv, &tmp_mv->as_mv);
2444   }
2445
2446   x->mv_col_min = tmp_col_min;
2447   x->mv_col_max = tmp_col_max;
2448   x->mv_row_min = tmp_row_min;
2449   x->mv_row_max = tmp_row_max;
2450
2451   if (bestsme < INT_MAX) {
2452     int dis;  /* TODO: use dis in distortion calculation later. */
2453     cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv.as_mv,
2454                                  cm->allow_high_precision_mv,
2455                                  x->errorperbit,
2456                                  &cpi->fn_ptr[bsize],
2457                                  cpi->sf.subpel_force_stop,
2458                                  cpi->sf.subpel_iters_per_step,
2459                                  x->nmvjointcost, x->mvcost,
2460                                  &dis, &x->pred_sse[ref]);
2461   }
2462   *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv.as_mv,
2463                              x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
2464
2465   if (cpi->sf.adaptive_motion_search && cpi->common.show_frame)
2466     x->pred_mv[ref].as_int = tmp_mv->as_int;
2467
2468   if (scaled_ref_frame) {
2469     int i;
2470     for (i = 0; i < MAX_MB_PLANE; i++)
2471       xd->plane[i].pre[0] = backup_yv12[i];
2472   }
2473 }
2474
2475 static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
2476                                 BLOCK_SIZE bsize,
2477                                 int_mv *frame_mv,
2478                                 int mi_row, int mi_col,
2479                                 int_mv single_newmv[MAX_REF_FRAMES],
2480                                 int *rate_mv) {
2481   const int pw = 4 * num_4x4_blocks_wide_lookup[bsize];
2482   const int ph = 4 * num_4x4_blocks_high_lookup[bsize];
2483   MACROBLOCKD *xd = &x->e_mbd;
2484   MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
2485   const int refs[2] = { mbmi->ref_frame[0],
2486                         mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1] };
2487   int_mv ref_mv[2];
2488   int ite, ref;
2489   // Prediction buffer from second frame.
2490   uint8_t *second_pred = vpx_memalign(16, pw * ph * sizeof(uint8_t));
2491
2492   // Do joint motion search in compound mode to get more accurate mv.
2493   struct buf_2d backup_yv12[2][MAX_MB_PLANE];
2494   struct buf_2d scaled_first_yv12 = xd->plane[0].pre[0];
2495   int last_besterr[2] = {INT_MAX, INT_MAX};
2496   const YV12_BUFFER_CONFIG *const scaled_ref_frame[2] = {
2497     vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[0]),
2498     vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[1])
2499   };
2500
2501   for (ref = 0; ref < 2; ++ref) {
2502     ref_mv[ref] = mbmi->ref_mvs[refs[ref]][0];
2503
2504     if (scaled_ref_frame[ref]) {
2505       int i;
2506       // Swap out the reference frame for a version that's been scaled to
2507       // match the resolution of the current frame, allowing the existing
2508       // motion search code to be used without additional modifications.
2509       for (i = 0; i < MAX_MB_PLANE; i++)
2510         backup_yv12[ref][i] = xd->plane[i].pre[ref];
2511       setup_pre_planes(xd, ref, scaled_ref_frame[ref], mi_row, mi_col, NULL);
2512     }
2513
2514     frame_mv[refs[ref]].as_int = single_newmv[refs[ref]].as_int;
2515   }
2516
2517   // Allow joint search multiple times iteratively for each ref frame
2518   // and break out the search loop if it couldn't find better mv.
2519   for (ite = 0; ite < 4; ite++) {
2520     struct buf_2d ref_yv12[2];
2521     int bestsme = INT_MAX;
2522     int sadpb = x->sadperbit16;
2523     int_mv tmp_mv;
2524     int search_range = 3;
2525
2526     int tmp_col_min = x->mv_col_min;
2527     int tmp_col_max = x->mv_col_max;
2528     int tmp_row_min = x->mv_row_min;
2529     int tmp_row_max = x->mv_row_max;
2530     int id = ite % 2;
2531
2532     // Initialized here because of compiler problem in Visual Studio.
2533     ref_yv12[0] = xd->plane[0].pre[0];
2534     ref_yv12[1] = xd->plane[0].pre[1];
2535
2536     // Get pred block from second frame.
2537     vp9_build_inter_predictor(ref_yv12[!id].buf,
2538                               ref_yv12[!id].stride,
2539                               second_pred, pw,
2540                               &frame_mv[refs[!id]].as_mv,
2541                               &xd->block_refs[!id]->sf,
2542                               pw, ph, 0,
2543                               xd->interp_kernel, MV_PRECISION_Q3,
2544                               mi_col * MI_SIZE, mi_row * MI_SIZE);
2545
2546     // Compound motion search on first ref frame.
2547     if (id)
2548       xd->plane[0].pre[0] = ref_yv12[id];
2549     vp9_set_mv_search_range(x, &ref_mv[id].as_mv);
2550
2551     // Use mv result from single mode as mvp.
2552     tmp_mv.as_int = frame_mv[refs[id]].as_int;
2553
2554     tmp_mv.as_mv.col >>= 3;
2555     tmp_mv.as_mv.row >>= 3;
2556
2557     // Small-range full-pixel motion search
2558     bestsme = vp9_refining_search_8p_c(x, &tmp_mv.as_mv, sadpb,
2559                                        search_range,
2560                                        &cpi->fn_ptr[bsize],
2561                                        x->nmvjointcost, x->mvcost,
2562                                        &ref_mv[id].as_mv, second_pred,
2563                                        pw, ph);
2564
2565     x->mv_col_min = tmp_col_min;
2566     x->mv_col_max = tmp_col_max;
2567     x->mv_row_min = tmp_row_min;
2568     x->mv_row_max = tmp_row_max;
2569
2570     if (bestsme < INT_MAX) {
2571       int dis; /* TODO: use dis in distortion calculation later. */
2572       unsigned int sse;
2573       bestsme = cpi->find_fractional_mv_step_comp(
2574           x, &tmp_mv.as_mv,
2575           &ref_mv[id].as_mv,
2576           cpi->common.allow_high_precision_mv,
2577           x->errorperbit,
2578           &cpi->fn_ptr[bsize],
2579           0, cpi->sf.subpel_iters_per_step,
2580           x->nmvjointcost, x->mvcost,
2581           &dis, &sse, second_pred,
2582           pw, ph);
2583     }
2584
2585     if (id)
2586       xd->plane[0].pre[0] = scaled_first_yv12;
2587
2588     if (bestsme < last_besterr[id]) {
2589       frame_mv[refs[id]].as_int = tmp_mv.as_int;
2590       last_besterr[id] = bestsme;
2591     } else {
2592       break;
2593     }
2594   }
2595
2596   *rate_mv = 0;
2597
2598   for (ref = 0; ref < 2; ++ref) {
2599     if (scaled_ref_frame[ref]) {
2600       // restore the predictor
2601       int i;
2602       for (i = 0; i < MAX_MB_PLANE; i++)
2603         xd->plane[i].pre[ref] = backup_yv12[ref][i];
2604     }
2605
2606     *rate_mv += vp9_mv_bit_cost(&frame_mv[refs[ref]].as_mv,
2607                                 &mbmi->ref_mvs[refs[ref]][0].as_mv,
2608                                 x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
2609   }
2610
2611   vpx_free(second_pred);
2612 }
2613
2614 static INLINE void restore_dst_buf(MACROBLOCKD *xd,
2615                                    uint8_t *orig_dst[MAX_MB_PLANE],
2616                                    int orig_dst_stride[MAX_MB_PLANE]) {
2617   int i;
2618   for (i = 0; i < MAX_MB_PLANE; i++) {
2619     xd->plane[i].dst.buf = orig_dst[i];
2620     xd->plane[i].dst.stride = orig_dst_stride[i];
2621   }
2622 }
2623
2624 static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
2625                                  const TileInfo *const tile,
2626                                  BLOCK_SIZE bsize,
2627                                  int64_t txfm_cache[],
2628                                  int *rate2, int64_t *distortion,
2629                                  int *skippable,
2630                                  int *rate_y, int64_t *distortion_y,
2631                                  int *rate_uv, int64_t *distortion_uv,
2632                                  int *mode_excluded, int *disable_skip,
2633                                  INTERP_FILTER *best_filter,
2634                                  int_mv (*mode_mv)[MAX_REF_FRAMES],
2635                                  int mi_row, int mi_col,
2636                                  int_mv single_newmv[MAX_REF_FRAMES],
2637                                  int64_t *psse,
2638                                  const int64_t ref_best_rd) {
2639   VP9_COMMON *cm = &cpi->common;
2640   MACROBLOCKD *xd = &x->e_mbd;
2641   MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
2642   const int is_comp_pred = has_second_ref(mbmi);
2643   const int num_refs = is_comp_pred ? 2 : 1;
2644   const int this_mode = mbmi->mode;
2645   int_mv *frame_mv = mode_mv[this_mode];
2646   int i;
2647   int refs[2] = { mbmi->ref_frame[0],
2648     (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
2649   int_mv cur_mv[2];
2650   int64_t this_rd = 0;
2651   DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf, MAX_MB_PLANE * 64 * 64);
2652   int pred_exists = 0;
2653   int intpel_mv;
2654   int64_t rd, best_rd = INT64_MAX;
2655   int best_needs_copy = 0;
2656   uint8_t *orig_dst[MAX_MB_PLANE];
2657   int orig_dst_stride[MAX_MB_PLANE];
2658   int rs = 0;
2659
2660   if (is_comp_pred) {
2661     if (frame_mv[refs[0]].as_int == INVALID_MV ||
2662         frame_mv[refs[1]].as_int == INVALID_MV)
2663       return INT64_MAX;
2664   }
2665
2666   if (this_mode == NEWMV) {
2667     int rate_mv;
2668     if (is_comp_pred) {
2669       // Initialize mv using single prediction mode result.
2670       frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
2671       frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
2672
2673       if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
2674         joint_motion_search(cpi, x, bsize, frame_mv,
2675                             mi_row, mi_col, single_newmv, &rate_mv);
2676       } else {
2677         rate_mv  = vp9_mv_bit_cost(&frame_mv[refs[0]].as_mv,
2678                                    &mbmi->ref_mvs[refs[0]][0].as_mv,
2679                                    x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
2680         rate_mv += vp9_mv_bit_cost(&frame_mv[refs[1]].as_mv,
2681                                    &mbmi->ref_mvs[refs[1]][0].as_mv,
2682                                    x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
2683       }
2684       *rate2 += rate_mv;
2685     } else {
2686       int_mv tmp_mv;
2687       single_motion_search(cpi, x, tile, bsize, mi_row, mi_col,
2688                            &tmp_mv, &rate_mv);
2689       *rate2 += rate_mv;
2690       frame_mv[refs[0]].as_int =
2691           xd->mi_8x8[0]->bmi[0].as_mv[0].as_int = tmp_mv.as_int;
2692       single_newmv[refs[0]].as_int = tmp_mv.as_int;
2693     }
2694   }
2695
2696   // if we're near/nearest and mv == 0,0, compare to zeromv
2697   if ((this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) &&
2698       frame_mv[refs[0]].as_int == 0 &&
2699       !vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) &&
2700       (num_refs == 1 || frame_mv[refs[1]].as_int == 0)) {
2701     int rfc = mbmi->mode_context[mbmi->ref_frame[0]];
2702     int c1 = cost_mv_ref(cpi, NEARMV, rfc);
2703     int c2 = cost_mv_ref(cpi, NEARESTMV, rfc);
2704     int c3 = cost_mv_ref(cpi, ZEROMV, rfc);
2705
2706     if (this_mode == NEARMV) {
2707       if (c1 > c3)
2708         return INT64_MAX;
2709     } else if (this_mode == NEARESTMV) {
2710       if (c2 > c3)
2711         return INT64_MAX;
2712     } else {
2713       assert(this_mode == ZEROMV);
2714       if (num_refs == 1) {
2715         if ((c3 >= c2 &&
2716              mode_mv[NEARESTMV][mbmi->ref_frame[0]].as_int == 0) ||
2717             (c3 >= c1 &&
2718              mode_mv[NEARMV][mbmi->ref_frame[0]].as_int == 0))
2719           return INT64_MAX;
2720       } else {
2721         if ((c3 >= c2 &&
2722              mode_mv[NEARESTMV][mbmi->ref_frame[0]].as_int == 0 &&
2723              mode_mv[NEARESTMV][mbmi->ref_frame[1]].as_int == 0) ||
2724             (c3 >= c1 &&
2725              mode_mv[NEARMV][mbmi->ref_frame[0]].as_int == 0 &&
2726              mode_mv[NEARMV][mbmi->ref_frame[1]].as_int == 0))
2727           return INT64_MAX;
2728       }
2729     }
2730   }
2731
2732   for (i = 0; i < num_refs; ++i) {
2733     cur_mv[i] = frame_mv[refs[i]];
2734     // Clip "next_nearest" so that it does not extend to far out of image
2735     if (this_mode != NEWMV)
2736       clamp_mv2(&cur_mv[i].as_mv, xd);
2737
2738     if (mv_check_bounds(x, &cur_mv[i].as_mv))
2739       return INT64_MAX;
2740     mbmi->mv[i].as_int = cur_mv[i].as_int;
2741   }
2742
2743   // do first prediction into the destination buffer. Do the next
2744   // prediction into a temporary buffer. Then keep track of which one
2745   // of these currently holds the best predictor, and use the other
2746   // one for future predictions. In the end, copy from tmp_buf to
2747   // dst if necessary.
2748   for (i = 0; i < MAX_MB_PLANE; i++) {
2749     orig_dst[i] = xd->plane[i].dst.buf;
2750     orig_dst_stride[i] = xd->plane[i].dst.stride;
2751   }
2752
2753   /* We don't include the cost of the second reference here, because there
2754    * are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other
2755    * words if you present them in that order, the second one is always known
2756    * if the first is known */
2757   *rate2 += cost_mv_ref(cpi, this_mode,
2758                         mbmi->mode_context[mbmi->ref_frame[0]]);
2759
2760   if (!(*mode_excluded))
2761     *mode_excluded = is_comp_pred ? cm->reference_mode == SINGLE_REFERENCE
2762                                   : cm->reference_mode == COMPOUND_REFERENCE;
2763
2764   pred_exists = 0;
2765   // Are all MVs integer pel for Y and UV
2766   intpel_mv = !mv_has_subpel(&mbmi->mv[0].as_mv);
2767   if (is_comp_pred)
2768     intpel_mv &= !mv_has_subpel(&mbmi->mv[1].as_mv);
2769
2770   // Search for best switchable filter by checking the variance of
2771   // pred error irrespective of whether the filter will be used
2772   cpi->mask_filter_rd = 0;
2773   for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
2774     cpi->rd_filter_cache[i] = INT64_MAX;
2775
2776   if (cm->interp_filter != BILINEAR) {
2777     *best_filter = EIGHTTAP;
2778     if (x->source_variance <
2779         cpi->sf.disable_filter_search_var_thresh) {
2780       *best_filter = EIGHTTAP;
2781     } else {
2782       int newbest;
2783       int tmp_rate_sum = 0;
2784       int64_t tmp_dist_sum = 0;
2785
2786       for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
2787         int j;
2788         int64_t rs_rd;
2789         mbmi->interp_filter = i;
2790         xd->interp_kernel = vp9_get_interp_kernel(mbmi->interp_filter);
2791         rs = get_switchable_rate(x);
2792         rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
2793
2794         if (i > 0 && intpel_mv) {
2795           rd = RDCOST(x->rdmult, x->rddiv, tmp_rate_sum, tmp_dist_sum);
2796           cpi->rd_filter_cache[i] = rd;
2797           cpi->rd_filter_cache[SWITCHABLE_FILTERS] =
2798               MIN(cpi->rd_filter_cache[SWITCHABLE_FILTERS], rd + rs_rd);
2799           if (cm->interp_filter == SWITCHABLE)
2800             rd += rs_rd;
2801           cpi->mask_filter_rd = MAX(cpi->mask_filter_rd, rd);
2802         } else {
2803           int rate_sum = 0;
2804           int64_t dist_sum = 0;
2805           if ((cm->interp_filter == SWITCHABLE &&
2806                (!i || best_needs_copy)) ||
2807               (cm->interp_filter != SWITCHABLE &&
2808                (cm->interp_filter == mbmi->interp_filter ||
2809                 (i == 0 && intpel_mv)))) {
2810             restore_dst_buf(xd, orig_dst, orig_dst_stride);
2811           } else {
2812             for (j = 0; j < MAX_MB_PLANE; j++) {
2813               xd->plane[j].dst.buf = tmp_buf + j * 64 * 64;
2814               xd->plane[j].dst.stride = 64;
2815             }
2816           }
2817           vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
2818           model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum);
2819
2820           rd = RDCOST(x->rdmult, x->rddiv, rate_sum, dist_sum);
2821           cpi->rd_filter_cache[i] = rd;
2822           cpi->rd_filter_cache[SWITCHABLE_FILTERS] =
2823               MIN(cpi->rd_filter_cache[SWITCHABLE_FILTERS], rd + rs_rd);
2824           if (cm->interp_filter == SWITCHABLE)
2825             rd += rs_rd;
2826           cpi->mask_filter_rd = MAX(cpi->mask_filter_rd, rd);
2827
2828           if (i == 0 && intpel_mv) {
2829             tmp_rate_sum = rate_sum;
2830             tmp_dist_sum = dist_sum;
2831           }
2832         }
2833
2834         if (i == 0 && cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
2835           if (rd / 2 > ref_best_rd) {
2836             restore_dst_buf(xd, orig_dst, orig_dst_stride);
2837             return INT64_MAX;
2838           }
2839         }
2840         newbest = i == 0 || rd < best_rd;
2841
2842         if (newbest) {
2843           best_rd = rd;
2844           *best_filter = mbmi->interp_filter;
2845           if (cm->interp_filter == SWITCHABLE && i && !intpel_mv)
2846             best_needs_copy = !best_needs_copy;
2847         }
2848
2849         if ((cm->interp_filter == SWITCHABLE && newbest) ||
2850             (cm->interp_filter != SWITCHABLE &&
2851              cm->interp_filter == mbmi->interp_filter)) {
2852           pred_exists = 1;
2853         }
2854       }
2855       restore_dst_buf(xd, orig_dst, orig_dst_stride);
2856     }
2857   }
2858   // Set the appropriate filter
2859   mbmi->interp_filter = cm->interp_filter != SWITCHABLE ?
2860       cm->interp_filter : *best_filter;
2861   xd->interp_kernel = vp9_get_interp_kernel(mbmi->interp_filter);
2862   rs = cm->interp_filter == SWITCHABLE ? get_switchable_rate(x) : 0;
2863
2864   if (pred_exists) {
2865     if (best_needs_copy) {
2866       // again temporarily set the buffers to local memory to prevent a memcpy
2867       for (i = 0; i < MAX_MB_PLANE; i++) {
2868         xd->plane[i].dst.buf = tmp_buf + i * 64 * 64;
2869         xd->plane[i].dst.stride = 64;
2870       }
2871     }
2872   } else {
2873     // Handles the special case when a filter that is not in the
2874     // switchable list (ex. bilinear, 6-tap) is indicated at the frame level
2875     vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
2876   }
2877
2878   if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
2879     int tmp_rate;
2880     int64_t tmp_dist;
2881     model_rd_for_sb(cpi, bsize, x, xd, &tmp_rate, &tmp_dist);
2882     rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist);
2883     // if current pred_error modeled rd is substantially more than the best
2884     // so far, do not bother doing full rd
2885     if (rd / 2 > ref_best_rd) {
2886       restore_dst_buf(xd, orig_dst, orig_dst_stride);
2887       return INT64_MAX;
2888     }
2889   }
2890
2891   if (cm->interp_filter == SWITCHABLE)
2892     *rate2 += get_switchable_rate(x);
2893
2894   if (!is_comp_pred) {
2895     if (cpi->active_map_enabled && x->active_ptr[0] == 0)
2896       x->skip = 1;
2897     else if (cpi->allow_encode_breakout && x->encode_breakout) {
2898       const BLOCK_SIZE y_size = get_plane_block_size(bsize, &xd->plane[0]);
2899       const BLOCK_SIZE uv_size = get_plane_block_size(bsize, &xd->plane[1]);
2900       unsigned int var, sse;
2901       // Skipping threshold for ac.
2902       unsigned int thresh_ac;
2903       // Set a maximum for threshold to avoid big PSNR loss in low bitrate case.
2904       // Use extreme low threshold for static frames to limit skipping.
2905       const unsigned int max_thresh = (cpi->allow_encode_breakout ==
2906                                       ENCODE_BREAKOUT_LIMITED) ? 128 : 36000;
2907       // The encode_breakout input
2908       const unsigned int min_thresh = ((x->encode_breakout << 4) > max_thresh) ?
2909                                       max_thresh : (x->encode_breakout << 4);
2910
2911       // Calculate threshold according to dequant value.
2912       thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) / 9;
2913       thresh_ac = clamp(thresh_ac, min_thresh, max_thresh);
2914
2915       var = cpi->fn_ptr[y_size].vf(x->plane[0].src.buf, x->plane[0].src.stride,
2916                                    xd->plane[0].dst.buf,
2917                                    xd->plane[0].dst.stride, &sse);
2918
2919       // Adjust threshold according to partition size.
2920       thresh_ac >>= 8 - (b_width_log2_lookup[bsize] +
2921           b_height_log2_lookup[bsize]);
2922
2923       // Y skipping condition checking
2924       if (sse < thresh_ac || sse == 0) {
2925         // Skipping threshold for dc
2926         unsigned int thresh_dc;
2927
2928         thresh_dc = (xd->plane[0].dequant[0] * xd->plane[0].dequant[0] >> 6);
2929
2930         // dc skipping checking
2931         if ((sse - var) < thresh_dc || sse == var) {
2932           unsigned int sse_u, sse_v;
2933           unsigned int var_u, var_v;
2934
2935           var_u = cpi->fn_ptr[uv_size].vf(x->plane[1].src.buf,
2936                                           x->plane[1].src.stride,
2937                                           xd->plane[1].dst.buf,
2938                                           xd->plane[1].dst.stride, &sse_u);
2939
2940           // U skipping condition checking
2941           if ((sse_u * 4 < thresh_ac || sse_u == 0) &&
2942               (sse_u - var_u < thresh_dc || sse_u == var_u)) {
2943             var_v = cpi->fn_ptr[uv_size].vf(x->plane[2].src.buf,
2944                                             x->plane[2].src.stride,
2945                                             xd->plane[2].dst.buf,
2946                                             xd->plane[2].dst.stride, &sse_v);
2947
2948             // V skipping condition checking
2949             if ((sse_v * 4 < thresh_ac || sse_v == 0) &&
2950                 (sse_v - var_v < thresh_dc || sse_v == var_v)) {
2951               x->skip = 1;
2952
2953               // The cost of skip bit needs to be added.
2954               *rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
2955
2956               // Scaling factor for SSE from spatial domain to frequency domain
2957               // is 16. Adjust distortion accordingly.
2958               *distortion_uv = (sse_u + sse_v) << 4;
2959               *distortion = (sse << 4) + *distortion_uv;
2960
2961               *disable_skip = 1;
2962               this_rd = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion);
2963             }
2964           }
2965         }
2966       }
2967     }
2968   }
2969
2970   if (!x->skip) {
2971     int skippable_y, skippable_uv;
2972     int64_t sseuv = INT64_MAX;
2973     int64_t rdcosty = INT64_MAX;
2974
2975     // Y cost and distortion
2976     super_block_yrd(cpi, x, rate_y, distortion_y, &skippable_y, psse,
2977                     bsize, txfm_cache, ref_best_rd);
2978
2979     if (*rate_y == INT_MAX) {
2980       *rate2 = INT_MAX;
2981       *distortion = INT64_MAX;
2982       restore_dst_buf(xd, orig_dst, orig_dst_stride);
2983       return INT64_MAX;
2984     }
2985
2986     *rate2 += *rate_y;
2987     *distortion += *distortion_y;
2988
2989     rdcosty = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion);
2990     rdcosty = MIN(rdcosty, RDCOST(x->rdmult, x->rddiv, 0, *psse));
2991
2992     super_block_uvrd(cpi, x, rate_uv, distortion_uv, &skippable_uv, &sseuv,
2993                      bsize, ref_best_rd - rdcosty);
2994     if (*rate_uv == INT_MAX) {
2995       *rate2 = INT_MAX;
2996       *distortion = INT64_MAX;
2997       restore_dst_buf(xd, orig_dst, orig_dst_stride);
2998       return INT64_MAX;
2999     }
3000
3001     *psse += sseuv;
3002     *rate2 += *rate_uv;
3003     *distortion += *distortion_uv;
3004     *skippable = skippable_y && skippable_uv;
3005   }
3006
3007   restore_dst_buf(xd, orig_dst, orig_dst_stride);
3008   return this_rd;  // if 0, this will be re-calculated by caller
3009 }
3010
// Rotates the per-plane coefficient buffers between the macroblock and the
// two slots kept in ctx, for planes 0 .. max_plane - 1.
//
// For each plane the coeff / qcoeff / dqcoeff / eobs pointers are handled in
// three steps:
//   1. the plane pointers are set to slot [1] of ctx->*_pbuf,
//   2. slot [0] is copied into slot [1],
//   3. the plane pointers (the old slot [1]) are stored into slot [0].
// The net effect on ctx is that slots [0] and [1] are exchanged, and the
// plane ends up pointing at what is now slot [0].
//
// NOTE(review): the plane's previous pointers are overwritten in step 1
// without being saved; presumably they already equal slot [0] on entry --
// confirm against the callers.
3011 static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
3012                            int max_plane) {
3013   struct macroblock_plane *const p = x->plane;
3014   struct macroblockd_plane *const pd = x->e_mbd.plane;
3015   int i;
3016
3017   for (i = 0; i < max_plane; ++i) {
         // Step 1: the plane now uses the buffers held in slot [1].
3018     p[i].coeff    = ctx->coeff_pbuf[i][1];
3019     p[i].qcoeff  = ctx->qcoeff_pbuf[i][1];
3020     pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1];
3021     p[i].eobs    = ctx->eobs_pbuf[i][1];
3022
         // Step 2: slot [1] takes over the buffers from slot [0].
3023     ctx->coeff_pbuf[i][1]   = ctx->coeff_pbuf[i][0];
3024     ctx->qcoeff_pbuf[i][1]  = ctx->qcoeff_pbuf[i][0];
3025     ctx->dqcoeff_pbuf[i][1] = ctx->dqcoeff_pbuf[i][0];
3026     ctx->eobs_pbuf[i][1]    = ctx->eobs_pbuf[i][0];
3027
         // Step 3: slot [0] receives the buffers the plane picked up in step 1.
3028     ctx->coeff_pbuf[i][0]   = p[i].coeff;
3029     ctx->qcoeff_pbuf[i][0]  = p[i].qcoeff;
3030     ctx->dqcoeff_pbuf[i][0] = pd[i].dqcoeff;
3031     ctx->eobs_pbuf[i][0]    = p[i].eobs;
3032   }
3033 }
3034
// Picks the intra luma and chroma prediction modes for the current block and
// reports the combined rate and distortion.
//
//   returnrate  - receives the total rate. Set to INT_MAX when the luma
//                 search returns a cost >= best_rd; the function then returns
//                 immediately and *returndist is NOT written.
//   returndist  - receives dist_y + dist_uv on the normal path.
//   bsize       - block size; sizes below BLOCK_8X8 use the sub-8x8 luma
//                 search and run the chroma search at BLOCK_8X8.
//   ctx         - ctx->skip is cleared, ctx->tx_rd_diff is updated (see the
//                 note below) and ctx->mic receives a copy of *xd->mi_8x8[0].
//   best_rd     - cost the luma search result is compared against.
//
// Also clears x->skip_encode and marks the block as INTRA_FRAME.
3035 void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
3036                                int *returnrate, int64_t *returndist,
3037                                BLOCK_SIZE bsize,
3038                                PICK_MODE_CONTEXT *ctx, int64_t best_rd) {
3039   VP9_COMMON *const cm = &cpi->common;
3040   MACROBLOCKD *const xd = &x->e_mbd;
3041   int rate_y = 0, rate_uv = 0, rate_y_tokenonly = 0, rate_uv_tokenonly = 0;
3042   int y_skip = 0, uv_skip = 0;
3043   int64_t dist_y = 0, dist_uv = 0, tx_cache[TX_MODES] = { 0 };
3044   TX_SIZE max_uv_tx_size;
3045   x->skip_encode = 0;
3046   ctx->skip = 0;
3047   xd->mi_8x8[0]->mbmi.ref_frame[0] = INTRA_FRAME;
3048
3049   if (bsize >= BLOCK_8X8) {
         // Whole-block luma search; bail out if it cannot beat best_rd.
3050     if (rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,
3051                                &dist_y, &y_skip, bsize, tx_cache,
3052                                best_rd) >= best_rd) {
3053       *returnrate = INT_MAX;
3054       return;
3055     }
         // The chroma transform size is derived from the luma size just chosen.
3056     max_uv_tx_size = get_uv_tx_size_impl(xd->mi_8x8[0]->mbmi.tx_size, bsize);
3057     rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv, &rate_uv_tokenonly,
3058                             &dist_uv, &uv_skip, bsize, max_uv_tx_size);
3059   } else {
         // Sub-8x8 luma search. y_skip stays 0 on this path, so the
         // "both skippable" branch below is never taken for these sizes.
3060     y_skip = 0;
3061     if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate_y, &rate_y_tokenonly,
3062                                      &dist_y, best_rd) >= best_rd) {
3063       *returnrate = INT_MAX;
3064       return;
3065     }
3066     max_uv_tx_size = get_uv_tx_size_impl(xd->mi_8x8[0]->mbmi.tx_size, bsize);
         // Chroma is searched with BLOCK_8X8 rather than the sub-8x8 bsize.
3067     rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv, &rate_uv_tokenonly,
3068                             &dist_uv, &uv_skip, BLOCK_8X8, max_uv_tx_size);
3069   }
3070
3071   if (y_skip && uv_skip) {
         // Both planes are skippable: remove the token-only part of the rates
         // and add the cost of coding the skip flag as 1.
3072     *returnrate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly +
3073                   vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
3074     *returndist = dist_y + dist_uv;
3075     vp9_zero(ctx->tx_rd_diff);
3076   } else {
3077     int i;
         // Not skippable: keep the full rates and add the cost of coding the
         // skip flag as 0.
3078     *returnrate = rate_y + rate_uv + vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
3079     *returndist = dist_y + dist_uv;
         // Per-tx-mode cost differences relative to the frame's tx_mode.
         // NOTE(review): ctx->tx_rd_diff is left untouched on this branch when
         // tx_size_search_method != USE_FULL_RD -- confirm that is intended.
3080     if (cpi->sf.tx_size_search_method == USE_FULL_RD)
3081       for (i = 0; i < TX_MODES; i++) {
3082         if (tx_cache[i] < INT64_MAX && tx_cache[cm->tx_mode] < INT64_MAX)
3083           ctx->tx_rd_diff[i] = tx_cache[i] - tx_cache[cm->tx_mode];
3084         else
3085           ctx->tx_rd_diff[i] = 0;
3086       }
3087   }
3088
       // Save the chosen mode info in the pick-mode context.
3089   ctx->mic = *xd->mi_8x8[0];
3090 }
3091
// Rate-distortion mode search for one block: walks vp9_mode_order[] from
// index 0 to MAX_MODES - 1, skips the candidates rejected by the pruning
// tests, costs the rest (intra through super_block_yrd() and
// choose_intra_uv_mode(), inter through handle_inter_mode()) and remembers
// the cheapest candidate.
//
// Outputs:
//   *returnrate / *returndistortion
//       Rate and distortion of the best mode. *returnrate starts as INT_MAX
//       and both are overwritten each time a new best mode is recorded.
//   xd->mi_8x8[0]->mbmi
//       Used as scratch during the search and finally set to the best mode.
//   ctx
//       zcoeff_blk and modes_with_high_error are written here; the remaining
//       fields are filled by store_coding_context().
//
// Returns the best RD cost, or INT64_MAX when the search ends with
// best_rd >= best_rd_so_far. On that early return the final
// "*mbmi = best_mbmode" assignment and store_coding_context() are skipped.
//
// Also updates cpi->ref_frame_mask, cpi->mode_skip_mask and
// cpi->rd_thresh_freq_fact, plus x->skip, x->skip_encode, x->pred_sse and
// x->pred_mv_sad.
3092 int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
3093                                   const TileInfo *const tile,
3094                                   int mi_row, int mi_col,
3095                                   int *returnrate,
3096                                   int64_t *returndistortion,
3097                                   BLOCK_SIZE bsize,
3098                                   PICK_MODE_CONTEXT *ctx,
3099                                   int64_t best_rd_so_far) {
3100   VP9_COMMON *cm = &cpi->common;
3101   MACROBLOCKD *xd = &x->e_mbd;
3102   MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
3103   const struct segmentation *seg = &cm->seg;
3104   const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]);
3105   MB_PREDICTION_MODE this_mode;
3106   MV_REFERENCE_FRAME ref_frame, second_ref_frame;
3107   unsigned char segment_id = mbmi->segment_id;
3108   int comp_pred, i;
3109   int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
3110   struct buf_2d yv12_mb[4][MAX_MB_PLANE];
3111   int_mv single_newmv[MAX_REF_FRAMES] = { { 0 } };
       // Indexed by reference frame; entry 0 (the intra slot) has no flag.
3112   static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
3113                                     VP9_ALT_FLAG };
3114   int64_t best_rd = best_rd_so_far;
3115   int64_t best_tx_rd[TX_MODES];
3116   int64_t best_tx_diff[TX_MODES];
3117   int64_t best_pred_diff[REFERENCE_MODES];
3118   int64_t best_pred_rd[REFERENCE_MODES];
3119   int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
3120   int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
3121   MB_MODE_INFO best_mbmode = { 0 };
3122   int mode_index, best_mode_index = 0;
3123   unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
3124   vp9_prob comp_mode_p;
3125   int64_t best_intra_rd = INT64_MAX;
3126   int64_t best_inter_rd = INT64_MAX;
3127   MB_PREDICTION_MODE best_intra_mode = DC_PRED;
3128   MV_REFERENCE_FRAME best_inter_ref_frame = LAST_FRAME;
3129   INTERP_FILTER tmp_best_filter = SWITCHABLE;
       // Chroma intra results cached per uv transform size; rate_uv_intra[] ==
       // INT_MAX marks an entry that has not been computed yet.
3130   int rate_uv_intra[TX_SIZES], rate_uv_tokenonly[TX_SIZES];
3131   int64_t dist_uv[TX_SIZES];
3132   int skip_uv[TX_SIZES];
3133   MB_PREDICTION_MODE mode_uv[TX_SIZES];
       // NOTE(review): "{-1}" sets only element 0 to -1; every other element is
       // zero-initialized. The "== -1" (not recorded yet) test further down can
       // therefore only match index 0, and the other entries stay at 0 because
       // "distortion2 < 0" is not expected to hold. Confirm whether all entries
       // were meant to start at -1.
3134   int64_t mode_distortions[MB_MODE_COUNT] = {-1};
       // Extra rate charged to intra modes other than DC_PRED and TM_PRED.
3135   int intra_cost_penalty = 20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q);
       // Half of the block width / height, in 8x8 units (frame-edge test below).
3136   const int bws = num_8x8_blocks_wide_lookup[bsize] / 2;
3137   const int bhs = num_8x8_blocks_high_lookup[bsize] / 2;
3138   int best_skip2 = 0;
3139
3140   x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
3141
3142   // Everywhere the flag is set the error is much higher than its neighbors.
3143   ctx->modes_with_high_error = 0;
3144
3145   estimate_ref_frame_costs(cpi, segment_id, ref_costs_single, ref_costs_comp,
3146                            &comp_mode_p);
3147
       // Reset all "best so far" trackers to their "nothing found" sentinels.
3148   for (i = 0; i < REFERENCE_MODES; ++i)
3149     best_pred_rd[i] = INT64_MAX;
3150   for (i = 0; i < TX_MODES; i++)
3151     best_tx_rd[i] = INT64_MAX;
3152   for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
3153     best_filter_rd[i] = INT64_MAX;
3154   for (i = 0; i < TX_SIZES; i++)
3155     rate_uv_intra[i] = INT_MAX;
3156   for (i = 0; i < MAX_REF_FRAMES; ++i)
3157     x->pred_sse[i] = INT_MAX;
3158
3159   *returnrate = INT_MAX;
3160
       // Set up prediction buffers and candidate MVs for every reference frame
       // that is enabled in cpi->ref_frame_flags.
3161   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
3162     x->pred_mv_sad[ref_frame] = INT_MAX;
3163     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
3164       vp9_setup_buffer_inter(cpi, x, tile,
3165                              ref_frame, block_size, mi_row, mi_col,
3166                              frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb);
3167     }
3168     frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
3169     frame_mv[ZEROMV][ref_frame].as_int = 0;
3170   }
3171
       // Reference masking (only when sf.reference_masking is set): mask off a
       // reference frame when a quarter of its pred_mv_sad still exceeds the
       // pred_mv_sad of some reference frame.
3172   cpi->ref_frame_mask = 0;
3173   for (ref_frame = LAST_FRAME;
3174        ref_frame <= ALTREF_FRAME && cpi->sf.reference_masking; ++ref_frame) {
3175     int i;
3176     for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
3177       if ((x->pred_mv_sad[ref_frame] >> 2) > x->pred_mv_sad[i]) {
3178         cpi->ref_frame_mask |= (1 << ref_frame);
3179         break;
3180       }
3181     }
3182   }
3183
       // Main search loop: one iteration per entry of vp9_mode_order[].
3184   for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
3185     int mode_excluded = 0;
3186     int64_t this_rd = INT64_MAX;
3187     int disable_skip = 0;
3188     int compmode_cost = 0;
3189     int rate2 = 0, rate_y = 0, rate_uv = 0;
3190     int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0;
3191     int skippable = 0;
3192     int64_t tx_cache[TX_MODES];
3193     int i;
3194     int this_skip2 = 0;
         // NOTE(review): this int64_t sentinel is INT_MAX, not INT64_MAX as used
         // for the other 64-bit sentinels here -- confirm that is intended.
3195     int64_t total_sse = INT_MAX;
3196     int early_term = 0;
3197
3198     for (i = 0; i < TX_MODES; ++i)
3199       tx_cache[i] = INT64_MAX;
3200
3201     x->skip = 0;
3202     this_mode = vp9_mode_order[mode_index].mode;
3203     ref_frame = vp9_mode_order[mode_index].ref_frame[0];
3204     second_ref_frame = vp9_mode_order[mode_index].ref_frame[1];
3205
3206     // Look at the reference frame of the best mode so far and set the
3207     // skip mask to look at a subset of the remaining modes.
         // The mask is chosen once, at mode_index == mode_skip_start + 1, and is
         // then applied to every later mode_index.
3208     if (mode_index > cpi->sf.mode_skip_start) {
3209       if (mode_index == (cpi->sf.mode_skip_start + 1)) {
3210         switch (vp9_mode_order[best_mode_index].ref_frame[0]) {
3211           case INTRA_FRAME:
3212             cpi->mode_skip_mask = 0;
3213             break;
3214           case LAST_FRAME:
3215             cpi->mode_skip_mask = LAST_FRAME_MODE_MASK;
3216             break;
3217           case GOLDEN_FRAME:
3218             cpi->mode_skip_mask = GOLDEN_FRAME_MODE_MASK;
3219             break;
3220           case ALTREF_FRAME:
3221             cpi->mode_skip_mask = ALT_REF_MODE_MASK;
3222             break;
3223           case NONE:
3224           case MAX_REF_FRAMES:
3225             assert(0 && "Invalid Reference frame");
3226         }
3227       }
3228       if (cpi->mode_skip_mask & ((int64_t)1 << mode_index))
3229         continue;
3230     }
3231
3232     // Skip if the current reference frame has been masked off
         // (NEWMV candidates are exempt from the reference mask).
3233     if (cpi->ref_frame_mask & (1 << ref_frame) && this_mode != NEWMV)
3234       continue;
3235
3236     // Test best rd so far against threshold for trying this mode.
         // A threshold of INT_MAX disables the mode outright.
3237     if ((best_rd < ((int64_t)cpi->rd_threshes[segment_id][bsize][mode_index] *
3238                      cpi->rd_thresh_freq_fact[bsize][mode_index] >> 5)) ||
3239         cpi->rd_threshes[segment_id][bsize][mode_index] == INT_MAX)
3240       continue;
3241
3242     // Do not allow compound prediction if the segment level reference
3243     // frame feature is in use as in this case there can only be one reference.
3244     if ((second_ref_frame > INTRA_FRAME) &&
3245          vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
3246       continue;
3247
3248     mbmi->ref_frame[0] = ref_frame;
3249     mbmi->ref_frame[1] = second_ref_frame;
3250
         // Both references must be intra / NONE or enabled in ref_frame_flags.
3251     if (!(ref_frame == INTRA_FRAME
3252         || (cpi->ref_frame_flags & flag_list[ref_frame]))) {
3253       continue;
3254     }
3255     if (!(second_ref_frame == NONE
3256         || (cpi->ref_frame_flags & flag_list[second_ref_frame]))) {
3257       continue;
3258     }
3259
3260     comp_pred = second_ref_frame > INTRA_FRAME;
3261     if (comp_pred) {
           // Optional speed features: drop compound modes when the best mode so
           // far is intra, or when neither reference matches the best
           // single-reference inter frame.
3262       if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA)
3263         if (vp9_mode_order[best_mode_index].ref_frame[0] == INTRA_FRAME)
3264           continue;
3265       if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_REFMISMATCH)
3266         if (ref_frame != best_inter_ref_frame &&
3267             second_ref_frame != best_inter_ref_frame)
3268           continue;
3269     }
3270
3271     set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
3272     mbmi->uv_mode = DC_PRED;
3273
3274     // Evaluate all sub-pel filters irrespective of whether we can use
3275     // them for this frame.
3276     mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP
3277                                                           : cm->interp_filter;
3278     xd->interp_kernel = vp9_get_interp_kernel(mbmi->interp_filter);
3279
         // mode_excluded marks candidates whose single/compound type does not
         // match the frame's reference_mode; they are still costed, but cannot
         // become the best mode.
3280     if (comp_pred) {
           // NOTE(review): this looks redundant with the second_ref_frame
           // availability check a few lines above -- confirm.
3281       if (!(cpi->ref_frame_flags & flag_list[second_ref_frame]))
3282         continue;
3283
3284       mode_excluded = mode_excluded ? mode_excluded
3285                                     : cm->reference_mode == SINGLE_REFERENCE;
3286     } else {
3287       if (ref_frame != INTRA_FRAME && second_ref_frame != INTRA_FRAME)
3288         mode_excluded = mode_excluded ?
3289             mode_excluded : cm->reference_mode == COMPOUND_REFERENCE;
3290     }
3291
3292     // Select prediction reference frames.
3293     for (i = 0; i < MAX_MB_PLANE; i++) {
3294       xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
3295       if (comp_pred)
3296         xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
3297     }
3298
3299     // If the segment reference frame feature is enabled....
3300     // then do nothing if the current ref frame is not allowed..
3301     if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
3302         vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) !=
3303             (int)ref_frame) {
3304       continue;
3305     // If the segment skip feature is enabled....
3306     // then do nothing if the current mode is not allowed..
3307     } else if (vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) &&
3308                (this_mode != ZEROMV && ref_frame != INTRA_FRAME)) {
3309       continue;
3310     // Disable this drop out case if the ref frame
3311     // segment level feature is enabled for this segment. This is to
3312     // prevent the possibility that we end up unable to pick any mode.
3313     } else if (!vp9_segfeature_active(seg, segment_id,
3314                                       SEG_LVL_REF_FRAME)) {
3315       // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
3316       // unless ARNR filtering is enabled in which case we want
3317       // an unfiltered alternative. We allow near/nearest as well
3318       // because they may result in zero-zero MVs but be cheaper.
3319       if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
3320         if ((this_mode != ZEROMV &&
3321              !(this_mode == NEARMV &&
3322                frame_mv[NEARMV][ALTREF_FRAME].as_int == 0) &&
3323              !(this_mode == NEARESTMV &&
3324                frame_mv[NEARESTMV][ALTREF_FRAME].as_int == 0)) ||
3325             ref_frame != ALTREF_FRAME) {
3326           continue;
3327         }
3328       }
3329     }
3330     // TODO(JBB): This is to make up for the fact that we don't have sad
3331     // functions that work when the block size reads outside the umv.  We
3332     // should fix this either by making the motion search just work on
3333     // a representative block in the boundary ( first ) and then implement a
3334     // function that does sads when inside the border..
3335     if (((mi_row + bhs) > cm->mi_rows || (mi_col + bws) > cm->mi_cols) &&
3336         this_mode == NEWMV) {
3337       continue;
3338     }
3339
3340 #ifdef MODE_TEST_HIT_STATS
3341     // TEST/DEBUG CODE
3342     // Keep a record of the number of test hits at each size
3343     cpi->mode_test_hits[bsize]++;
3344 #endif
3345
3346
3347     if (ref_frame == INTRA_FRAME) {
3348       TX_SIZE uv_tx;
3349       // Disable intra modes other than DC_PRED for blocks with low variance
3350       // Threshold for intra skipping based on source variance
3351       // TODO(debargha): Specialize the threshold for super block sizes
3352       static const unsigned int skip_intra_var_thresh[BLOCK_SIZES] = {
3353         64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
3354       };
3355       if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) &&
3356           this_mode != DC_PRED &&
3357           x->source_variance < skip_intra_var_thresh[mbmi->sb_type])
3358         continue;
3359       // Only search the oblique modes if the best so far is
3360       // one of the neighboring directional modes
3361       if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
3362           (this_mode >= D45_PRED && this_mode <= TM_PRED)) {
3363         if (vp9_mode_order[best_mode_index].ref_frame[0] > INTRA_FRAME)
3364           continue;
3365       }
3366       mbmi->mode = this_mode;
3367       if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
3368         if (conditional_skipintra(mbmi->mode, best_intra_mode))
3369             continue;
3370       }
3371
3372       super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL,
3373                       bsize, tx_cache, best_rd);
3374
           // rate_y == INT_MAX signals that the luma search gave up.
3375       if (rate_y == INT_MAX)
3376         continue;
3377
           // The chroma intra mode is searched once per uv transform size and
           // then reused from the cache arrays.
3378       uv_tx = get_uv_tx_size_impl(mbmi->tx_size, bsize);
3379       if (rate_uv_intra[uv_tx] == INT_MAX) {
3380         choose_intra_uv_mode(cpi, ctx, bsize, uv_tx,
3381                              &rate_uv_intra[uv_tx], &rate_uv_tokenonly[uv_tx],
3382                              &dist_uv[uv_tx], &skip_uv[uv_tx], &mode_uv[uv_tx]);
3383       }
3384
3385       rate_uv = rate_uv_tokenonly[uv_tx];
3386       distortion_uv = dist_uv[uv_tx];
3387       skippable = skippable && skip_uv[uv_tx];
3388       mbmi->uv_mode = mode_uv[uv_tx];
3389
3390       rate2 = rate_y + x->mbmode_cost[mbmi->mode] + rate_uv_intra[uv_tx];
3391       if (this_mode != DC_PRED && this_mode != TM_PRED)
3392         rate2 += intra_cost_penalty;
3393       distortion2 = distortion_y + distortion_uv;
3394     } else {
3395       mbmi->mode = this_mode;
3396       compmode_cost = vp9_cost_bit(comp_mode_p, second_ref_frame > INTRA_FRAME);
3397       this_rd = handle_inter_mode(cpi, x, tile, bsize,
3398                                   tx_cache,
3399                                   &rate2, &distortion2, &skippable,
3400                                   &rate_y, &distortion_y,
3401                                   &rate_uv, &distortion_uv,
3402                                   &mode_excluded, &disable_skip,
3403                                   &tmp_best_filter, frame_mv,
3404                                   mi_row, mi_col,
3405                                   single_newmv, &total_sse, best_rd);
           // INT64_MAX means the inter candidate was abandoned.
3406       if (this_rd == INT64_MAX)
3407         continue;
3408     }
3409
3410     if (cm->reference_mode == REFERENCE_MODE_SELECT)
3411       rate2 += compmode_cost;
3412
3413     // Estimate the reference frame signaling cost and add it
3414     // to the rolling cost variable.
3415     if (second_ref_frame > INTRA_FRAME) {
3416       rate2 += ref_costs_comp[ref_frame];
3417     } else {
3418       rate2 += ref_costs_single[ref_frame];
3419     }
3420
3421     if (!disable_skip) {
3422       // Test for the condition where skip block will be activated
3423       // because there are no non zero coefficients and make any
3424       // necessary adjustment for rate. Ignore if skip is coded at
3425       // segment level as the cost won't have been added in.
3426       // Is Mb level skip allowed (i.e. not coded at segment level).
3427       const int mb_skip_allowed = !vp9_segfeature_active(seg, segment_id,
3428                                                          SEG_LVL_SKIP);
3429
3430       if (skippable) {
3431         // Back out the coefficient coding costs
3432         rate2 -= (rate_y + rate_uv);
3433         // for best yrd calculation
3434         rate_uv = 0;
3435
3436         if (mb_skip_allowed) {
3437           int prob_skip_cost;
3438
3439           // Cost the skip mb case
3440           vp9_prob skip_prob = vp9_get_skip_prob(cm, xd);
3441           if (skip_prob) {
3442             prob_skip_cost = vp9_cost_bit(skip_prob, 1);
3443             rate2 += prob_skip_cost;
3444           }
3445         }
3446       } else if (mb_skip_allowed && ref_frame != INTRA_FRAME && !xd->lossless) {
             // Compare coding the residual against dropping it entirely
             // (zero rate, distortion equal to total_sse).
3447         if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
3448             RDCOST(x->rdmult, x->rddiv, 0, total_sse)) {
3449           // Add in the cost of the no skip flag.
3450           rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
3451         } else {
3452           // FIXME(rbultje) make this work for splitmv also
               // Skipping is cheaper: charge the skip flag, use total_sse as the
               // distortion and remove the coefficient rates.
3453           rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
3454           distortion2 = total_sse;
3455           assert(total_sse >= 0);
3456           rate2 -= (rate_y + rate_uv);
3457           rate_y = 0;
3458           rate_uv = 0;
3459           this_skip2 = 1;
3460         }
3461       } else if (mb_skip_allowed) {
3462         // Add in the cost of the no skip flag.
3463         rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
3464       }
3465
3466       // Calculate the final RD estimate for this mode.
3467       this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
3468     }
3469
3470     // Keep record of best intra rd
3471     if (!is_inter_block(&xd->mi_8x8[0]->mbmi) &&
3472         this_rd < best_intra_rd) {
3473       best_intra_rd = this_rd;
3474       best_intra_mode = xd->mi_8x8[0]->mbmi.mode;
3475     }
3476
3477     // Keep record of best inter rd with single reference
3478     if (is_inter_block(&xd->mi_8x8[0]->mbmi) &&
3479         !has_second_ref(&xd->mi_8x8[0]->mbmi) &&
3480         !mode_excluded && this_rd < best_inter_rd) {
3481       best_inter_rd = this_rd;
3482       best_inter_ref_frame = ref_frame;
3483     }
3484
         // An intra candidate is folded into every reference-mode and
         // filter-context "best" tracker.
3485     if (!disable_skip && ref_frame == INTRA_FRAME) {
3486       for (i = 0; i < REFERENCE_MODES; ++i)
3487         best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
3488       for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
3489         best_filter_rd[i] = MIN(best_filter_rd[i], this_rd);
3490     }
3491
3492     // Store the respective mode distortions for later use.
         // (See the NOTE at the declaration of mode_distortions about the -1
         // sentinel only being present in element 0.)
3493     if (mode_distortions[this_mode] == -1
3494         || distortion2 < mode_distortions[this_mode]) {
3495       mode_distortions[this_mode] = distortion2;
3496     }
3497
3498     // Did this mode help.. i.e. is it the new best mode
         // Note that x->skip forces this branch even when this_rd >= best_rd.
3499     if (this_rd < best_rd || x->skip) {
3500       int max_plane = MAX_MB_PLANE;
3501       if (!mode_excluded) {
3502         // Note index of best mode so far
3503         best_mode_index = mode_index;
3504
3505         if (ref_frame == INTRA_FRAME) {
3506           /* required for left and above block mv */
3507           mbmi->mv[0].as_int = 0;
               // Only plane 0 has its buffers swapped for an intra best mode.
3508           max_plane = 1;
3509         }
3510
3511         *returnrate = rate2;
3512         *returndistortion = distortion2;
3513         best_rd = this_rd;
3514         best_mbmode = *mbmi;
3515         best_skip2 = this_skip2;
3516         if (!x->select_txfm_size)
3517           swap_block_ptr(x, ctx, max_plane);
3518         vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
3519                    sizeof(uint8_t) * ctx->num_4x4_blk);
3520
3521         // TODO(debargha): enhance this test with a better distortion prediction
3522         // based on qp, activity mask and history
             // Early termination: stop the search once an inter mode's scaled
             // distortion falls below qstep squared.
3523         if ((cpi->sf.mode_search_skip_flags & FLAG_EARLY_TERMINATE) &&
3524             (mode_index > MIN_EARLY_TERM_INDEX)) {
3525           const int qstep = xd->plane[0].dequant[1];
3526           // TODO(debargha): Enhance this by specializing for each mode_index
3527           int scale = 4;
3528           if (x->source_variance < UINT_MAX) {
3529             const int var_adjust = (x->source_variance < 16);
3530             scale -= var_adjust;
3531           }
3532           if (ref_frame > INTRA_FRAME &&
3533               distortion2 * scale < qstep * qstep) {
3534             early_term = 1;
3535           }
3536         }
3537       }
3538     }
3539
3540     /* keep record of best compound/single-only prediction */
3541     if (!disable_skip && ref_frame != INTRA_FRAME) {
3542       int64_t single_rd, hybrid_rd, single_rate, hybrid_rate;
3543
           // compmode_cost is already part of rate2 only under
           // REFERENCE_MODE_SELECT; derive the rate with and without it.
3544       if (cm->reference_mode == REFERENCE_MODE_SELECT) {
3545         single_rate = rate2 - compmode_cost;
3546         hybrid_rate = rate2;
3547       } else {
3548         single_rate = rate2;
3549         hybrid_rate = rate2 + compmode_cost;
3550       }
3551
3552       single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
3553       hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
3554
3555       if (second_ref_frame <= INTRA_FRAME &&
3556           single_rd < best_pred_rd[SINGLE_REFERENCE]) {
3557         best_pred_rd[SINGLE_REFERENCE] = single_rd;
3558       } else if (second_ref_frame > INTRA_FRAME &&
3559                  single_rd < best_pred_rd[COMPOUND_REFERENCE]) {
3560         best_pred_rd[COMPOUND_REFERENCE] = single_rd;
3561       }
3562       if (hybrid_rd < best_pred_rd[REFERENCE_MODE_SELECT])
3563         best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
3564     }
3565
3566     /* keep record of best filter type */
3567     if (!mode_excluded && !disable_skip && ref_frame != INTRA_FRAME &&
3568         cm->interp_filter != BILINEAR) {
           // Baseline entry of cpi->rd_filter_cache for the frame-level filter
           // setting; the other entries are expressed relative to it.
3569       int64_t ref = cpi->rd_filter_cache[cm->interp_filter == SWITCHABLE ?
3570                               SWITCHABLE_FILTERS : cm->interp_filter];
3571
3572       for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
3573         int64_t adj_rd;
3574         if (ref == INT64_MAX)
3575           adj_rd = 0;
3576         else if (cpi->rd_filter_cache[i] == INT64_MAX)
3577           // when early termination is triggered, the encoder does not have
3578           // access to the rate-distortion cost. it only knows that the cost
3579           // should be above the maximum valid value. hence it takes the known
3580           // maximum plus an arbitrary constant as the rate-distortion cost.
3581           adj_rd = cpi->mask_filter_rd - ref + 10;
3582         else
3583           adj_rd = cpi->rd_filter_cache[i] - ref;
3584
3585         adj_rd += this_rd;
3586         best_filter_rd[i] = MIN(best_filter_rd[i], adj_rd);
3587       }
3588     }
3589
3590     /* keep record of best txfm size */
         // For blocks smaller than 32x32 (and 16x16) the larger tx-mode entries
         // are filled in from the next smaller mode's entry.
3591     if (bsize < BLOCK_32X32) {
3592       if (bsize < BLOCK_16X16)
3593         tx_cache[ALLOW_16X16] = tx_cache[ALLOW_8X8];
3594
3595       tx_cache[ALLOW_32X32] = tx_cache[ALLOW_16X16];
3596     }
3597     if (!mode_excluded && this_rd != INT64_MAX) {
           // The loop stops at the first tx_cache entry that is still INT64_MAX.
3598       for (i = 0; i < TX_MODES && tx_cache[i] < INT64_MAX; i++) {
3599         int64_t adj_rd = INT64_MAX;
3600         adj_rd = this_rd + tx_cache[i] - tx_cache[cm->tx_mode];
3601
3602         if (adj_rd < best_tx_rd[i])
3603           best_tx_rd[i] = adj_rd;
3604       }
3605     }
3606
3607     if (early_term)
3608       break;
3609
         // A single-reference candidate that set x->skip ends the search.
3610     if (x->skip && !comp_pred)
3611       break;
3612   }
3613
       // Nothing improved on the caller's bound: report failure.
3614   if (best_rd >= best_rd_so_far)
3615     return INT64_MAX;
3616
3617   // If we used an estimate for the uv intra rd in the loop above...
3618   if (cpi->sf.use_uv_intra_rd_estimate) {
3619     // Do Intra UV best rd mode selection if best mode choice above was intra.
3620     if (vp9_mode_order[best_mode_index].ref_frame[0] == INTRA_FRAME) {
3621       TX_SIZE uv_tx_size;
3622       *mbmi = best_mbmode;
3623       uv_tx_size = get_uv_tx_size(mbmi);
3624       rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv_intra[uv_tx_size],
3625                               &rate_uv_tokenonly[uv_tx_size],
3626                               &dist_uv[uv_tx_size],
3627                               &skip_uv[uv_tx_size],
3628                               bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize,
3629                               uv_tx_size);
3630     }
3631   }
3632
3633   // Flag all modes that have a distortion that's > 2x the best we found at
3634   // this level.
       // Here mode_index iterates over prediction modes (0 .. MB_MODE_COUNT - 1),
       // not over vp9_mode_order[] entries; NEARESTMV, NEARMV and NEWMV are
       // never flagged.
3635   for (mode_index = 0; mode_index < MB_MODE_COUNT; ++mode_index) {
3636     if (mode_index == NEARESTMV || mode_index == NEARMV || mode_index == NEWMV)
3637       continue;
3638
3639     if (mode_distortions[mode_index] > 2 * *returndistortion) {
3640       ctx->modes_with_high_error |= (1 << mode_index);
3641     }
3642   }
3643
3644   assert((cm->interp_filter == SWITCHABLE) ||
3645          (cm->interp_filter == best_mbmode.interp_filter) ||
3646          !is_inter_block(&best_mbmode));
3647
3648   // Updating rd_thresh_freq_fact[] here means that the different
3649   // partition/block sizes are handled independently based on the best
3650   // choice for the current partition. It may well be better to keep a scaled
3651   // best rd so far value and update rd_thresh_freq_fact based on the mode/size
3652   // combination that wins out.
       // The winning mode's factor shrinks by 1/8; every other factor grows by
       // RD_THRESH_INC, capped at adaptive_rd_thresh * RD_THRESH_MAX_FACT.
3653   if (cpi->sf.adaptive_rd_thresh) {
3654     for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
3655       if (mode_index == best_mode_index) {
3656         cpi->rd_thresh_freq_fact[bsize][mode_index] -=
3657           (cpi->rd_thresh_freq_fact[bsize][mode_index] >> 3);
3658       } else {
3659         cpi->rd_thresh_freq_fact[bsize][mode_index] += RD_THRESH_INC;
3660         if (cpi->rd_thresh_freq_fact[bsize][mode_index] >
3661             (cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT)) {
3662           cpi->rd_thresh_freq_fact[bsize][mode_index] =
3663             cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT;
3664         }
3665       }
3666     }
3667   }
3668
3669   // macroblock modes
3670   *mbmi = best_mbmode;
3671   x->skip |= best_skip2;
3672
       // Convert the per-category bests into differences against best_rd.
       // A reference mode that was never costed gets INT_MIN as its difference.
3673   for (i = 0; i < REFERENCE_MODES; ++i) {
3674     if (best_pred_rd[i] == INT64_MAX)
3675       best_pred_diff[i] = INT_MIN;
3676     else
3677       best_pred_diff[i] = best_rd - best_pred_rd[i];
3678   }
3679
       // Filter and transform differences are reported as zero for skipped
       // blocks and for categories that were never costed.
3680   if (!x->skip) {
3681     for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
3682       if (best_filter_rd[i] == INT64_MAX)
3683         best_filter_diff[i] = 0;
3684       else
3685         best_filter_diff[i] = best_rd - best_filter_rd[i];
3686     }
3687     if (cm->interp_filter == SWITCHABLE)
3688       assert(best_filter_diff[SWITCHABLE_FILTERS] == 0);
3689   } else {
3690     vp9_zero(best_filter_diff);
3691   }
3692
3693   if (!x->skip) {
3694     for (i = 0; i < TX_MODES; i++) {
3695       if (best_tx_rd[i] == INT64_MAX)
3696         best_tx_diff[i] = 0;
3697       else
3698         best_tx_diff[i] = best_rd - best_tx_rd[i];
3699     }
3700   } else {
3701     vp9_zero(best_tx_diff);
3702   }
3703
       // Restore the reference pointers for the winning mode and save the
       // result in ctx. A negative second reference is mapped to index 0 when
       // selecting the second ref_mvs entry.
3704   set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
3705   store_coding_context(x, ctx, best_mode_index,
3706                        &mbmi->ref_mvs[mbmi->ref_frame[0]][0],
3707                        &mbmi->ref_mvs[mbmi->ref_frame[1] < 0 ? 0 :
3708                                       mbmi->ref_frame[1]][0],
3709                        best_pred_diff, best_tx_diff, best_filter_diff);
3710
3711   return best_rd;
3712 }
3713
3714
3715 int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
3716                                       const TileInfo *const tile,
3717                                       int mi_row, int mi_col,
3718                                       int *returnrate,
3719                                       int64_t *returndistortion,
3720                                       BLOCK_SIZE bsize,
3721                                       PICK_MODE_CONTEXT *ctx,
3722                                       int64_t best_rd_so_far) {
3723   VP9_COMMON *cm = &cpi->common;
3724   MACROBLOCKD *xd = &x->e_mbd;
3725   MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
3726   const struct segmentation *seg = &cm->seg;
3727   const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]);
3728   MV_REFERENCE_FRAME ref_frame, second_ref_frame;
3729   unsigned char segment_id = mbmi->segment_id;
3730   int comp_pred, i;
3731   int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
3732   struct buf_2d yv12_mb[4][MAX_MB_PLANE];
3733   static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
3734                                     VP9_ALT_FLAG };
3735   int64_t best_rd = best_rd_so_far;
3736   int64_t best_yrd = best_rd_so_far;  // FIXME(rbultje) more precise
3737   int64_t best_tx_rd[TX_MODES];
3738   int64_t best_tx_diff[TX_MODES];
3739   int64_t best_pred_diff[REFERENCE_MODES];
3740   int64_t best_pred_rd[REFERENCE_MODES];
3741   int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
3742   int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
3743   MB_MODE_INFO best_mbmode = { 0 };
3744   int mode_index, best_mode_index = 0;
3745   unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
3746   vp9_prob comp_mode_p;
3747   int64_t best_inter_rd = INT64_MAX;
3748   MV_REFERENCE_FRAME best_inter_ref_frame = LAST_FRAME;
3749   INTERP_FILTER tmp_best_filter = SWITCHABLE;
3750   int rate_uv_intra[TX_SIZES], rate_uv_tokenonly[TX_SIZES];
3751   int64_t dist_uv[TX_SIZES];
3752   int skip_uv[TX_SIZES];
3753   MB_PREDICTION_MODE mode_uv[TX_SIZES] = { 0 };
3754   int intra_cost_penalty = 20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q);
3755   int_mv seg_mvs[4][MAX_REF_FRAMES];
3756   b_mode_info best_bmodes[4];
3757   int best_skip2 = 0;
3758
3759   x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
3760   vpx_memset(x->zcoeff_blk[TX_4X4], 0, 4);
3761
3762   for (i = 0; i < 4; i++) {
3763     int j;
3764     for (j = 0; j < MAX_REF_FRAMES; j++)
3765       seg_mvs[i][j].as_int = INVALID_MV;
3766   }
3767
3768   estimate_ref_frame_costs(cpi, segment_id, ref_costs_single, ref_costs_comp,
3769                            &comp_mode_p);
3770
3771   for (i = 0; i < REFERENCE_MODES; ++i)
3772     best_pred_rd[i] = INT64_MAX;
3773   for (i = 0; i < TX_MODES; i++)
3774     best_tx_rd[i] = INT64_MAX;
3775   for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
3776     best_filter_rd[i] = INT64_MAX;
3777   for (i = 0; i < TX_SIZES; i++)
3778     rate_uv_intra[i] = INT_MAX;
3779
3780   *returnrate = INT_MAX;
3781
3782   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
3783     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
3784       vp9_setup_buffer_inter(cpi, x, tile,
3785                              ref_frame, block_size, mi_row, mi_col,
3786                              frame_mv[NEARESTMV], frame_mv[NEARMV],
3787                              yv12_mb);
3788     }
3789     frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
3790     frame_mv[ZEROMV][ref_frame].as_int = 0;
3791   }
3792
3793   cpi->ref_frame_mask = 0;
3794   for (ref_frame = LAST_FRAME;
3795        ref_frame <= ALTREF_FRAME && cpi->sf.reference_masking; ++ref_frame) {
3796     int i;
3797     for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
3798       if ((x->pred_mv_sad[ref_frame] >> 1) > x->pred_mv_sad[i]) {
3799         cpi->ref_frame_mask |= (1 << ref_frame);
3800         break;
3801       }
3802     }
3803   }
3804
3805   for (mode_index = 0; mode_index < MAX_REFS; ++mode_index) {
3806     int mode_excluded = 0;
3807     int64_t this_rd = INT64_MAX;
3808     int disable_skip = 0;
3809     int compmode_cost = 0;
3810     int rate2 = 0, rate_y = 0, rate_uv = 0;
3811     int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0;
3812     int skippable = 0;
3813     int64_t tx_cache[TX_MODES];
3814     int i;
3815     int this_skip2 = 0;
3816     int64_t total_sse = INT_MAX;
3817     int early_term = 0;
3818
3819     for (i = 0; i < TX_MODES; ++i)
3820       tx_cache[i] = INT64_MAX;
3821
3822     x->skip = 0;
3823     ref_frame = vp9_ref_order[mode_index].ref_frame[0];
3824     second_ref_frame = vp9_ref_order[mode_index].ref_frame[1];
3825
3826     // Look at the reference frame of the best mode so far and set the
3827     // skip mask to look at a subset of the remaining modes.
3828     if (mode_index > 2 && cpi->sf.mode_skip_start < MAX_MODES) {
3829       if (mode_index == 3) {
3830         switch (vp9_ref_order[best_mode_index].ref_frame[0]) {
3831           case INTRA_FRAME:
3832             cpi->mode_skip_mask = 0;
3833             break;
3834           case LAST_FRAME:
3835             cpi->mode_skip_mask = 0x0010;
3836             break;
3837           case GOLDEN_FRAME:
3838             cpi->mode_skip_mask = 0x0008;
3839             break;
3840           case ALTREF_FRAME:
3841             cpi->mode_skip_mask = 0x0000;
3842             break;
3843           case NONE:
3844           case MAX_REF_FRAMES:
3845             assert(0 && "Invalid Reference frame");
3846         }
3847       }
3848       if (cpi->mode_skip_mask & ((int64_t)1 << mode_index))
3849         continue;
3850     }
3851
3852     // Test best rd so far against threshold for trying this mode.
3853     if ((best_rd <
3854          ((int64_t)cpi->rd_thresh_sub8x8[segment_id][bsize][mode_index] *
3855           cpi->rd_thresh_freq_sub8x8[bsize][mode_index] >> 5)) ||
3856         cpi->rd_thresh_sub8x8[segment_id][bsize][mode_index] == INT_MAX)
3857       continue;
3858
3859     // Do not allow compound prediction if the segment level reference
3860     // frame feature is in use as in this case there can only be one reference.
3861     if ((second_ref_frame > INTRA_FRAME) &&
3862          vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
3863       continue;
3864
3865     mbmi->ref_frame[0] = ref_frame;
3866     mbmi->ref_frame[1] = second_ref_frame;
3867
3868     if (!(ref_frame == INTRA_FRAME
3869         || (cpi->ref_frame_flags & flag_list[ref_frame]))) {
3870       continue;
3871     }
3872     if (!(second_ref_frame == NONE
3873         || (cpi->ref_frame_flags & flag_list[second_ref_frame]))) {
3874       continue;
3875     }
3876
3877     comp_pred = second_ref_frame > INTRA_FRAME;
3878     if (comp_pred) {
3879       if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA)
3880         if (vp9_ref_order[best_mode_index].ref_frame[0] == INTRA_FRAME)
3881           continue;
3882       if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_REFMISMATCH)
3883         if (ref_frame != best_inter_ref_frame &&
3884             second_ref_frame != best_inter_ref_frame)
3885           continue;
3886     }
3887
3888     // TODO(jingning, jkoleszar): scaling reference frame not supported for
3889     // sub8x8 blocks.
3890     if (ref_frame > 0 && vp9_is_scaled(&cm->frame_refs[ref_frame - 1].sf))
3891       continue;
3892
3893     if (second_ref_frame > 0 &&
3894         vp9_is_scaled(&cm->frame_refs[second_ref_frame - 1].sf))
3895       continue;
3896
3897     set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
3898     mbmi->uv_mode = DC_PRED;
3899
3900     // Evaluate all sub-pel filters irrespective of whether we can use
3901     // them for this frame.
3902     mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP
3903                                                           : cm->interp_filter;
3904     xd->interp_kernel = vp9_get_interp_kernel(mbmi->interp_filter);
3905
3906     if (comp_pred) {
3907       if (!(cpi->ref_frame_flags & flag_list[second_ref_frame]))
3908         continue;
3909
3910       mode_excluded = mode_excluded ? mode_excluded
3911                                     : cm->reference_mode == SINGLE_REFERENCE;
3912     } else {
3913       if (ref_frame != INTRA_FRAME && second_ref_frame != INTRA_FRAME) {
3914         mode_excluded = mode_excluded ?
3915             mode_excluded : cm->reference_mode == COMPOUND_REFERENCE;
3916       }
3917     }
3918
3919     // Select prediction reference frames.
3920     for (i = 0; i < MAX_MB_PLANE; i++) {
3921       xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
3922       if (comp_pred)
3923         xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
3924     }
3925
3926     // If the segment reference frame feature is enabled....
3927     // then do nothing if the current ref frame is not allowed..
3928     if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
3929         vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) !=
3930             (int)ref_frame) {
3931       continue;
3932     // If the segment skip feature is enabled....
3933     // then do nothing if the current mode is not allowed..
3934     } else if (vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) &&
3935                ref_frame != INTRA_FRAME) {
3936       continue;
3937     // Disable this drop out case if the ref frame
3938     // segment level feature is enabled for this segment. This is to
3939     // prevent the possibility that we end up unable to pick any mode.
3940     } else if (!vp9_segfeature_active(seg, segment_id,
3941                                       SEG_LVL_REF_FRAME)) {
3942       // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
3943       // unless ARNR filtering is enabled in which case we want
3944       // an unfiltered alternative. We allow near/nearest as well
3945       // because they may result in zero-zero MVs but be cheaper.
3946       if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0))
3947         continue;
3948     }
3949
3950 #ifdef MODE_TEST_HIT_STATS
3951     // TEST/DEBUG CODE
3952     // Keep a record of the number of test hits at each size
3953     cpi->mode_test_hits[bsize]++;
3954 #endif
3955
3956     if (ref_frame == INTRA_FRAME) {
3957       int rate;
3958       mbmi->tx_size = TX_4X4;
3959       if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate, &rate_y,
3960                                        &distortion_y, best_rd) >= best_rd)
3961         continue;
3962       rate2 += rate;
3963       rate2 += intra_cost_penalty;
3964       distortion2 += distortion_y;
3965
3966       if (rate_uv_intra[TX_4X4] == INT_MAX) {
3967         choose_intra_uv_mode(cpi, ctx, bsize, TX_4X4,
3968                              &rate_uv_intra[TX_4X4],
3969                              &rate_uv_tokenonly[TX_4X4],
3970                              &dist_uv[TX_4X4], &skip_uv[TX_4X4],
3971                              &mode_uv[TX_4X4]);
3972       }
3973       rate2 += rate_uv_intra[TX_4X4];
3974       rate_uv = rate_uv_tokenonly[TX_4X4];
3975       distortion2 += dist_uv[TX_4X4];
3976       distortion_uv = dist_uv[TX_4X4];
3977       mbmi->uv_mode = mode_uv[TX_4X4];
3978       tx_cache[ONLY_4X4] = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
3979       for (i = 0; i < TX_MODES; ++i)
3980         tx_cache[i] = tx_cache[ONLY_4X4];
3981     } else {
3982       int rate;
3983       int64_t distortion;
3984       int64_t this_rd_thresh;
3985       int64_t tmp_rd, tmp_best_rd = INT64_MAX, tmp_best_rdu = INT64_MAX;
3986       int tmp_best_rate = INT_MAX, tmp_best_ratey = INT_MAX;
3987       int64_t tmp_best_distortion = INT_MAX, tmp_best_sse, uv_sse;
3988       int tmp_best_skippable = 0;
3989       int switchable_filter_index;
3990       int_mv *second_ref = comp_pred ?
3991                              &mbmi->ref_mvs[second_ref_frame][0] : NULL;
3992       b_mode_info tmp_best_bmodes[16];
3993       MB_MODE_INFO tmp_best_mbmode;
3994       BEST_SEG_INFO bsi[SWITCHABLE_FILTERS];
3995       int pred_exists = 0;
3996       int uv_skippable;
3997
3998       this_rd_thresh = (ref_frame == LAST_FRAME) ?
3999           cpi->rd_thresh_sub8x8[segment_id][bsize][THR_LAST] :
4000           cpi->rd_thresh_sub8x8[segment_id][bsize][THR_ALTR];
4001       this_rd_thresh = (ref_frame == GOLDEN_FRAME) ?
4002           cpi->rd_thresh_sub8x8[segment_id][bsize][THR_GOLD] : this_rd_thresh;
4003       xd->mi_8x8[0]->mbmi.tx_size = TX_4X4;
4004
4005       cpi->mask_filter_rd = 0;
4006       for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
4007         cpi->rd_filter_cache[i] = INT64_MAX;
4008
4009       if (cm->interp_filter != BILINEAR) {
4010         tmp_best_filter = EIGHTTAP;
4011         if (x->source_variance <
4012             cpi->sf.disable_filter_search_var_thresh) {
4013           tmp_best_filter = EIGHTTAP;
4014         } else if (cpi->sf.adaptive_pred_interp_filter == 1 &&
4015                    ctx->pred_interp_filter < SWITCHABLE) {
4016           tmp_best_filter = ctx->pred_interp_filter;
4017         } else if (cpi->sf.adaptive_pred_interp_filter == 2) {
4018           tmp_best_filter = ctx->pred_interp_filter < SWITCHABLE ?
4019                               ctx->pred_interp_filter : 0;
4020         } else {
4021           for (switchable_filter_index = 0;
4022                switchable_filter_index < SWITCHABLE_FILTERS;
4023                ++switchable_filter_index) {
4024             int newbest, rs;
4025             int64_t rs_rd;
4026             mbmi->interp_filter = switchable_filter_index;
4027             xd->interp_kernel = vp9_get_interp_kernel(mbmi->interp_filter);
4028             tmp_rd = rd_pick_best_mbsegmentation(cpi, x, tile,
4029                                                  &mbmi->ref_mvs[ref_frame][0],
4030                                                  second_ref,
4031                                                  best_yrd,
4032                                                  &rate, &rate_y, &distortion,
4033                                                  &skippable, &total_sse,
4034                                                  (int)this_rd_thresh, seg_mvs,
4035                                                  bsi, switchable_filter_index,
4036                                                  mi_row, mi_col);
4037
4038             if (tmp_rd == INT64_MAX)
4039               continue;
4040             rs = get_switchable_rate(x);
4041             rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
4042             cpi->rd_filter_cache[switchable_filter_index] = tmp_rd;
4043             cpi->rd_filter_cache[SWITCHABLE_FILTERS] =
4044                 MIN(cpi->rd_filter_cache[SWITCHABLE_FILTERS],
4045                     tmp_rd + rs_rd);
4046             if (cm->interp_filter == SWITCHABLE)
4047               tmp_rd += rs_rd;
4048
4049             cpi->mask_filter_rd = MAX(cpi->mask_filter_rd, tmp_rd);
4050
4051             newbest = (tmp_rd < tmp_best_rd);
4052             if (newbest) {
4053               tmp_best_filter = mbmi->interp_filter;
4054               tmp_best_rd = tmp_rd;
4055             }
4056             if ((newbest && cm->interp_filter == SWITCHABLE) ||
4057                 (mbmi->interp_filter == cm->interp_filter &&
4058                  cm->interp_filter != SWITCHABLE)) {
4059               tmp_best_rdu = tmp_rd;
4060               tmp_best_rate = rate;
4061               tmp_best_ratey = rate_y;
4062               tmp_best_distortion = distortion;
4063               tmp_best_sse = total_sse;
4064               tmp_best_skippable = skippable;
4065               tmp_best_mbmode = *mbmi;
4066               for (i = 0; i < 4; i++) {
4067                 tmp_best_bmodes[i] = xd->mi_8x8[0]->bmi[i];
4068                 x->zcoeff_blk[TX_4X4][i] = !x->plane[0].eobs[i];
4069               }
4070               pred_exists = 1;
4071               if (switchable_filter_index == 0 &&
4072                   cpi->sf.use_rd_breakout &&
4073                   best_rd < INT64_MAX) {
4074                 if (tmp_best_rdu / 2 > best_rd) {
4075                   // skip searching the other filters if the first is
4076                   // already substantially larger than the best so far
4077                   tmp_best_filter = mbmi->interp_filter;
4078                   tmp_best_rdu = INT64_MAX;
4079                   break;
4080                 }
4081               }
4082             }
4083           }  // switchable_filter_index loop
4084         }
4085       }
4086
4087       if (tmp_best_rdu == INT64_MAX && pred_exists)
4088         continue;
4089
4090       mbmi->interp_filter = (cm->interp_filter == SWITCHABLE ?
4091                              tmp_best_filter : cm->interp_filter);
4092       xd->interp_kernel = vp9_get_interp_kernel(mbmi->interp_filter);
4093       if (!pred_exists) {
4094         // Handles the special case when a filter that is not in the
4095         // switchable list (bilinear, 6-tap) is indicated at the frame level
4096         tmp_rd = rd_pick_best_mbsegmentation(cpi, x, tile,
4097                      &mbmi->ref_mvs[ref_frame][0],
4098                      second_ref,
4099                      best_yrd,
4100                      &rate, &rate_y, &distortion,
4101                      &skippable, &total_sse,
4102                      (int)this_rd_thresh, seg_mvs,
4103                      bsi, 0,
4104                      mi_row, mi_col);
4105         if (tmp_rd == INT64_MAX)
4106           continue;
4107       } else {
4108         total_sse = tmp_best_sse;
4109         rate = tmp_best_rate;
4110         rate_y = tmp_best_ratey;
4111         distortion = tmp_best_distortion;
4112         skippable = tmp_best_skippable;
4113         *mbmi = tmp_best_mbmode;
4114         for (i = 0; i < 4; i++)
4115           xd->mi_8x8[0]->bmi[i] = tmp_best_bmodes[i];
4116       }
4117
4118       rate2 += rate;
4119       distortion2 += distortion;
4120
4121       if (cm->interp_filter == SWITCHABLE)
4122         rate2 += get_switchable_rate(x);
4123
4124       if (!mode_excluded)
4125         mode_excluded = comp_pred ? cm->reference_mode == SINGLE_REFERENCE
4126                                   : cm->reference_mode == COMPOUND_REFERENCE;
4127
4128       compmode_cost = vp9_cost_bit(comp_mode_p, comp_pred);
4129
4130       tmp_best_rdu = best_rd -
4131           MIN(RDCOST(x->rdmult, x->rddiv, rate2, distortion2),
4132               RDCOST(x->rdmult, x->rddiv, 0, total_sse));
4133
4134       if (tmp_best_rdu > 0) {
4135         // If even the 'Y' rd value of split is higher than best so far
4136         // then dont bother looking at UV
4137         vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col,
4138                                         BLOCK_8X8);
4139         super_block_uvrd(cpi, x, &rate_uv, &distortion_uv, &uv_skippable,
4140                          &uv_sse, BLOCK_8X8, tmp_best_rdu);
4141         if (rate_uv == INT_MAX)
4142           continue;
4143         rate2 += rate_uv;
4144         distortion2 += distortion_uv;
4145         skippable = skippable && uv_skippable;
4146         total_sse += uv_sse;
4147
4148         tx_cache[ONLY_4X4] = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
4149         for (i = 0; i < TX_MODES; ++i)
4150           tx_cache[i] = tx_cache[ONLY_4X4];
4151       }
4152     }
4153
4154     if (cm->reference_mode == REFERENCE_MODE_SELECT)
4155       rate2 += compmode_cost;
4156
4157     // Estimate the reference frame signaling cost and add it
4158     // to the rolling cost variable.
4159     if (second_ref_frame > INTRA_FRAME) {
4160       rate2 += ref_costs_comp[ref_frame];
4161     } else {
4162       rate2 += ref_costs_single[ref_frame];
4163     }
4164
4165     if (!disable_skip) {
4166       // Test for the condition where skip block will be activated
4167       // because there are no non zero coefficients and make any
4168       // necessary adjustment for rate. Ignore if skip is coded at
4169       // segment level as the cost wont have been added in.
4170       // Is Mb level skip allowed (i.e. not coded at segment level).
4171       const int mb_skip_allowed = !vp9_segfeature_active(seg, segment_id,
4172                                                          SEG_LVL_SKIP);
4173
4174       if (mb_skip_allowed && ref_frame != INTRA_FRAME && !xd->lossless) {
4175         if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
4176             RDCOST(x->rdmult, x->rddiv, 0, total_sse)) {
4177           // Add in the cost of the no skip flag.
4178           rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
4179         } else {
4180           // FIXME(rbultje) make this work for splitmv also
4181           rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
4182           distortion2 = total_sse;
4183           assert(total_sse >= 0);
4184           rate2 -= (rate_y + rate_uv);
4185           rate_y = 0;
4186           rate_uv = 0;
4187           this_skip2 = 1;
4188         }
4189       } else if (mb_skip_allowed) {
4190         // Add in the cost of the no skip flag.
4191         rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
4192       }
4193
4194       // Calculate the final RD estimate for this mode.
4195       this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
4196     }
4197
4198     // Keep record of best inter rd with single reference
4199     if (is_inter_block(&xd->mi_8x8[0]->mbmi) &&
4200         !has_second_ref(&xd->mi_8x8[0]->mbmi) &&
4201         !mode_excluded &&
4202         this_rd < best_inter_rd) {
4203       best_inter_rd = this_rd;
4204       best_inter_ref_frame = ref_frame;
4205     }
4206
4207     if (!disable_skip && ref_frame == INTRA_FRAME) {
4208       for (i = 0; i < REFERENCE_MODES; ++i)
4209         best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
4210       for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
4211         best_filter_rd[i] = MIN(best_filter_rd[i], this_rd);
4212     }
4213
4214     // Did this mode help.. i.e. is it the new best mode
4215     if (this_rd < best_rd || x->skip) {
4216       if (!mode_excluded) {
4217         int max_plane = MAX_MB_PLANE;
4218         // Note index of best mode so far
4219         best_mode_index = mode_index;
4220
4221         if (ref_frame == INTRA_FRAME) {
4222           /* required for left and above block mv */
4223           mbmi->mv[0].as_int = 0;
4224           max_plane = 1;
4225         }
4226
4227         *returnrate = rate2;
4228         *returndistortion = distortion2;
4229         best_rd = this_rd;
4230         best_yrd = best_rd -
4231                    RDCOST(x->rdmult, x->rddiv, rate_uv, distortion_uv);
4232         best_mbmode = *mbmi;
4233         best_skip2 = this_skip2;
4234         if (!x->select_txfm_size)
4235           swap_block_ptr(x, ctx, max_plane);
4236         vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
4237                    sizeof(uint8_t) * ctx->num_4x4_blk);
4238
4239         for (i = 0; i < 4; i++)
4240           best_bmodes[i] = xd->mi_8x8[0]->bmi[i];
4241
4242         // TODO(debargha): enhance this test with a better distortion prediction
4243         // based on qp, activity mask and history
4244         if ((cpi->sf.mode_search_skip_flags & FLAG_EARLY_TERMINATE) &&
4245             (mode_index > MIN_EARLY_TERM_INDEX)) {
4246           const int qstep = xd->plane[0].dequant[1];
4247           // TODO(debargha): Enhance this by specializing for each mode_index
4248           int scale = 4;
4249           if (x->source_variance < UINT_MAX) {
4250             const int var_adjust = (x->source_variance < 16);
4251             scale -= var_adjust;
4252           }
4253           if (ref_frame > INTRA_FRAME &&
4254               distortion2 * scale < qstep * qstep) {
4255             early_term = 1;
4256           }
4257         }
4258       }
4259     }
4260
4261     /* keep record of best compound/single-only prediction */
4262     if (!disable_skip && ref_frame != INTRA_FRAME) {
4263       int64_t single_rd, hybrid_rd, single_rate, hybrid_rate;
4264
4265       if (cm->reference_mode == REFERENCE_MODE_SELECT) {
4266         single_rate = rate2 - compmode_cost;
4267         hybrid_rate = rate2;
4268       } else {
4269         single_rate = rate2;
4270         hybrid_rate = rate2 + compmode_cost;
4271       }
4272
4273       single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
4274       hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
4275
4276       if (second_ref_frame <= INTRA_FRAME &&
4277           single_rd < best_pred_rd[SINGLE_REFERENCE]) {
4278         best_pred_rd[SINGLE_REFERENCE] = single_rd;
4279       } else if (second_ref_frame > INTRA_FRAME &&
4280                  single_rd < best_pred_rd[COMPOUND_REFERENCE]) {
4281         best_pred_rd[COMPOUND_REFERENCE] = single_rd;
4282       }
4283       if (hybrid_rd < best_pred_rd[REFERENCE_MODE_SELECT])
4284         best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
4285     }
4286
4287     /* keep record of best filter type */
4288     if (!mode_excluded && !disable_skip && ref_frame != INTRA_FRAME &&
4289         cm->interp_filter != BILINEAR) {
4290       int64_t ref = cpi->rd_filter_cache[cm->interp_filter == SWITCHABLE ?
4291                               SWITCHABLE_FILTERS : cm->interp_filter];
4292       int64_t adj_rd;
4293       for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
4294         if (ref == INT64_MAX)
4295           adj_rd = 0;
4296         else if (cpi->rd_filter_cache[i] == INT64_MAX)
4297           // when early termination is triggered, the encoder does not have
4298           // access to the rate-distortion cost. it only knows that the cost
4299           // should be above the maximum valid value. hence it takes the known
4300           // maximum plus an arbitrary constant as the rate-distortion cost.
4301           adj_rd = cpi->mask_filter_rd - ref + 10;
4302         else
4303           adj_rd = cpi->rd_filter_cache[i] - ref;
4304
4305         adj_rd += this_rd;
4306         best_filter_rd[i] = MIN(best_filter_rd[i], adj_rd);
4307       }
4308     }
4309
4310     /* keep record of best txfm size */
4311     if (bsize < BLOCK_32X32) {
4312       if (bsize < BLOCK_16X16) {
4313         tx_cache[ALLOW_8X8] = tx_cache[ONLY_4X4];
4314         tx_cache[ALLOW_16X16] = tx_cache[ALLOW_8X8];
4315       }
4316       tx_cache[ALLOW_32X32] = tx_cache[ALLOW_16X16];
4317     }
4318     if (!mode_excluded && this_rd != INT64_MAX) {
4319       for (i = 0; i < TX_MODES && tx_cache[i] < INT64_MAX; i++) {
4320         int64_t adj_rd = INT64_MAX;
4321         if (ref_frame > INTRA_FRAME)
4322           adj_rd = this_rd + tx_cache[i] - tx_cache[cm->tx_mode];
4323         else
4324           adj_rd = this_rd;
4325
4326         if (adj_rd < best_tx_rd[i])
4327           best_tx_rd[i] = adj_rd;
4328       }
4329     }
4330
4331     if (early_term)
4332       break;
4333
4334     if (x->skip && !comp_pred)
4335       break;
4336   }
4337
4338   if (best_rd >= best_rd_so_far)
4339     return INT64_MAX;
4340
4341   // If we used an estimate for the uv intra rd in the loop above...
4342   if (cpi->sf.use_uv_intra_rd_estimate) {
4343     // Do Intra UV best rd mode selection if best mode choice above was intra.
4344     if (vp9_ref_order[best_mode_index].ref_frame[0] == INTRA_FRAME) {
4345       TX_SIZE uv_tx_size;
4346       *mbmi = best_mbmode;
4347       uv_tx_size = get_uv_tx_size(mbmi);
4348       rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv_intra[uv_tx_size],
4349                               &rate_uv_tokenonly[uv_tx_size],
4350                               &dist_uv[uv_tx_size],
4351                               &skip_uv[uv_tx_size],
4352                               BLOCK_8X8, uv_tx_size);
4353     }
4354   }
4355
4356   if (best_rd == INT64_MAX && bsize < BLOCK_8X8) {
4357     *returnrate = INT_MAX;
4358     *returndistortion = INT_MAX;
4359     return best_rd;
4360   }
4361
4362   assert((cm->interp_filter == SWITCHABLE) ||
4363          (cm->interp_filter == best_mbmode.interp_filter) ||
4364          !is_inter_block(&best_mbmode));
4365
4366   // Updating rd_thresh_freq_fact[] here means that the different
4367   // partition/block sizes are handled independently based on the best
4368   // choice for the current partition. It may well be better to keep a scaled
4369   // best rd so far value and update rd_thresh_freq_fact based on the mode/size
4370   // combination that wins out.
4371   if (cpi->sf.adaptive_rd_thresh) {
4372     for (mode_index = 0; mode_index < MAX_REFS; ++mode_index) {
4373       if (mode_index == best_mode_index) {
4374         cpi->rd_thresh_freq_sub8x8[bsize][mode_index] -=
4375           (cpi->rd_thresh_freq_sub8x8[bsize][mode_index] >> 3);
4376       } else {
4377         cpi->rd_thresh_freq_sub8x8[bsize][mode_index] += RD_THRESH_INC;
4378         if (cpi->rd_thresh_freq_sub8x8[bsize][mode_index] >
4379             (cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT)) {
4380           cpi->rd_thresh_freq_sub8x8[bsize][mode_index] =
4381             cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT;
4382         }
4383       }
4384     }
4385   }
4386
4387   // macroblock modes
4388   *mbmi = best_mbmode;
4389   x->skip |= best_skip2;
4390   if (!is_inter_block(&best_mbmode)) {
4391     for (i = 0; i < 4; i++)
4392       xd->mi_8x8[0]->bmi[i].as_mode = best_bmodes[i].as_mode;
4393   } else {
4394     for (i = 0; i < 4; ++i)
4395       vpx_memcpy(&xd->mi_8x8[0]->bmi[i], &best_bmodes[i], sizeof(b_mode_info));
4396
4397     mbmi->mv[0].as_int = xd->mi_8x8[0]->bmi[3].as_mv[0].as_int;
4398     mbmi->mv[1].as_int = xd->mi_8x8[0]->bmi[3].as_mv[1].as_int;
4399   }
4400
4401   for (i = 0; i < REFERENCE_MODES; ++i) {
4402     if (best_pred_rd[i] == INT64_MAX)
4403       best_pred_diff[i] = INT_MIN;
4404     else
4405       best_pred_diff[i] = best_rd - best_pred_rd[i];
4406   }
4407
4408   if (!x->skip) {
4409     for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
4410       if (best_filter_rd[i] == INT64_MAX)
4411         best_filter_diff[i] = 0;
4412       else
4413         best_filter_diff[i] = best_rd - best_filter_rd[i];
4414     }
4415     if (cm->interp_filter == SWITCHABLE)
4416       assert(best_filter_diff[SWITCHABLE_FILTERS] == 0);
4417   } else {
4418     vp9_zero(best_filter_diff);
4419   }
4420
4421   if (!x->skip) {
4422     for (i = 0; i < TX_MODES; i++) {
4423       if (best_tx_rd[i] == INT64_MAX)
4424         best_tx_diff[i] = 0;
4425       else
4426         best_tx_diff[i] = best_rd - best_tx_rd[i];
4427     }
4428   } else {
4429     vp9_zero(best_tx_diff);
4430   }
4431
4432   set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
4433   store_coding_context(x, ctx, best_mode_index,
4434                        &mbmi->ref_mvs[mbmi->ref_frame[0]][0],
4435                        &mbmi->ref_mvs[mbmi->ref_frame[1] < 0 ? 0 :
4436                                       mbmi->ref_frame[1]][0],
4437                        best_pred_diff, best_tx_diff, best_filter_diff);
4438
4439   return best_rd;
4440 }