src/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemb.c

   1 /*
   2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
   3  *
   4  *  Use of this source code is governed by a BSD-style license
   5  *  that can be found in the LICENSE file in the root of the source
   6  *  tree. An additional intellectual property rights grant can be found
   7  *  in the file PATENTS.  All contributing project authors may
   8  *  be found in the AUTHORS file in the root of the source tree.
   9  */
  10
  11
  12 #include "./vp9_rtcd.h"
  13 #include "./vpx_config.h"
  14
  15 #include "vpx_mem/vpx_mem.h"
  16
  17 #include "vp9/common/vp9_idct.h"
  18 #include "vp9/common/vp9_reconinter.h"
  19 #include "vp9/common/vp9_reconintra.h"
  20 #include "vp9/common/vp9_systemdependent.h"
  21
  22 #include "vp9/encoder/vp9_encodemb.h"
  23 #include "vp9/encoder/vp9_quantize.h"
  24 #include "vp9/encoder/vp9_rdopt.h"
  25 #include "vp9/encoder/vp9_tokenize.h"
  26
/* Scratch copies of the entropy contexts used while optimizing coefficients.
 * optimize_init_b() fills ta/tl for a plane through
 * vp9_get_entropy_contexts(), passing the plane's above_context and
 * left_context; vp9_optimize_b() then indexes ta by the block's raster x
 * position and tl by its raster y position.
 */
struct optimize_ctx {
  /* Per-plane contexts indexed by raster x (presumably the "above" side). */
  ENTROPY_CONTEXT ta[MAX_MB_PLANE][16];
  /* Per-plane contexts indexed by raster y (presumably the "left" side). */
  ENTROPY_CONTEXT tl[MAX_MB_PLANE][16];
};
  31
/* Argument bundle handed, as the void *arg, to the per-transform-block
 * callbacks in this file (encode_block, encode_block_pass1,
 * encode_block_intra). It is initialized positionally, so the field order
 * must not change.
 */
struct encode_b_args {
  /* Macroblock being encoded. */
  MACROBLOCK *x;
  /* Entropy-context scratch space; the intra entry points pass NULL. */
  struct optimize_ctx *ctx;
  /* Flag that encode_block/encode_block_intra set to 0 once a block ends up
   * with a non-zero eob. */
  unsigned char *skip;
};
  37
  38 void vp9_subtract_block_c(int rows, int cols,
  39                           int16_t *diff_ptr, ptrdiff_t diff_stride,
  40                           const uint8_t *src_ptr, ptrdiff_t src_stride,
  41                           const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
  42   int r, c;
  43
  44   for (r = 0; r < rows; r++) {
  45     for (c = 0; c < cols; c++)
  46       diff_ptr[c] = src_ptr[c] - pred_ptr[c];
  47
  48     diff_ptr += diff_stride;
  49     pred_ptr += pred_stride;
  50     src_ptr  += src_stride;
  51   }
  52 }
  53
  54 static void subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
  55   struct macroblock_plane *const p = &x->plane[plane];
  56   const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane];
  57   const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
  58   const int bw = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
  59   const int bh = 4 * num_4x4_blocks_high_lookup[plane_bsize];
  60
  61   vp9_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
  62                      pd->dst.buf, pd->dst.stride);
  63 }
  64
  65 void vp9_subtract_sby(MACROBLOCK *x, BLOCK_SIZE bsize) {
  66   subtract_plane(x, bsize, 0);
  67 }
  68
  69 void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE bsize) {
  70   int i;
  71
  72   for (i = 1; i < MAX_MB_PLANE; i++)
  73     subtract_plane(x, bsize, i);
  74 }
  75
  76 void vp9_subtract_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {
  77   vp9_subtract_sby(x, bsize);
  78   vp9_subtract_sbuv(x, bsize);
  79 }
  80
  81 #define RDTRUNC(RM, DM, R, D) ((128 + (R) * (RM)) & 0xFF)
  82 typedef struct vp9_token_state vp9_token_state;
  83
  84 struct vp9_token_state {
  85   int           rate;
  86   int           error;
  87   int           next;
  88   signed char   token;
  89   short         qc;
  90 };
  91
  92 // TODO(jimbankoski): experiment to find optimal RD numbers.
  93 #define Y1_RD_MULT 4
  94 #define UV_RD_MULT 2
  95
  96 static const int plane_rd_mult[4] = {
  97   Y1_RD_MULT,
  98   UV_RD_MULT,
  99 };
 100
 101 #define UPDATE_RD_COST()\
 102 {\
 103   rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);\
 104   rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);\
 105   if (rd_cost0 == rd_cost1) {\
 106     rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);\
 107     rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);\
 108   }\
 109 }
 110
 111 // This function is a place holder for now but may ultimately need
 112 // to scan previous tokens to work out the correct context.
 113 static int trellis_get_coeff_context(const int16_t *scan,
 114                                      const int16_t *nb,
 115                                      int idx, int token,
 116                                      uint8_t *token_cache) {
 117   int bak = token_cache[scan[idx]], pt;
 118   token_cache[scan[idx]] = vp9_pt_energy_class[token];
 119   pt = get_coef_context(nb, token_cache, idx + 1);
 120   token_cache[scan[idx]] = bak;
 121   return pt;
 122 }
 123
/* Trellis (Viterbi) optimization of the quantized coefficients of one
 * transform block.
 *
 * The non-zero coefficients are visited from scan position eob - 1 back to
 * position 0. For each one, two candidate states are stored in tokens[i]:
 *   [0] the quantized value as it is, and
 *   [1] the value moved one step toward zero when the "shortcut" test
 *       passes (otherwise the same value as state 0).
 * Rate and squared error are accumulated per state, and UPDATE_RD_COST()
 * decides which successor state each candidate links to. Afterwards the
 * cheaper path from the head of the trellis is written back into
 * qcoeff/dqcoeff, the block's eob is updated, and *a and *l are both set to
 * (final_eob > 0).
 *
 * mb:          encoder macroblock; supplies coefficients, token costs and
 *              the rd multipliers.
 * plane/block: select the coefficient buffers and the eob entry.
 * plane_bsize: not used by this function.
 * a, l:        entropy contexts; read to form the context of the first
 *              token, then overwritten.
 * tx_size:     transform size of the block.
 */
static void optimize_b(MACROBLOCK *mb,
                       int plane, int block, BLOCK_SIZE plane_bsize,
                       ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
                       TX_SIZE tx_size) {
  MACROBLOCKD *const xd = &mb->e_mbd;
  struct macroblock_plane *p = &mb->plane[plane];
  struct macroblockd_plane *pd = &xd->plane[plane];
  const int ref = is_inter_block(&xd->mi_8x8[0]->mbmi);
  /* One node pair per scan position, plus the sentinel stored at index eob
   * (eob can be as large as 1024). */
  vp9_token_state tokens[1025][2];
  /* best_index[i][s]: which state of the successor node state s links to. */
  unsigned best_index[1025][2];
  const int16_t *coeff_ptr = BLOCK_OFFSET(mb->plane[plane].coeff, block);
  int16_t *qcoeff_ptr;
  int16_t *dqcoeff_ptr;
  int eob = p->eobs[block], final_eob, sz = 0;
  const int i0 = 0;
  int rc, x, next, i;
  int64_t rdmult, rddiv, rd_cost0, rd_cost1;
  int rate0, rate1, error0, error1, t0, t1;
  int best, band, pt;
  PLANE_TYPE type = pd->plane_type;
  int err_mult = plane_rd_mult[type];
  /* 16 << (2 * tx_size); used below as the "no successor" index and as the
   * number of entries cleared in qcoeff/dqcoeff. */
  const int default_eob = 16 << (tx_size << 1);

  /* 2 for TX_32X32, 1 for every other transform size. */
  const int mul = 1 + (tx_size == TX_32X32);
  uint8_t token_cache[1024];
  const int16_t *dequant_ptr = pd->dequant;
  const uint8_t *const band_translate = get_band_translate(tx_size);
  const scan_order *so = get_scan(xd, tx_size, type, block);
  const int16_t *scan = so->scan;
  const int16_t *nb = so->neighbors;

  assert((!type && !plane) || (type && plane));
  dqcoeff_ptr = BLOCK_OFFSET(pd->dqcoeff, block);
  qcoeff_ptr = BLOCK_OFFSET(p->qcoeff, block);
  assert(eob <= default_eob);

  /* Now set up a Viterbi trellis to evaluate alternative roundings. */
  rdmult = mb->rdmult * err_mult;
  /* Blocks that are not inter-coded use 9/16 of the multiplier. */
  if (!is_inter_block(&mb->e_mbd.mi_8x8[0]->mbmi))
    rdmult = (rdmult * 9) >> 4;
  rddiv = mb->rddiv;
  /* Initialize the sentinel node of the trellis. */
  tokens[eob][0].rate = 0;
  tokens[eob][0].error = 0;
  tokens[eob][0].next = default_eob;
  tokens[eob][0].token = EOB_TOKEN;
  tokens[eob][0].qc = 0;
  *(tokens[eob] + 1) = *(tokens[eob] + 0);
  next = eob;
  /* Seed the token cache with the energy class of every coefficient that is
   * currently inside the eob. */
  for (i = 0; i < eob; i++)
    token_cache[scan[i]] = vp9_pt_energy_class[vp9_dct_value_tokens_ptr[
        qcoeff_ptr[scan[i]]].token];

  /* Walk backwards from the last coefficient (eob - 1) down to i0. */
  for (i = eob; i-- > i0;) {
    int base_bits, d2, dx;

    rc = scan[i];
    x = qcoeff_ptr[rc];
    /* Only add a trellis state for non-zero coefficients. */
    if (x) {
      int shortcut = 0;
      error0 = tokens[next][0].error;
      error1 = tokens[next][1].error;
      /* Evaluate the first possibility for this state. */
      rate0 = tokens[next][0].rate;
      rate1 = tokens[next][1].rate;
      t0 = (vp9_dct_value_tokens_ptr + x)->token;
      /* Consider both possible successor states. */
      /* next == default_eob only while the sentinel is still the successor
       * and eob == default_eob; no successor token is costed in that case. */
      if (next < default_eob) {
        band = band_translate[i + 1];
        pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache);
        rate0 +=
          mb->token_costs[tx_size][type][ref][band][0][pt]
                         [tokens[next][0].token];
        rate1 +=
          mb->token_costs[tx_size][type][ref][band][0][pt]
                         [tokens[next][1].token];
      }
      UPDATE_RD_COST();
      /* And pick the best. */
      best = rd_cost1 < rd_cost0;
      base_bits = *(vp9_dct_value_cost_ptr + x);
      /* Reconstruction error of the current value, scaled by mul. */
      dx = mul * (dqcoeff_ptr[rc] - coeff_ptr[rc]);
      d2 = dx * dx;
      tokens[i][0].rate = base_bits + (best ? rate1 : rate0);
      tokens[i][0].error = d2 + (best ? error1 : error0);
      tokens[i][0].next = next;
      tokens[i][0].token = t0;
      tokens[i][0].qc = x;
      best_index[i][0] = best;

      /* Evaluate the second possibility for this state. */
      rate0 = tokens[next][0].rate;
      rate1 = tokens[next][1].rate;

      /* dequant_ptr[rc != 0] selects entry 0 for position 0 and entry 1 for
       * every other position (presumably DC vs. AC). The shortcut applies
       * when the dequantized magnitude exceeds the scaled original magnitude
       * by less than one quantizer step. */
      if ((abs(x)*dequant_ptr[rc != 0] > abs(coeff_ptr[rc]) * mul) &&
          (abs(x)*dequant_ptr[rc != 0] < abs(coeff_ptr[rc]) * mul +
                                         dequant_ptr[rc != 0]))
        shortcut = 1;
      else
        shortcut = 0;

      if (shortcut) {
        /* sz is -1 for negative x and 0 otherwise, so this moves x one step
         * toward zero. */
        sz = -(x < 0);
        x -= 2 * sz + 1;
      }

      /* Consider both possible successor states. */
      if (!x) {
        /* If we reduced this coefficient to zero, check to see if
         *  we need to move the EOB back here.
         */
        t0 = tokens[next][0].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;
        t1 = tokens[next][1].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;
      } else {
        t0 = t1 = (vp9_dct_value_tokens_ptr + x)->token;
      }
      if (next < default_eob) {
        band = band_translate[i + 1];
        if (t0 != EOB_TOKEN) {
          pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache);
          rate0 += mb->token_costs[tx_size][type][ref][band][!x][pt]
                                  [tokens[next][0].token];
        }
        if (t1 != EOB_TOKEN) {
          pt = trellis_get_coeff_context(scan, nb, i, t1, token_cache);
          rate1 += mb->token_costs[tx_size][type][ref][band][!x][pt]
                                  [tokens[next][1].token];
        }
      }

      UPDATE_RD_COST();
      /* And pick the best. */
      best = rd_cost1 < rd_cost0;
      base_bits = *(vp9_dct_value_cost_ptr + x);

      if (shortcut) {
        /* (v + sz) ^ sz is -v when sz == -1 and v when sz == 0, so the
         * error moves by one quantizer step in the direction x moved. When
         * the shortcut is not taken, d2 keeps the value computed for the
         * first possibility. */
        dx -= (dequant_ptr[rc != 0] + sz) ^ sz;
        d2 = dx * dx;
      }
      tokens[i][1].rate = base_bits + (best ? rate1 : rate0);
      tokens[i][1].error = d2 + (best ? error1 : error0);
      tokens[i][1].next = next;
      tokens[i][1].token = best ? t1 : t0;
      tokens[i][1].qc = x;
      best_index[i][1] = best;
      /* Finally, make this the new head of the trellis. */
      next = i;
    } else {
      /* There's no choice to make for a zero coefficient, so we don't
       *  add a new trellis node, but we do need to update the costs.
       */
      band = band_translate[i + 1];
      t0 = tokens[next][0].token;
      t1 = tokens[next][1].token;
      /* Update the cost of each path if we're past the EOB token. */
      if (t0 != EOB_TOKEN) {
        tokens[next][0].rate +=
            mb->token_costs[tx_size][type][ref][band][1][0][t0];
        tokens[next][0].token = ZERO_TOKEN;
      }
      if (t1 != EOB_TOKEN) {
        tokens[next][1].rate +=
            mb->token_costs[tx_size][type][ref][band][1][0][t1];
        tokens[next][1].token = ZERO_TOKEN;
      }
      best_index[i][0] = best_index[i][1] = 0;
      /* Don't update next, because we didn't add a new node. */
    }
  }

  /* Now pick the best path through the whole trellis. */
  /* The loop leaves i == i0 - 1, so i + 1 is the first scan position. */
  band = band_translate[i + 1];
  pt = combine_entropy_contexts(*a, *l);
  rate0 = tokens[next][0].rate;
  rate1 = tokens[next][1].rate;
  error0 = tokens[next][0].error;
  error1 = tokens[next][1].error;
  t0 = tokens[next][0].token;
  t1 = tokens[next][1].token;
  rate0 += mb->token_costs[tx_size][type][ref][band][0][pt][t0];
  rate1 += mb->token_costs[tx_size][type][ref][band][0][pt][t1];
  UPDATE_RD_COST();
  best = rd_cost1 < rd_cost0;
  final_eob = i0 - 1;
  /* Clear both output buffers, then rewrite only the positions that lie on
   * the chosen path. */
  vpx_memset(qcoeff_ptr, 0, sizeof(*qcoeff_ptr) * (16 << (tx_size * 2)));
  vpx_memset(dqcoeff_ptr, 0, sizeof(*dqcoeff_ptr) * (16 << (tx_size * 2)));
  /* Follow the .next links from the head; best selects the state taken at
   * each node and is updated from best_index as the walk proceeds. */
  for (i = next; i < eob; i = next) {
    x = tokens[i][best].qc;
    if (x) {
      final_eob = i;
    }
    rc = scan[i];
    qcoeff_ptr[rc] = x;
    dqcoeff_ptr[rc] = (x * dequant_ptr[rc != 0]) / mul;

    next = tokens[i][best].next;
    best = best_index[i][best];
  }
  /* Convert "index of the last non-zero coefficient" into a count. */
  final_eob++;

  mb->plane[plane].eobs[block] = final_eob;
  *a = *l = (final_eob > 0);
}
 328
 329 void vp9_optimize_b(int plane, int block, BLOCK_SIZE plane_bsize,
 330                     TX_SIZE tx_size, MACROBLOCK *mb, struct optimize_ctx *ctx) {
 331   int x, y;
 332   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y);
 333   optimize_b(mb, plane, block, plane_bsize,
 334              &ctx->ta[plane][x], &ctx->tl[plane][y], tx_size);
 335 }
 336
 337 static void optimize_init_b(int plane, BLOCK_SIZE bsize,
 338                             struct encode_b_args *args) {
 339   const MACROBLOCKD *xd = &args->x->e_mbd;
 340   const struct macroblockd_plane* const pd = &xd->plane[plane];
 341   const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
 342   const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
 343   const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
 344   const MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
 345   const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi) : mbmi->tx_size;
 346
 347   vp9_get_entropy_contexts(tx_size, args->ctx->ta[plane], args->ctx->tl[plane],
 348                            pd->above_context, pd->left_context,
 349                            num_4x4_w, num_4x4_h);
 350 }
 351 void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
 352                      BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
 353   MACROBLOCKD *const xd = &x->e_mbd;
 354   struct macroblock_plane *const p = &x->plane[plane];
 355   struct macroblockd_plane *const pd = &xd->plane[plane];
 356   int16_t *coeff = BLOCK_OFFSET(p->coeff, block);
 357   int16_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
 358   int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
 359   const scan_order *scan_order;
 360   uint16_t *eob = &p->eobs[block];
 361   const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
 362   int i, j;
 363   int16_t *src_diff;
 364   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
 365   src_diff = &p->src_diff[4 * (j * diff_stride + i)];
 366
 367   switch (tx_size) {
 368     case TX_32X32:
 369       scan_order = &vp9_default_scan_orders[TX_32X32];
 370       if (x->use_lp32x32fdct)
 371         vp9_fdct32x32_rd(src_diff, coeff, diff_stride);
 372       else
 373         vp9_fdct32x32(src_diff, coeff, diff_stride);
 374       vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
 375                            p->quant, p->quant_shift, qcoeff, dqcoeff,
 376                            pd->dequant, p->zbin_extra, eob, scan_order->scan,
 377                            scan_order->iscan);
 378       break;
 379     case TX_16X16:
 380       scan_order = &vp9_default_scan_orders[TX_16X16];
 381       vp9_fdct16x16(src_diff, coeff, diff_stride);
 382       vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
 383                      p->quant, p->quant_shift, qcoeff, dqcoeff,
 384                      pd->dequant, p->zbin_extra, eob,
 385                      scan_order->scan, scan_order->iscan);
 386       break;
 387     case TX_8X8:
 388       scan_order = &vp9_default_scan_orders[TX_8X8];
 389       vp9_fdct8x8(src_diff, coeff, diff_stride);
 390       vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
 391                      p->quant, p->quant_shift, qcoeff, dqcoeff,
 392                      pd->dequant, p->zbin_extra, eob,
 393                      scan_order->scan, scan_order->iscan);
 394       break;
 395     case TX_4X4:
 396       scan_order = &vp9_default_scan_orders[TX_4X4];
 397       x->fwd_txm4x4(src_diff, coeff, diff_stride);
 398       vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
 399                      p->quant, p->quant_shift, qcoeff, dqcoeff,
 400                      pd->dequant, p->zbin_extra, eob,
 401                      scan_order->scan, scan_order->iscan);
 402       break;
 403     default:
 404       assert(0);
 405   }
 406 }
 407
 408 static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
 409                          TX_SIZE tx_size, void *arg) {
 410   struct encode_b_args *const args = arg;
 411   MACROBLOCK *const x = args->x;
 412   MACROBLOCKD *const xd = &x->e_mbd;
 413   struct optimize_ctx *const ctx = args->ctx;
 414   struct macroblock_plane *const p = &x->plane[plane];
 415   struct macroblockd_plane *const pd = &xd->plane[plane];
 416   int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
 417   int i, j;
 418   uint8_t *dst;
 419   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
 420   dst = &pd->dst.buf[4 * j * pd->dst.stride + 4 * i];
 421
 422   // TODO(jingning): per transformed block zero forcing only enabled for
 423   // luma component. will integrate chroma components as well.
 424   if (x->zcoeff_blk[tx_size][block] && plane == 0) {
 425     p->eobs[block] = 0;
 426     ctx->ta[plane][i] = 0;
 427     ctx->tl[plane][j] = 0;
 428     return;
 429   }
 430
 431   if (!x->skip_recode)
 432     vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
 433
 434   if (x->optimize && (!x->skip_recode || !x->skip_optimize)) {
 435     vp9_optimize_b(plane, block, plane_bsize, tx_size, x, ctx);
 436   } else {
 437     ctx->ta[plane][i] = p->eobs[block] > 0;
 438     ctx->tl[plane][j] = p->eobs[block] > 0;
 439   }
 440
 441   if (p->eobs[block])
 442     *(args->skip) = 0;
 443
 444   if (x->skip_encode || p->eobs[block] == 0)
 445     return;
 446
 447   switch (tx_size) {
 448     case TX_32X32:
 449       vp9_idct32x32_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
 450       break;
 451     case TX_16X16:
 452       vp9_idct16x16_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
 453       break;
 454     case TX_8X8:
 455       vp9_idct8x8_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
 456       break;
 457     case TX_4X4:
 458       // this is like vp9_short_idct4x4 but has a special case around eob<=1
 459       // which is significant (not just an optimization) for the lossless
 460       // case.
 461       xd->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
 462       break;
 463     default:
 464       assert(0 && "Invalid transform size");
 465   }
 466 }
 467 static void encode_block_pass1(int plane, int block, BLOCK_SIZE plane_bsize,
 468                                TX_SIZE tx_size, void *arg) {
 469   struct encode_b_args *const args = arg;
 470   MACROBLOCK *const x = args->x;
 471   MACROBLOCKD *const xd = &x->e_mbd;
 472   struct macroblock_plane *const p = &x->plane[plane];
 473   struct macroblockd_plane *const pd = &xd->plane[plane];
 474   int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
 475   int i, j;
 476   uint8_t *dst;
 477   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
 478   dst = &pd->dst.buf[4 * j * pd->dst.stride + 4 * i];
 479
 480   vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
 481
 482   if (p->eobs[block] == 0)
 483     return;
 484
 485   xd->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
 486 }
 487
 488 void vp9_encode_sby(MACROBLOCK *x, BLOCK_SIZE bsize) {
 489   MACROBLOCKD *const xd = &x->e_mbd;
 490   struct optimize_ctx ctx;
 491   MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
 492   struct encode_b_args arg = {x, &ctx, &mbmi->skip};
 493
 494   vp9_subtract_sby(x, bsize);
 495   if (x->optimize)
 496     optimize_init_b(0, bsize, &arg);
 497
 498   vp9_foreach_transformed_block_in_plane(xd, bsize, 0, encode_block_pass1,
 499                                          &arg);
 500 }
 501
 502 void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {
 503   MACROBLOCKD *const xd = &x->e_mbd;
 504   struct optimize_ctx ctx;
 505   MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
 506   struct encode_b_args arg = {x, &ctx, &mbmi->skip};
 507
 508   if (!x->skip_recode)
 509     vp9_subtract_sb(x, bsize);
 510
 511   if (x->optimize && (!x->skip_recode || !x->skip_optimize)) {
 512     int i;
 513     for (i = 0; i < MAX_MB_PLANE; ++i)
 514       optimize_init_b(i, bsize, &arg);
 515   }
 516
 517   vp9_foreach_transformed_block(xd, bsize, encode_block, &arg);
 518 }
 519
 520 static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
 521                                TX_SIZE tx_size, void *arg) {
 522   struct encode_b_args* const args = arg;
 523   MACROBLOCK *const x = args->x;
 524   MACROBLOCKD *const xd = &x->e_mbd;
 525   MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
 526   struct macroblock_plane *const p = &x->plane[plane];
 527   struct macroblockd_plane *const pd = &xd->plane[plane];
 528   int16_t *coeff = BLOCK_OFFSET(p->coeff, block);
 529   int16_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
 530   int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
 531   const scan_order *scan_order;
 532   TX_TYPE tx_type;
 533   MB_PREDICTION_MODE mode;
 534   const int bwl = b_width_log2(plane_bsize);
 535   const int diff_stride = 4 * (1 << bwl);
 536   uint8_t *src, *dst;
 537   int16_t *src_diff;
 538   uint16_t *eob = &p->eobs[block];
 539   int i, j;
 540   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
 541   dst = &pd->dst.buf[4 * (j * pd->dst.stride + i)];
 542   src = &p->src.buf[4 * (j * p->src.stride + i)];
 543   src_diff = &p->src_diff[4 * (j * diff_stride + i)];
 544
 545   // if (x->optimize)
 546   // vp9_optimize_b(plane, block, plane_bsize, tx_size, x, args->ctx);
 547
 548   switch (tx_size) {
 549     case TX_32X32:
 550       scan_order = &vp9_default_scan_orders[TX_32X32];
 551       mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
 552       vp9_predict_intra_block(xd, block >> 6, bwl, TX_32X32, mode,
 553                               x->skip_encode ? src : dst,
 554                               x->skip_encode ? p->src.stride : pd->dst.stride,
 555                               dst, pd->dst.stride, i, j, plane);
 556       if (!x->skip_recode) {
 557         vp9_subtract_block(32, 32, src_diff, diff_stride,
 558                            src, p->src.stride, dst, pd->dst.stride);
 559         if (x->use_lp32x32fdct)
 560           vp9_fdct32x32_rd(src_diff, coeff, diff_stride);
 561         else
 562           vp9_fdct32x32(src_diff, coeff, diff_stride);
 563         vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
 564                              p->quant, p->quant_shift, qcoeff, dqcoeff,
 565                              pd->dequant, p->zbin_extra, eob, scan_order->scan,
 566                              scan_order->iscan);
 567       }
 568       if (!x->skip_encode && *eob)
 569         vp9_idct32x32_add(dqcoeff, dst, pd->dst.stride, *eob);
 570       break;
 571     case TX_16X16:
 572       tx_type = get_tx_type_16x16(pd->plane_type, xd);
 573       scan_order = &vp9_scan_orders[TX_16X16][tx_type];
 574       mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
 575       vp9_predict_intra_block(xd, block >> 4, bwl, TX_16X16, mode,
 576                               x->skip_encode ? src : dst,
 577                               x->skip_encode ? p->src.stride : pd->dst.stride,
 578                               dst, pd->dst.stride, i, j, plane);
 579       if (!x->skip_recode) {
 580         vp9_subtract_block(16, 16, src_diff, diff_stride,
 581                            src, p->src.stride, dst, pd->dst.stride);
 582         vp9_fht16x16(src_diff, coeff, diff_stride, tx_type);
 583         vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
 584                        p->quant, p->quant_shift, qcoeff, dqcoeff,
 585                        pd->dequant, p->zbin_extra, eob, scan_order->scan,
 586                        scan_order->iscan);
 587       }
 588       if (!x->skip_encode && *eob)
 589         vp9_iht16x16_add(tx_type, dqcoeff, dst, pd->dst.stride, *eob);
 590       break;
 591     case TX_8X8:
 592       tx_type = get_tx_type_8x8(pd->plane_type, xd);
 593       scan_order = &vp9_scan_orders[TX_8X8][tx_type];
 594       mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
 595       vp9_predict_intra_block(xd, block >> 2, bwl, TX_8X8, mode,
 596                               x->skip_encode ? src : dst,
 597                               x->skip_encode ? p->src.stride : pd->dst.stride,
 598                               dst, pd->dst.stride, i, j, plane);
 599       if (!x->skip_recode) {
 600         vp9_subtract_block(8, 8, src_diff, diff_stride,
 601                            src, p->src.stride, dst, pd->dst.stride);
 602         vp9_fht8x8(src_diff, coeff, diff_stride, tx_type);
 603         vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant,
 604                        p->quant_shift, qcoeff, dqcoeff,
 605                        pd->dequant, p->zbin_extra, eob, scan_order->scan,
 606                        scan_order->iscan);
 607       }
 608       if (!x->skip_encode && *eob)
 609         vp9_iht8x8_add(tx_type, dqcoeff, dst, pd->dst.stride, *eob);
 610       break;
 611     case TX_4X4:
 612       tx_type = get_tx_type_4x4(pd->plane_type, xd, block);
 613       scan_order = &vp9_scan_orders[TX_4X4][tx_type];
 614       if (mbmi->sb_type < BLOCK_8X8 && plane == 0)
 615         mode = xd->mi_8x8[0]->bmi[block].as_mode;
 616       else
 617         mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
 618
 619       vp9_predict_intra_block(xd, block, bwl, TX_4X4, mode,
 620                               x->skip_encode ? src : dst,
 621                               x->skip_encode ? p->src.stride : pd->dst.stride,
 622                               dst, pd->dst.stride, i, j, plane);
 623
 624       if (!x->skip_recode) {
 625         vp9_subtract_block(4, 4, src_diff, diff_stride,
 626                            src, p->src.stride, dst, pd->dst.stride);
 627         if (tx_type != DCT_DCT)
 628           vp9_fht4x4(src_diff, coeff, diff_stride, tx_type);
 629         else
 630           x->fwd_txm4x4(src_diff, coeff, diff_stride);
 631         vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant,
 632                        p->quant_shift, qcoeff, dqcoeff,
 633                        pd->dequant, p->zbin_extra, eob, scan_order->scan,
 634                        scan_order->iscan);
 635       }
 636
 637       if (!x->skip_encode && *eob) {
 638         if (tx_type == DCT_DCT)
 639           // this is like vp9_short_idct4x4 but has a special case around eob<=1
 640           // which is significant (not just an optimization) for the lossless
 641           // case.
 642           xd->itxm_add(dqcoeff, dst, pd->dst.stride, *eob);
 643         else
 644           vp9_iht4x4_16_add(dqcoeff, dst, pd->dst.stride, tx_type);
 645       }
 646       break;
 647     default:
 648       assert(0);
 649   }
 650   if (*eob)
 651     *(args->skip) = 0;
 652 }
 653
 654 void vp9_encode_block_intra(MACROBLOCK *x, int plane, int block,
 655                             BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
 656                             unsigned char *skip) {
 657   struct encode_b_args arg = {x, NULL, skip};
 658   encode_block_intra(plane, block, plane_bsize, tx_size, &arg);
 659 }
 660
 661
 662 void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
 663   const MACROBLOCKD *const xd = &x->e_mbd;
 664   struct encode_b_args arg = {x, NULL, &xd->mi_8x8[0]->mbmi.skip};
 665
 666   vp9_foreach_transformed_block_in_plane(xd, bsize, plane, encode_block_intra,
 667                                          &arg);
 668 }
 669
 670 int vp9_encode_intra(MACROBLOCK *x, int use_16x16_pred) {
 671   MB_MODE_INFO * mbmi = &x->e_mbd.mi_8x8[0]->mbmi;
 672   x->skip_encode = 0;
 673   mbmi->mode = DC_PRED;
 674   mbmi->ref_frame[0] = INTRA_FRAME;
 675   mbmi->tx_size = use_16x16_pred ? (mbmi->sb_type >= BLOCK_16X16 ? TX_16X16
 676                                                                  : TX_8X8)
 677                                    : TX_4X4;
 678   vp9_encode_intra_block_plane(x, mbmi->sb_type, 0);
 679   return vp9_get_mb_ss(x->plane[0].src_diff);
 680 }