src/third_party/libwebp/enc/quant.c

   1 // Copyright 2011 Google Inc. All Rights Reserved.
   2 //
   3 // Use of this source code is governed by a BSD-style license
   4 // that can be found in the COPYING file in the root of the source
   5 // tree. An additional intellectual property rights grant can be found
   6 // in the file PATENTS. All contributing project authors may
   7 // be found in the AUTHORS file in the root of the source tree.
   8 // -----------------------------------------------------------------------------
   9 //
  10 //   Quantization
  11 //
  12 // Author: Skal (pascal.massimino@gmail.com)
  13
  14 #include <assert.h>
  15 #include <math.h>
  16
  17 #include "./vp8enci.h"
  18 #include "./cost.h"
  19
// Compile-time switches (1 = enabled, 0 = disabled) for trellis quantization
// of the intra4x4, intra16x16 and chroma blocks respectively.
#define DO_TRELLIS_I4  1
#define DO_TRELLIS_I16 1   // not a huge gain, but ok at low bitrate.
#define DO_TRELLIS_UV  0   // disable trellis for UV. Risky. Not worth.
// Selects which variant of kWeightTrellis[] is compiled in (0 = flat weights).
#define USE_TDISTO 1

#define MID_ALPHA 64      // neutral value for susceptibility
#define MIN_ALPHA 30      // lowest usable value for susceptibility
#define MAX_ALPHA 100     // highest meaningful value for susceptibility

#define SNS_TO_DQ 0.9     // Scaling constant between the sns value and the QP
                          // power-law modulation. Must be strictly less than 1.

#define I4_PENALTY 4000   // Rate-penalty for quick i4/i16 decision

// Multiplies 'a' by 'b' and divides the product by 256, rounding by adding
// half a unit (128) before the right-shift.
#define MULT_8B(a, b) (((a) * (b) + 128) >> 8)
  35
  36 #if defined(__cplusplus) || defined(c_plusplus)
  37 extern "C" {
  38 #endif
  39
  40 //------------------------------------------------------------------------------
  41
  42 static WEBP_INLINE int clip(int v, int m, int M) {
  43   return v < m ? m : v > M ? M : v;
  44 }
  45
  46 static const uint8_t kZigzag[16] = {
  47   0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
  48 };
  49
// DC quantizer steps, indexed by a quantizer index in [0, 127].
// SetupMatrices() uses it for the DC step of the y1 and uv matrices and,
// multiplied by 2, for the DC step of the y2 matrix.
static const uint8_t kDcTable[128] = {
  4,     5,   6,   7,   8,   9,  10,  10,
  11,   12,  13,  14,  15,  16,  17,  17,
  18,   19,  20,  20,  21,  21,  22,  22,
  23,   23,  24,  25,  25,  26,  27,  28,
  29,   30,  31,  32,  33,  34,  35,  36,
  37,   37,  38,  39,  40,  41,  42,  43,
  44,   45,  46,  46,  47,  48,  49,  50,
  51,   52,  53,  54,  55,  56,  57,  58,
  59,   60,  61,  62,  63,  64,  65,  66,
  67,   68,  69,  70,  71,  72,  73,  74,
  75,   76,  76,  77,  78,  79,  80,  81,
  82,   83,  84,  85,  86,  87,  88,  89,
  91,   93,  95,  96,  98, 100, 101, 102,
  104, 106, 108, 110, 112, 114, 116, 118,
  122, 124, 126, 128, 130, 132, 134, 136,
  138, 140, 143, 145, 148, 151, 154, 157
};
  68
// AC quantizer steps, indexed by a quantizer index in [0, 127].
// SetupMatrices() uses it for the AC step of the y1 and uv matrices.
static const uint16_t kAcTable[128] = {
  4,     5,   6,   7,   8,   9,  10,  11,
  12,   13,  14,  15,  16,  17,  18,  19,
  20,   21,  22,  23,  24,  25,  26,  27,
  28,   29,  30,  31,  32,  33,  34,  35,
  36,   37,  38,  39,  40,  41,  42,  43,
  44,   45,  46,  47,  48,  49,  50,  51,
  52,   53,  54,  55,  56,  57,  58,  60,
  62,   64,  66,  68,  70,  72,  74,  76,
  78,   80,  82,  84,  86,  88,  90,  92,
  94,   96,  98, 100, 102, 104, 106, 108,
  110, 112, 114, 116, 119, 122, 125, 128,
  131, 134, 137, 140, 143, 146, 149, 152,
  155, 158, 161, 164, 167, 170, 173, 177,
  181, 185, 189, 193, 197, 201, 205, 209,
  213, 217, 221, 225, 229, 234, 239, 245,
  249, 254, 259, 264, 269, 274, 279, 284
};
  87
// AC quantizer steps for the y2 matrix, indexed by a quantizer index in
// [0, 127] (see SetupMatrices()).
static const uint16_t kAcTable2[128] = {
  8,     8,   9,  10,  12,  13,  15,  17,
  18,   20,  21,  23,  24,  26,  27,  29,
  31,   32,  34,  35,  37,  38,  40,  41,
  43,   44,  46,  48,  49,  51,  52,  54,
  55,   57,  58,  60,  62,  63,  65,  66,
  68,   69,  71,  72,  74,  75,  77,  79,
  80,   82,  83,  85,  86,  88,  89,  93,
  96,   99, 102, 105, 108, 111, 114, 117,
  120, 124, 127, 130, 133, 136, 139, 142,
  145, 148, 151, 155, 158, 161, 164, 167,
  170, 173, 176, 179, 184, 189, 193, 198,
  203, 207, 212, 217, 221, 226, 230, 235,
  240, 244, 249, 254, 258, 263, 268, 274,
  280, 286, 292, 299, 305, 311, 317, 323,
  330, 336, 342, 348, 354, 362, 370, 379,
  385, 393, 401, 409, 416, 424, 432, 440
};
 106
 107 static const uint16_t kCoeffThresh[16] = {
 108   0,  10, 20, 30,
 109   10, 20, 30, 30,
 110   20, 30, 30, 30,
 111   30, 30, 30, 30
 112 };
 113
 114 // TODO(skal): tune more. Coeff thresholding?
 115 static const uint8_t kBiasMatrices[3][16] = {  // [3] = [luma-ac,luma-dc,chroma]
 116   { 96, 96, 96, 96,
 117     96, 96, 96, 96,
 118     96, 96, 96, 96,
 119     96, 96, 96, 96 },
 120   { 96, 96, 96, 96,
 121     96, 96, 96, 96,
 122     96, 96, 96, 96,
 123     96, 96, 96, 96 },
 124   { 96, 96, 96, 96,
 125     96, 96, 96, 96,
 126     96, 96, 96, 96,
 127     96, 96, 96, 96 }
 128 };
 129
 130 // Sharpening by (slightly) raising the hi-frequency coeffs (only for trellis).
 131 // Hack-ish but helpful for mid-bitrate range. Use with care.
 132 static const uint8_t kFreqSharpening[16] = {
 133   0,  30, 60, 90,
 134   30, 60, 90, 90,
 135   60, 90, 90, 90,
 136   90, 90, 90, 90
 137 };
 138
 139 //------------------------------------------------------------------------------
 140 // Initialize quantization parameters in VP8Matrix
 141
 142 // Returns the average quantizer
 143 static int ExpandMatrix(VP8Matrix* const m, int type) {
 144   int i;
 145   int sum = 0;
 146   for (i = 2; i < 16; ++i) {
 147     m->q_[i] = m->q_[1];
 148   }
 149   for (i = 0; i < 16; ++i) {
 150     const int j = kZigzag[i];
 151     const int bias = kBiasMatrices[type][j];
 152     m->iq_[j] = (1 << QFIX) / m->q_[j];
 153     m->bias_[j] = BIAS(bias);
 154     // TODO(skal): tune kCoeffThresh[]
 155     m->zthresh_[j] = ((256 /*+ kCoeffThresh[j]*/ - bias) * m->q_[j] + 127) >> 8;
 156     m->sharpen_[j] = (kFreqSharpening[j] * m->q_[j]) >> 11;
 157     sum += m->q_[j];
 158   }
 159   return (sum + 8) >> 4;
 160 }
 161
 162 static void SetupMatrices(VP8Encoder* enc) {
 163   int i;
 164   const int tlambda_scale =
 165     (enc->method_ >= 4) ? enc->config_->sns_strength
 166                         : 0;
 167   const int num_segments = enc->segment_hdr_.num_segments_;
 168   for (i = 0; i < num_segments; ++i) {
 169     VP8SegmentInfo* const m = &enc->dqm_[i];
 170     const int q = m->quant_;
 171     int q4, q16, quv;
 172     m->y1_.q_[0] = kDcTable[clip(q + enc->dq_y1_dc_, 0, 127)];
 173     m->y1_.q_[1] = kAcTable[clip(q,                  0, 127)];
 174
 175     m->y2_.q_[0] = kDcTable[ clip(q + enc->dq_y2_dc_, 0, 127)] * 2;
 176     m->y2_.q_[1] = kAcTable2[clip(q + enc->dq_y2_ac_, 0, 127)];
 177
 178     m->uv_.q_[0] = kDcTable[clip(q + enc->dq_uv_dc_, 0, 117)];
 179     m->uv_.q_[1] = kAcTable[clip(q + enc->dq_uv_ac_, 0, 127)];
 180
 181     q4  = ExpandMatrix(&m->y1_, 0);
 182     q16 = ExpandMatrix(&m->y2_, 1);
 183     quv = ExpandMatrix(&m->uv_, 2);
 184
 185     // TODO: Switch to kLambda*[] tables?
 186     {
 187       m->lambda_i4_  = (3 * q4 * q4) >> 7;
 188       m->lambda_i16_ = (3 * q16 * q16);
 189       m->lambda_uv_  = (3 * quv * quv) >> 6;
 190       m->lambda_mode_    = (1 * q4 * q4) >> 7;
 191       m->lambda_trellis_i4_  = (7 * q4 * q4) >> 3;
 192       m->lambda_trellis_i16_ = (q16 * q16) >> 2;
 193       m->lambda_trellis_uv_  = (quv *quv) << 1;
 194       m->tlambda_            = (tlambda_scale * q4) >> 5;
 195     }
 196   }
 197 }
 198
 199 //------------------------------------------------------------------------------
 200 // Initialize filtering parameters
 201
 202 // Very small filter-strength values have close to no visual effect. So we can
 203 // save a little decoding-CPU by turning filtering off for these.
 204 #define FSTRENGTH_CUTOFF 3
 205
 206 static void SetupFilterStrength(VP8Encoder* const enc) {
 207   int i;
 208   const int level0 = enc->config_->filter_strength;
 209   for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
 210     // Segments with lower quantizer will be less filtered. TODO: tune (wrt SNS)
 211     const int level = level0 * 256 * enc->dqm_[i].quant_ / 128;
 212     const int f = level / (256 + enc->dqm_[i].beta_);
 213     enc->dqm_[i].fstrength_ = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f;
 214   }
 215   // We record the initial strength (mainly for the case of 1-segment only).
 216   enc->filter_hdr_.level_ = enc->dqm_[0].fstrength_;
 217   enc->filter_hdr_.simple_ = (enc->config_->filter_type == 0);
 218   enc->filter_hdr_.sharpness_ = enc->config_->filter_sharpness;
 219 }
 220
 221 //------------------------------------------------------------------------------
 222
// Bounds used by VP8SetSegmentParams() to scale and clip dq_uv_ac.
// Note: if you change the values below, remember that the max range
// allowed by the syntax for DQ_UV is [-16,16].
#define MAX_DQ_UV (6)
#define MIN_DQ_UV (-4)
 227
 228 // We want to emulate jpeg-like behaviour where the expected "good" quality
 229 // is around q=75. Internally, our "good" middle is around c=50. So we
 230 // map accordingly using linear piece-wise function
 231 static double QualityToCompression(double c) {
 232   const double linear_c = (c < 0.75) ? c * (2. / 3.) : 2. * c - 1.;
 233   // The file size roughly scales as pow(quantizer, 3.). Actually, the
 234   // exponent is somewhere between 2.8 and 3.2, but we're mostly interested
 235   // in the mid-quant range. So we scale the compressibility inversely to
 236   // this power-law: quant ~= compression ^ 1/3. This law holds well for
 237   // low quant. Finer modelling for high-quant would make use of kAcTable[]
 238   // more explicitly.
 239   const double v = pow(linear_c, 1 / 3.);
 240   return v;
 241 }
 242
 243 static double QualityToJPEGCompression(double c, double alpha) {
 244   // We map the complexity 'alpha' and quality setting 'c' to a compression
 245   // exponent empirically matched to the compression curve of libjpeg6b.
 246   // On average, the WebP output size will be roughly similar to that of a
 247   // JPEG file compressed with same quality factor.
 248   const double amin = 0.30;
 249   const double amax = 0.85;
 250   const double exp_min = 0.4;
 251   const double exp_max = 0.9;
 252   const double slope = (exp_min - exp_max) / (amax - amin);
 253   // Linearly interpolate 'expn' from exp_min to exp_max
 254   // in the [amin, amax] range.
 255   const double expn = (alpha > amax) ? exp_min
 256                     : (alpha < amin) ? exp_max
 257                     : exp_max + slope * (alpha - amin);
 258   const double v = pow(c, expn);
 259   return v;
 260 }
 261
 262 static int SegmentsAreEquivalent(const VP8SegmentInfo* const S1,
 263                                  const VP8SegmentInfo* const S2) {
 264   return (S1->quant_ == S2->quant_) && (S1->fstrength_ == S2->fstrength_);
 265 }
 266
 267 static void SimplifySegments(VP8Encoder* const enc) {
 268   int map[NUM_MB_SEGMENTS] = { 0, 1, 2, 3 };
 269   const int num_segments = enc->segment_hdr_.num_segments_;
 270   int num_final_segments = 1;
 271   int s1, s2;
 272   for (s1 = 1; s1 < num_segments; ++s1) {    // find similar segments
 273     const VP8SegmentInfo* const S1 = &enc->dqm_[s1];
 274     int found = 0;
 275     // check if we already have similar segment
 276     for (s2 = 0; s2 < num_final_segments; ++s2) {
 277       const VP8SegmentInfo* const S2 = &enc->dqm_[s2];
 278       if (SegmentsAreEquivalent(S1, S2)) {
 279         found = 1;
 280         break;
 281       }
 282     }
 283     map[s1] = s2;
 284     if (!found) {
 285       if (num_final_segments != s1) {
 286         enc->dqm_[num_final_segments] = enc->dqm_[s1];
 287       }
 288       ++num_final_segments;
 289     }
 290   }
 291   if (num_final_segments < num_segments) {  // Remap
 292     int i = enc->mb_w_ * enc->mb_h_;
 293     while (i-- > 0) enc->mb_info_[i].segment_ = map[enc->mb_info_[i].segment_];
 294     enc->segment_hdr_.num_segments_ = num_final_segments;
 295     // Replicate the trailing segment infos (it's mostly cosmetics)
 296     for (i = num_final_segments; i < num_segments; ++i) {
 297       enc->dqm_[i] = enc->dqm_[num_final_segments - 1];
 298     }
 299   }
 300 }
 301
 302 void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
 303   int i;
 304   int dq_uv_ac, dq_uv_dc;
 305   const int num_segments = enc->segment_hdr_.num_segments_;
 306   const double amp = SNS_TO_DQ * enc->config_->sns_strength / 100. / 128.;
 307   const double Q = quality / 100.;
 308   const double c_base = enc->config_->emulate_jpeg_size ?
 309       QualityToJPEGCompression(Q, enc->alpha_ / 255.) :
 310       QualityToCompression(Q);
 311   for (i = 0; i < num_segments; ++i) {
 312     // We modulate the base coefficient to accommodate for the quantization
 313     // susceptibility and allow denser segments to be quantized more.
 314     const double expn = 1. - amp * enc->dqm_[i].alpha_;
 315     const double c = pow(c_base, expn);
 316     const int q = (int)(127. * (1. - c));
 317     assert(expn > 0.);
 318     enc->dqm_[i].quant_ = clip(q, 0, 127);
 319   }
 320
 321   // purely indicative in the bitstream (except for the 1-segment case)
 322   enc->base_quant_ = enc->dqm_[0].quant_;
 323
 324   // fill-in values for the unused segments (required by the syntax)
 325   for (i = num_segments; i < NUM_MB_SEGMENTS; ++i) {
 326     enc->dqm_[i].quant_ = enc->base_quant_;
 327   }
 328
 329   // uv_alpha_ is normally spread around ~60. The useful range is
 330   // typically ~30 (quite bad) to ~100 (ok to decimate UV more).
 331   // We map it to the safe maximal range of MAX/MIN_DQ_UV for dq_uv.
 332   dq_uv_ac = (enc->uv_alpha_ - MID_ALPHA) * (MAX_DQ_UV - MIN_DQ_UV)
 333                                           / (MAX_ALPHA - MIN_ALPHA);
 334   // we rescale by the user-defined strength of adaptation
 335   dq_uv_ac = dq_uv_ac * enc->config_->sns_strength / 100;
 336   // and make it safe.
 337   dq_uv_ac = clip(dq_uv_ac, MIN_DQ_UV, MAX_DQ_UV);
 338   // We also boost the dc-uv-quant a little, based on sns-strength, since
 339   // U/V channels are quite more reactive to high quants (flat DC-blocks
 340   // tend to appear, and are displeasant).
 341   dq_uv_dc = -4 * enc->config_->sns_strength / 100;
 342   dq_uv_dc = clip(dq_uv_dc, -15, 15);   // 4bit-signed max allowed
 343
 344   enc->dq_y1_dc_ = 0;       // TODO(skal): dq-lum
 345   enc->dq_y2_dc_ = 0;
 346   enc->dq_y2_ac_ = 0;
 347   enc->dq_uv_dc_ = dq_uv_dc;
 348   enc->dq_uv_ac_ = dq_uv_ac;
 349
 350   SetupFilterStrength(enc);   // initialize segments' filtering, eventually
 351
 352   if (num_segments > 1) SimplifySegments(enc);
 353
 354   SetupMatrices(enc);         // finalize quantization matrices
 355 }
 356
 357 //------------------------------------------------------------------------------
 358 // Form the predictions in cache
 359
 360 // Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index
 361 const int VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 };
 362 const int VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 };
 363
 364 // Must be indexed using {B_DC_PRED -> B_HU_PRED} as index
 365 const int VP8I4ModeOffsets[NUM_BMODES] = {
 366   I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4
 367 };
 368
 369 void VP8MakeLuma16Preds(const VP8EncIterator* const it) {
 370   const VP8Encoder* const enc = it->enc_;
 371   const uint8_t* const left = it->x_ ? enc->y_left_ : NULL;
 372   const uint8_t* const top = it->y_ ? enc->y_top_ + it->x_ * 16 : NULL;
 373   VP8EncPredLuma16(it->yuv_p_, left, top);
 374 }
 375
 376 void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
 377   const VP8Encoder* const enc = it->enc_;
 378   const uint8_t* const left = it->x_ ? enc->u_left_ : NULL;
 379   const uint8_t* const top = it->y_ ? enc->uv_top_ + it->x_ * 16 : NULL;
 380   VP8EncPredChroma8(it->yuv_p_, left, top);
 381 }
 382
 383 void VP8MakeIntra4Preds(const VP8EncIterator* const it) {
 384   VP8EncPredLuma4(it->yuv_p_, it->i4_top_);
 385 }
 386
 387 //------------------------------------------------------------------------------
 388 // Quantize
 389
 390 // Layout:
 391 // +----+
 392 // |YYYY| 0
 393 // |YYYY| 4
 394 // |YYYY| 8
 395 // |YYYY| 12
 396 // +----+
 397 // |UUVV| 16
 398 // |UUVV| 20
 399 // +----+
 400
 401 const int VP8Scan[16 + 4 + 4] = {
 402   // Luma
 403   0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
 404   0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
 405   0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
 406   0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
 407
 408   0 + 0 * BPS,   4 + 0 * BPS, 0 + 4 * BPS,  4 + 4 * BPS,    // U
 409   8 + 0 * BPS,  12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS     // V
 410 };
 411
 412 //------------------------------------------------------------------------------
 413 // Distortion measurement
 414
 415 static const uint16_t kWeightY[16] = {
 416   38, 32, 20, 9, 32, 28, 17, 7, 20, 17, 10, 4, 9, 7, 4, 2
 417 };
 418
 419 static const uint16_t kWeightTrellis[16] = {
 420 #if USE_TDISTO == 0
 421   16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
 422 #else
 423   30, 27, 19, 11,
 424   27, 24, 17, 10,
 425   19, 17, 12,  8,
 426   11, 10,  8,  6
 427 #endif
 428 };
 429
 430 // Init/Copy the common fields in score.
 431 static void InitScore(VP8ModeScore* const rd) {
 432   rd->D  = 0;
 433   rd->SD = 0;
 434   rd->R  = 0;
 435   rd->nz = 0;
 436   rd->score = MAX_COST;
 437 }
 438
 439 static void CopyScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
 440   dst->D  = src->D;
 441   dst->SD = src->SD;
 442   dst->R  = src->R;
 443   dst->nz = src->nz;      // note that nz is not accumulated, but just copied.
 444   dst->score = src->score;
 445 }
 446
 447 static void AddScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
 448   dst->D  += src->D;
 449   dst->SD += src->SD;
 450   dst->R  += src->R;
 451   dst->nz |= src->nz;     // here, new nz bits are accumulated.
 452   dst->score += src->score;
 453 }
 454
 455 //------------------------------------------------------------------------------
 456 // Performs trellis-optimized quantization.
 457
 458 // Trellis
 459
// One node of the trellis: a candidate level for one coefficient position,
// along with the best path found so far leading to it.
typedef struct {
  int prev;        // best previous (level-offset of the node at position n - 1)
  int level;       // level
  int sign;        // sign of coeff_i (1 if the input coefficient is negative)
  score_t cost;    // bit cost (accumulated along the path; MAX_COST if dead)
  score_t error;   // distortion = sum of (|coeff_i| - level_i * Q_i)^2
                   // (stored as the weighted max error minus what the levels
                   // coded along the path have removed from it)
  int ctx;         // context (only depends on 'level'. Could be spared.)
} Node;
 468
// If a coefficient was quantized to a value Q (using a neutral bias),
// we test all alternate possibilities between [Q-MIN_DELTA, Q+MAX_DELTA]
// We don't test negative values though.
#define MIN_DELTA 0   // how much lower level to try
#define MAX_DELTA 1   // how much higher
// Number of candidate levels kept per coefficient position.
#define NUM_NODES (MIN_DELTA + 1 + MAX_DELTA)
// Node for position 'n' and level-offset 'l'. Expects an array named 'nodes'
// to be in scope. Rows are shifted by one so that n == -1 (the source node
// used when the first coded coefficient is #0) is a valid position.
#define NODE(n, l) (nodes[(n) + 1][(l) + MIN_DELTA])
 476
 477 static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
 478   // TODO: incorporate the "* 256" in the tables?
 479   rd->score = rd->R * lambda + 256 * (rd->D + rd->SD);
 480 }
 481
 482 static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
 483                                           score_t distortion) {
 484   return rate * lambda + 256 * distortion;
 485 }
 486
// Quantizes one block of 16 coefficients, picking the levels by a search over
// a trellis of candidate levels that minimizes RDScoreTrellis().
//   it         : only used to reach the encoder's probabilities and level
//                costs for 'coeff_type'.
//   in         : input coefficients. On return, entries [first..15] hold the
//                dequantized values (level * q_) of the selected path, and
//                zero elsewhere.
//   out        : receives the signed levels, in scan order. Entries
//                [first..15] are zeroed before the selected path is written.
//   ctx0       : context to use for the first coded coefficient.
//   coeff_type : selects the probability/cost tables. With type 0, the first
//                coefficient is skipped (first = 1) and left untouched.
//   mtx        : quantization matrix (q_, iq_ and sharpen_ are read).
//   lambda     : weight of the rate in the RD score.
// Returns 1 if at least one output level is non-zero, 0 otherwise.
static int TrellisQuantizeBlock(const VP8EncIterator* const it,
                                int16_t in[16], int16_t out[16],
                                int ctx0, int coeff_type,
                                const VP8Matrix* const mtx,
                                int lambda) {
  ProbaArray* const last_costs = it->enc_->proba_.coeffs_[coeff_type];
  CostArray* const costs = it->enc_->proba_.level_cost_[coeff_type];
  const int first = (coeff_type == 0) ? 1 : 0;
  // One row per position, plus one leading row for the source node (see NODE).
  Node nodes[17][NUM_NODES];
  int best_path[3] = {-1, -1, -1};   // store best-last/best-level/best-previous
  score_t best_score;
  int best_node;
  int last = first - 1;
  int n, m, p, nz;

  {
    score_t cost;
    score_t max_error;
    // Squared-magnitude threshold above which a coefficient is considered
    // significant when locating the last position worth inspecting.
    const int thresh = mtx->q_[1] * mtx->q_[1] / 4;
    const int last_proba = last_costs[VP8EncBands[first]][ctx0][0];

    // compute maximal distortion.
    max_error = 0;
    for (n = first; n < 16; ++n) {
      const int j  = kZigzag[n];
      const int err = in[j] * in[j];
      max_error += kWeightTrellis[j] * err;
      if (err > thresh) last = n;
    }
    // we don't need to go inspect up to n = 16 coeffs. We can just go up
    // to last + 1 (inclusive) without losing much.
    if (last < 15) ++last;

    // compute 'skip' score. This is the max score one can do.
    cost = VP8BitCost(0, last_proba);
    best_score = RDScoreTrellis(lambda, cost, max_error);

    // initialize source node.
    n = first - 1;
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      NODE(n, m).cost = 0;
      NODE(n, m).error = max_error;
      NODE(n, m).ctx = ctx0;
    }
  }

  // traverse trellis.
  for (n = first; n <= last; ++n) {
    const int j  = kZigzag[n];
    const int Q  = mtx->q_[j];
    const int iQ = mtx->iq_[j];
    const int B = BIAS(0x00);     // neutral bias
    // note: it's important to take sign of the _original_ coeff,
    // so we don't have to consider level < 0 afterward.
    const int sign = (in[j] < 0);
    // Magnitude of the coefficient, raised by the sharpening amount.
    int coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
    int level0;
    if (coeff0 > 2047) coeff0 = 2047;

    level0 = QUANTDIV(coeff0, iQ, B);
    // test all alternate level values around level0.
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      Node* const cur = &NODE(n, m);
      int delta_error, new_error;
      score_t cur_score = MAX_COST;
      int level = level0 + m;
      int last_proba;

      cur->sign = sign;
      cur->level = level;
      cur->ctx = (level == 0) ? 0 : (level == 1) ? 1 : 2;
      if (level >= 2048 || level < 0) {   // node is dead?
        cur->cost = MAX_COST;
        continue;
      }
      last_proba = last_costs[VP8EncBands[n + 1]][cur->ctx][0];

      // Compute delta_error = how much coding this level will
      // subtract as distortion to max_error
      new_error = coeff0 - level * Q;
      delta_error =
        kWeightTrellis[j] * (coeff0 * coeff0 - new_error * new_error);

      // Inspect all possible non-dead predecessors. Retain only the best one.
      for (p = -MIN_DELTA; p <= MAX_DELTA; ++p) {
        const Node* const prev = &NODE(n - 1, p);
        const int prev_ctx = prev->ctx;
        const uint16_t* const tcost = costs[VP8EncBands[n]][prev_ctx];
        const score_t total_error = prev->error - delta_error;
        score_t cost, base_cost, score;

        if (prev->cost >= MAX_COST) {   // dead node?
          continue;
        }

        // Base cost of both terminal/non-terminal
        base_cost = prev->cost + VP8LevelCost(tcost, level);

        // Examine node assuming it's a non-terminal one.
        cost = base_cost;
        if (level && n < 15) {
          cost += VP8BitCost(1, last_proba);
        }
        score = RDScoreTrellis(lambda, cost, total_error);
        if (score < cur_score) {
          cur_score = score;
          cur->cost  = cost;
          cur->error = total_error;
          cur->prev  = p;
        }

        // Now, record best terminal node (and thus best entry in the graph).
        if (level) {
          cost = base_cost;
          if (n < 15) cost += VP8BitCost(0, last_proba);
          score = RDScoreTrellis(lambda, cost, total_error);
          if (score < best_score) {
            best_score = score;
            best_path[0] = n;   // best eob position
            best_path[1] = m;   // best level
            best_path[2] = p;   // best predecessor
          }
        }
      }
    }
  }

  // Fresh start
  memset(in + first, 0, (16 - first) * sizeof(*in));
  memset(out + first, 0, (16 - first) * sizeof(*out));
  // No terminal node beat the 'skip' score: everything stays zeroed.
  if (best_path[0] == -1) {
    return 0;   // skip!
  }

  // Unwind the best path.
  // Note: best-prev on terminal node is not necessarily equal to the
  // best_prev for non-terminal. So we patch best_path[2] in.
  n = best_path[0];
  best_node = best_path[1];
  NODE(n, best_node).prev = best_path[2];   // force best-prev for terminal
  nz = 0;

  // Walk back from the terminal node, writing the signed levels (scan order)
  // and the dequantized coefficients along the way.
  for (; n >= first; --n) {
    const Node* const node = &NODE(n, best_node);
    const int j = kZigzag[n];
    out[n] = node->sign ? -node->level : node->level;
    nz |= (node->level != 0);
    in[j] = out[n] * mtx->q_[j];
    best_node = node->prev;
  }
  return nz;
}
 639
 640 #undef NODE
 641
 642 //------------------------------------------------------------------------------
 643 // Performs: difference, transform, quantize, back-transform, add
 644 // all at once. Output is the reconstructed block in *yuv_out, and the
 645 // quantized levels in *levels.
 646
 647 static int ReconstructIntra16(VP8EncIterator* const it,
 648                               VP8ModeScore* const rd,
 649                               uint8_t* const yuv_out,
 650                               int mode) {
 651   const VP8Encoder* const enc = it->enc_;
 652   const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
 653   const uint8_t* const src = it->yuv_in_ + Y_OFF;
 654   const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
 655   int nz = 0;
 656   int n;
 657   int16_t tmp[16][16], dc_tmp[16];
 658
 659   for (n = 0; n < 16; ++n) {
 660     VP8FTransform(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
 661   }
 662   VP8FTransformWHT(tmp[0], dc_tmp);
 663   nz |= VP8EncQuantizeBlock(dc_tmp, rd->y_dc_levels, 0, &dqm->y2_) << 24;
 664
 665   if (DO_TRELLIS_I16 && it->do_trellis_) {
 666     int x, y;
 667     VP8IteratorNzToBytes(it);
 668     for (y = 0, n = 0; y < 4; ++y) {
 669       for (x = 0; x < 4; ++x, ++n) {
 670         const int ctx = it->top_nz_[x] + it->left_nz_[y];
 671         const int non_zero =
 672            TrellisQuantizeBlock(it, tmp[n], rd->y_ac_levels[n], ctx, 0,
 673                                 &dqm->y1_, dqm->lambda_trellis_i16_);
 674         it->top_nz_[x] = it->left_nz_[y] = non_zero;
 675         nz |= non_zero << n;
 676       }
 677     }
 678   } else {
 679     for (n = 0; n < 16; ++n) {
 680       nz |= VP8EncQuantizeBlock(tmp[n], rd->y_ac_levels[n], 1, &dqm->y1_) << n;
 681     }
 682   }
 683
 684   // Transform back
 685   VP8ITransformWHT(dc_tmp, tmp[0]);
 686   for (n = 0; n < 16; n += 2) {
 687     VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n], 1);
 688   }
 689
 690   return nz;
 691 }
 692
 693 static int ReconstructIntra4(VP8EncIterator* const it,
 694                              int16_t levels[16],
 695                              const uint8_t* const src,
 696                              uint8_t* const yuv_out,
 697                              int mode) {
 698   const VP8Encoder* const enc = it->enc_;
 699   const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
 700   const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
 701   int nz = 0;
 702   int16_t tmp[16];
 703
 704   VP8FTransform(src, ref, tmp);
 705   if (DO_TRELLIS_I4 && it->do_trellis_) {
 706     const int x = it->i4_ & 3, y = it->i4_ >> 2;
 707     const int ctx = it->top_nz_[x] + it->left_nz_[y];
 708     nz = TrellisQuantizeBlock(it, tmp, levels, ctx, 3, &dqm->y1_,
 709                               dqm->lambda_trellis_i4_);
 710   } else {
 711     nz = VP8EncQuantizeBlock(tmp, levels, 0, &dqm->y1_);
 712   }
 713   VP8ITransform(ref, tmp, yuv_out, 0);
 714   return nz;
 715 }
 716
 717 static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
 718                          uint8_t* const yuv_out, int mode) {
 719   const VP8Encoder* const enc = it->enc_;
 720   const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
 721   const uint8_t* const src = it->yuv_in_ + U_OFF;
 722   const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
 723   int nz = 0;
 724   int n;
 725   int16_t tmp[8][16];
 726
 727   for (n = 0; n < 8; ++n) {
 728     VP8FTransform(src + VP8Scan[16 + n], ref + VP8Scan[16 + n], tmp[n]);
 729   }
 730   if (DO_TRELLIS_UV && it->do_trellis_) {
 731     int ch, x, y;
 732     for (ch = 0, n = 0; ch <= 2; ch += 2) {
 733       for (y = 0; y < 2; ++y) {
 734         for (x = 0; x < 2; ++x, ++n) {
 735           const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
 736           const int non_zero =
 737             TrellisQuantizeBlock(it, tmp[n], rd->uv_levels[n], ctx, 2,
 738                                  &dqm->uv_, dqm->lambda_trellis_uv_);
 739           it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = non_zero;
 740           nz |= non_zero << n;
 741         }
 742       }
 743     }
 744   } else {
 745     for (n = 0; n < 8; ++n) {
 746       nz |= VP8EncQuantizeBlock(tmp[n], rd->uv_levels[n], 0, &dqm->uv_) << n;
 747     }
 748   }
 749
 750   for (n = 0; n < 8; n += 2) {
 751     VP8ITransform(ref + VP8Scan[16 + n], tmp[n], yuv_out + VP8Scan[16 + n], 1);
 752   }
 753   return (nz << 16);
 754 }
 755
 756 //------------------------------------------------------------------------------
// RD-opt decision. Reconstruct each mode, evaluate distortion and bit-cost.
// Pick the mode with the lowest RD-cost = lambda * Rate + 256 * Distortion.
 759
 760 static void SwapPtr(uint8_t** a, uint8_t** b) {
 761   uint8_t* const tmp = *a;
 762   *a = *b;
 763   *b = tmp;
 764 }
 765
 766 static void SwapOut(VP8EncIterator* const it) {
 767   SwapPtr(&it->yuv_out_, &it->yuv_out2_);
 768 }
 769
 770 static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* const rd) {
 771   const VP8Encoder* const enc = it->enc_;
 772   const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
 773   const int lambda = dqm->lambda_i16_;
 774   const int tlambda = dqm->tlambda_;
 775   const uint8_t* const src = it->yuv_in_ + Y_OFF;
 776   VP8ModeScore rd16;
 777   int mode;
 778
 779   rd->mode_i16 = -1;
 780   for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
 781     uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF;  // scratch buffer
 782     int nz;
 783
 784     // Reconstruct
 785     nz = ReconstructIntra16(it, &rd16, tmp_dst, mode);
 786
 787     // Measure RD-score
 788     rd16.D = VP8SSE16x16(src, tmp_dst);
 789     rd16.SD = tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY))
 790             : 0;
 791     rd16.R = VP8GetCostLuma16(it, &rd16);
 792     rd16.R += VP8FixedCostsI16[mode];
 793
 794     // Since we always examine Intra16 first, we can overwrite *rd directly.
 795     SetRDScore(lambda, &rd16);
 796     if (mode == 0 || rd16.score < rd->score) {
 797       CopyScore(rd, &rd16);
 798       rd->mode_i16 = mode;
 799       rd->nz = nz;
 800       memcpy(rd->y_ac_levels, rd16.y_ac_levels, sizeof(rd16.y_ac_levels));
 801       memcpy(rd->y_dc_levels, rd16.y_dc_levels, sizeof(rd16.y_dc_levels));
 802       SwapOut(it);
 803     }
 804   }
 805   SetRDScore(dqm->lambda_mode_, rd);   // finalize score for mode decision.
 806   VP8SetIntra16Mode(it, rd->mode_i16);
 807 }
 808
 809 //------------------------------------------------------------------------------
 810
 811 // return the cost array corresponding to the surrounding prediction modes.
 812 static const uint16_t* GetCostModeI4(VP8EncIterator* const it,
 813                                      const uint8_t modes[16]) {
 814   const int preds_w = it->enc_->preds_w_;
 815   const int x = (it->i4_ & 3), y = it->i4_ >> 2;
 816   const int left = (x == 0) ? it->preds_[y * preds_w - 1] : modes[it->i4_ - 1];
 817   const int top = (y == 0) ? it->preds_[-preds_w + x] : modes[it->i4_ - 4];
 818   return VP8FixedCostsI4[top][left];
 819 }
 820
// Evaluates intra4x4 coding of the luma samples, one 4x4 sub-block at a time,
// and compares the accumulated score against the one already stored in 'rd'.
// Returns 1 if intra4x4 is selected: 'rd' then receives the accumulated score,
// the per-block modes and the AC levels, the modes are recorded in the
// iterator and the output buffers are swapped.
// Returns 0 if intra4x4 is not selected, that is when
// enc->max_i4_header_bits_ is zero, or as soon as the running score reaches
// rd->score or the accumulated mode cost exceeds enc->max_i4_header_bits_.
// Note that on such an early exit rd->modes_i4 and it->top_nz_ / it->left_nz_
// may already have been updated for the sub-blocks processed so far.
static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const VP8Encoder* const enc = it->enc_;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i4_;
  const int tlambda = dqm->tlambda_;
  const uint8_t* const src0 = it->yuv_in_ + Y_OFF;
  // Selected reconstructions are gathered in the secondary output buffer.
  uint8_t* const best_blocks = it->yuv_out2_ + Y_OFF;
  int total_header_bits = 0;   // sum of the mode costs of the selected modes
  VP8ModeScore rd_best;        // running total over the sub-blocks

  if (enc->max_i4_header_bits_ == 0) {
    return 0;
  }

  InitScore(&rd_best);
  rd_best.score = 211;  // '211' is the value of VP8BitCost(0, 145)
  VP8IteratorStartI4(it);
  do {
    VP8ModeScore rd_i4;     // best score found for the current sub-block
    int mode;
    int best_mode = -1;
    const uint8_t* const src = src0 + VP8Scan[it->i4_];
    const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
    // 'best_block' and 'tmp_dst' ping-pong between the final location in
    // 'best_blocks' and a temporary area, see SwapPtr() below.
    uint8_t* best_block = best_blocks + VP8Scan[it->i4_];
    uint8_t* tmp_dst = it->yuv_p_ + I4TMP;    // scratch buffer.

    InitScore(&rd_i4);
    VP8MakeIntra4Preds(it);
    for (mode = 0; mode < NUM_BMODES; ++mode) {
      VP8ModeScore rd_tmp;
      int16_t tmp_levels[16];

      // Reconstruct
      // The returned flag is shifted to the bit position of this sub-block.
      rd_tmp.nz =
          ReconstructIntra4(it, tmp_levels, src, tmp_dst, mode) << it->i4_;

      // Compute RD-score
      rd_tmp.D = VP8SSE4x4(src, tmp_dst);
      rd_tmp.SD =
          tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
                  : 0;
      rd_tmp.R = VP8GetCostLuma4(it, tmp_levels);
      rd_tmp.R += mode_costs[mode];

      SetRDScore(lambda, &rd_tmp);
      // The first mode tried is always accepted; later ones must be strictly
      // better.
      if (best_mode < 0 || rd_tmp.score < rd_i4.score) {
        CopyScore(&rd_i4, &rd_tmp);
        best_mode = mode;
        // Keep the buffer just reconstructed as 'best_block'; the other one
        // becomes the scratch area for the next trial.
        SwapPtr(&tmp_dst, &best_block);
        memcpy(rd_best.y_ac_levels[it->i4_], tmp_levels, sizeof(tmp_levels));
      }
    }
    // Re-score the winner with the mode-decision lambda before accumulating.
    SetRDScore(dqm->lambda_mode_, &rd_i4);
    AddScore(&rd_best, &rd_i4);
    total_header_bits += mode_costs[best_mode];
    // Bail out as soon as intra4x4 cannot beat the score already in 'rd', or
    // when the mode signaling grows beyond the allowed budget.
    if (rd_best.score >= rd->score ||
        total_header_bits > enc->max_i4_header_bits_) {
      return 0;
    }
    // Copy selected samples if not in the right place already.
    // (this is the case after an odd number of swaps above)
    if (best_block != best_blocks + VP8Scan[it->i4_])
      VP8Copy4x4(best_block, best_blocks + VP8Scan[it->i4_]);
    rd->modes_i4[it->i4_] = best_mode;
    // Record whether this sub-block has non-zero coefficients, for use as
    // top/left context by the following sub-blocks.
    it->top_nz_[it->i4_ & 3] = it->left_nz_[it->i4_ >> 2] = (rd_i4.nz ? 1 : 0);
  } while (VP8IteratorRotateI4(it, best_blocks));

  // finalize state
  CopyScore(rd, &rd_best);
  VP8SetIntra4Mode(it, rd->modes_i4);
  SwapOut(it);   // the buffer holding 'best_blocks' becomes it->yuv_out_
  memcpy(rd->y_ac_levels, rd_best.y_ac_levels, sizeof(rd->y_ac_levels));
  return 1;   // select intra4x4 over intra16x16
}
 894
 895 //------------------------------------------------------------------------------
 896
 897 static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
 898   const VP8Encoder* const enc = it->enc_;
 899   const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
 900   const int lambda = dqm->lambda_uv_;
 901   const uint8_t* const src = it->yuv_in_ + U_OFF;
 902   uint8_t* const tmp_dst = it->yuv_out2_ + U_OFF;  // scratch buffer
 903   uint8_t* const dst0 = it->yuv_out_ + U_OFF;
 904   VP8ModeScore rd_best;
 905   int mode;
 906
 907   rd->mode_uv = -1;
 908   InitScore(&rd_best);
 909   for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
 910     VP8ModeScore rd_uv;
 911
 912     // Reconstruct
 913     rd_uv.nz = ReconstructUV(it, &rd_uv, tmp_dst, mode);
 914
 915     // Compute RD-score
 916     rd_uv.D  = VP8SSE16x8(src, tmp_dst);
 917     rd_uv.SD = 0;    // TODO: should we call TDisto? it tends to flatten areas.
 918     rd_uv.R  = VP8GetCostUV(it, &rd_uv);
 919     rd_uv.R += VP8FixedCostsUV[mode];
 920
 921     SetRDScore(lambda, &rd_uv);
 922     if (mode == 0 || rd_uv.score < rd_best.score) {
 923       CopyScore(&rd_best, &rd_uv);
 924       rd->mode_uv = mode;
 925       memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels));
 926       memcpy(dst0, tmp_dst, UV_SIZE);   //  TODO: SwapUVOut() ?
 927     }
 928   }
 929   VP8SetIntraUVMode(it, rd->mode_uv);
 930   AddScore(rd, &rd_best);
 931 }
 932
 933 //------------------------------------------------------------------------------
 934 // Final reconstruction and quantization.
 935
 936 static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
 937   const VP8Encoder* const enc = it->enc_;
 938   const int is_i16 = (it->mb_->type_ == 1);
 939   int nz = 0;
 940
 941   if (is_i16) {
 942     nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF, it->preds_[0]);
 943   } else {
 944     VP8IteratorStartI4(it);
 945     do {
 946       const int mode =
 947           it->preds_[(it->i4_ & 3) + (it->i4_ >> 2) * enc->preds_w_];
 948       const uint8_t* const src = it->yuv_in_ + Y_OFF + VP8Scan[it->i4_];
 949       uint8_t* const dst = it->yuv_out_ + Y_OFF + VP8Scan[it->i4_];
 950       VP8MakeIntra4Preds(it);
 951       nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
 952                               src, dst, mode) << it->i4_;
 953     } while (VP8IteratorRotateI4(it, it->yuv_out_ + Y_OFF));
 954   }
 955
 956   nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF, it->mb_->uv_mode_);
 957   rd->nz = nz;
 958 }
 959
 960 // Refine intra16/intra4 sub-modes based on distortion only (not rate).
 961 static void DistoRefine(VP8EncIterator* const it, int try_both_i4_i16) {
 962   const int is_i16 = (it->mb_->type_ == 1);
 963   score_t best_score = MAX_COST;
 964
 965   if (try_both_i4_i16 || is_i16) {
 966     int mode;
 967     int best_mode = -1;
 968     for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
 969       const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
 970       const uint8_t* const src = it->yuv_in_ + Y_OFF;
 971       const score_t score = VP8SSE16x16(src, ref);
 972       if (score < best_score) {
 973         best_mode = mode;
 974         best_score = score;
 975       }
 976     }
 977     VP8SetIntra16Mode(it, best_mode);
 978   }
 979   if (try_both_i4_i16 || !is_i16) {
 980     uint8_t modes_i4[16];
 981     // We don't evaluate the rate here, but just account for it through a
 982     // constant penalty (i4 mode usually needs more bits compared to i16).
 983     score_t score_i4 = (score_t)I4_PENALTY;
 984
 985     VP8IteratorStartI4(it);
 986     do {
 987       int mode;
 988       int best_sub_mode = -1;
 989       score_t best_sub_score = MAX_COST;
 990       const uint8_t* const src = it->yuv_in_ + Y_OFF + VP8Scan[it->i4_];
 991
 992       // TODO(skal): we don't really need the prediction pixels here,
 993       // but just the distortion against 'src'.
 994       VP8MakeIntra4Preds(it);
 995       for (mode = 0; mode < NUM_BMODES; ++mode) {
 996         const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
 997         const score_t score = VP8SSE4x4(src, ref);
 998         if (score < best_sub_score) {
 999           best_sub_mode = mode;
1000           best_sub_score = score;
1001         }
1002       }
1003       modes_i4[it->i4_] = best_sub_mode;
1004       score_i4 += best_sub_score;
1005       if (score_i4 >= best_score) break;
1006     } while (VP8IteratorRotateI4(it, it->yuv_in_ + Y_OFF));
1007     if (score_i4 < best_score) {
1008       VP8SetIntra4Mode(it, modes_i4);
1009     }
1010   }
1011 }
1012
1013 //------------------------------------------------------------------------------
1014 // Entry point
1015
1016 int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
1017                 VP8RDLevel rd_opt) {
1018   int is_skipped;
1019   const int method = it->enc_->method_;
1020
1021   InitScore(rd);
1022
1023   // We can perform predictions for Luma16x16 and Chroma8x8 already.
1024   // Luma4x4 predictions needs to be done as-we-go.
1025   VP8MakeLuma16Preds(it);
1026   VP8MakeChroma8Preds(it);
1027
1028   if (rd_opt > RD_OPT_NONE) {
1029     it->do_trellis_ = (rd_opt >= RD_OPT_TRELLIS_ALL);
1030     PickBestIntra16(it, rd);
1031     if (method >= 2) {
1032       PickBestIntra4(it, rd);
1033     }
1034     PickBestUV(it, rd);
1035     if (rd_opt == RD_OPT_TRELLIS) {   // finish off with trellis-optim now
1036       it->do_trellis_ = 1;
1037       SimpleQuantize(it, rd);
1038     }
1039   } else {
1040     // For method == 2, pick the best intra4/intra16 based on SSE (~tad slower).
1041     // For method <= 1, we refine intra4 or intra16 (but don't re-examine mode).
1042     DistoRefine(it, (method >= 2));
1043     SimpleQuantize(it, rd);
1044   }
1045   is_skipped = (rd->nz == 0);
1046   VP8SetSkip(it, is_skipped);
1047   return is_skipped;
1048 }
1049
1050 #if defined(__cplusplus) || defined(c_plusplus)
1051 }    // extern "C"
1052 #endif