1 // Copyright 2011 Google Inc. All Rights Reserved.
3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the COPYING file in the root of the source
5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree.
8 // -----------------------------------------------------------------------------
12 // Author: Skal (pascal.massimino@gmail.com)
17 #include "./vp8enci.h"
20 #define DO_TRELLIS_I4 1
21 #define DO_TRELLIS_I16 1 // not a huge gain, but ok at low bitrate.
22 #define DO_TRELLIS_UV 0 // disable trellis for UV. Risky. Not worth.
25 #define MID_ALPHA 64 // neutral value for susceptibility
26 #define MIN_ALPHA 30 // lowest usable value for susceptibility
27 #define MAX_ALPHA 100 // higher meaninful value for susceptibility
29 #define SNS_TO_DQ 0.9 // Scaling constant between the sns value and the QP
30 // power-law modulation. Must be strictly less than 1.
32 #define I4_PENALTY 4000 // Rate-penalty for quick i4/i16 decision
34 #define MULT_8B(a, b) (((a) * (b) + 128) >> 8)
36 #if defined(__cplusplus) || defined(c_plusplus)
40 //------------------------------------------------------------------------------
42 static WEBP_INLINE int clip(int v, int m, int M) {
43 return v < m ? m : v > M ? M : v;
46 static const uint8_t kZigzag[16] = {
47 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
50 static const uint8_t kDcTable[128] = {
51 4, 5, 6, 7, 8, 9, 10, 10,
52 11, 12, 13, 14, 15, 16, 17, 17,
53 18, 19, 20, 20, 21, 21, 22, 22,
54 23, 23, 24, 25, 25, 26, 27, 28,
55 29, 30, 31, 32, 33, 34, 35, 36,
56 37, 37, 38, 39, 40, 41, 42, 43,
57 44, 45, 46, 46, 47, 48, 49, 50,
58 51, 52, 53, 54, 55, 56, 57, 58,
59 59, 60, 61, 62, 63, 64, 65, 66,
60 67, 68, 69, 70, 71, 72, 73, 74,
61 75, 76, 76, 77, 78, 79, 80, 81,
62 82, 83, 84, 85, 86, 87, 88, 89,
63 91, 93, 95, 96, 98, 100, 101, 102,
64 104, 106, 108, 110, 112, 114, 116, 118,
65 122, 124, 126, 128, 130, 132, 134, 136,
66 138, 140, 143, 145, 148, 151, 154, 157
69 static const uint16_t kAcTable[128] = {
70 4, 5, 6, 7, 8, 9, 10, 11,
71 12, 13, 14, 15, 16, 17, 18, 19,
72 20, 21, 22, 23, 24, 25, 26, 27,
73 28, 29, 30, 31, 32, 33, 34, 35,
74 36, 37, 38, 39, 40, 41, 42, 43,
75 44, 45, 46, 47, 48, 49, 50, 51,
76 52, 53, 54, 55, 56, 57, 58, 60,
77 62, 64, 66, 68, 70, 72, 74, 76,
78 78, 80, 82, 84, 86, 88, 90, 92,
79 94, 96, 98, 100, 102, 104, 106, 108,
80 110, 112, 114, 116, 119, 122, 125, 128,
81 131, 134, 137, 140, 143, 146, 149, 152,
82 155, 158, 161, 164, 167, 170, 173, 177,
83 181, 185, 189, 193, 197, 201, 205, 209,
84 213, 217, 221, 225, 229, 234, 239, 245,
85 249, 254, 259, 264, 269, 274, 279, 284
88 static const uint16_t kAcTable2[128] = {
89 8, 8, 9, 10, 12, 13, 15, 17,
90 18, 20, 21, 23, 24, 26, 27, 29,
91 31, 32, 34, 35, 37, 38, 40, 41,
92 43, 44, 46, 48, 49, 51, 52, 54,
93 55, 57, 58, 60, 62, 63, 65, 66,
94 68, 69, 71, 72, 74, 75, 77, 79,
95 80, 82, 83, 85, 86, 88, 89, 93,
96 96, 99, 102, 105, 108, 111, 114, 117,
97 120, 124, 127, 130, 133, 136, 139, 142,
98 145, 148, 151, 155, 158, 161, 164, 167,
99 170, 173, 176, 179, 184, 189, 193, 198,
100 203, 207, 212, 217, 221, 226, 230, 235,
101 240, 244, 249, 254, 258, 263, 268, 274,
102 280, 286, 292, 299, 305, 311, 317, 323,
103 330, 336, 342, 348, 354, 362, 370, 379,
104 385, 393, 401, 409, 416, 424, 432, 440
107 static const uint16_t kCoeffThresh[16] = {
// NOTE(review): the initializer values of kCoeffThresh[] are not visible in
// this excerpt — restore them from upstream before compiling. The table is
// only referenced from a commented-out expression in ExpandMatrix() below.
114 // TODO(skal): tune more. Coeff thresholding?
115 static const uint8_t kBiasMatrices[3][16] = { // [3] = [luma-ac,luma-dc,chroma]
// NOTE(review): kBiasMatrices[] initializer values are missing from this view;
// they are read by ExpandMatrix(), indexed by zigzag position.
130 // Sharpening by (slightly) raising the hi-frequency coeffs (only for trellis).
131 // Hack-ish but helpful for mid-bitrate range. Use with care.
132 static const uint8_t kFreqSharpening[16] = {
// NOTE(review): kFreqSharpening[] values are missing from this view; they are
// used by ExpandMatrix() to derive the per-coefficient sharpen_[] values.
139 //------------------------------------------------------------------------------
140 // Initialize quantization parameters in VP8Matrix
142 // Returns the average quantizer
143 static int ExpandMatrix(VP8Matrix* const m, int type) {
146 for (i = 2; i < 16; ++i) {
149 for (i = 0; i < 16; ++i) {
150 const int j = kZigzag[i];
151 const int bias = kBiasMatrices[type][j];
152 m->iq_[j] = (1 << QFIX) / m->q_[j];
153 m->bias_[j] = BIAS(bias);
154 // TODO(skal): tune kCoeffThresh[]
155 m->zthresh_[j] = ((256 /*+ kCoeffThresh[j]*/ - bias) * m->q_[j] + 127) >> 8;
156 m->sharpen_[j] = (kFreqSharpening[j] * m->q_[j]) >> 11;
159 return (sum + 8) >> 4;
162 static void SetupMatrices(VP8Encoder* enc) {
164 const int tlambda_scale =
165 (enc->method_ >= 4) ? enc->config_->sns_strength
167 const int num_segments = enc->segment_hdr_.num_segments_;
168 for (i = 0; i < num_segments; ++i) {
169 VP8SegmentInfo* const m = &enc->dqm_[i];
170 const int q = m->quant_;
172 m->y1_.q_[0] = kDcTable[clip(q + enc->dq_y1_dc_, 0, 127)];
173 m->y1_.q_[1] = kAcTable[clip(q, 0, 127)];
175 m->y2_.q_[0] = kDcTable[ clip(q + enc->dq_y2_dc_, 0, 127)] * 2;
176 m->y2_.q_[1] = kAcTable2[clip(q + enc->dq_y2_ac_, 0, 127)];
178 m->uv_.q_[0] = kDcTable[clip(q + enc->dq_uv_dc_, 0, 117)];
179 m->uv_.q_[1] = kAcTable[clip(q + enc->dq_uv_ac_, 0, 127)];
181 q4 = ExpandMatrix(&m->y1_, 0);
182 q16 = ExpandMatrix(&m->y2_, 1);
183 quv = ExpandMatrix(&m->uv_, 2);
185 // TODO: Switch to kLambda*[] tables?
187 m->lambda_i4_ = (3 * q4 * q4) >> 7;
188 m->lambda_i16_ = (3 * q16 * q16);
189 m->lambda_uv_ = (3 * quv * quv) >> 6;
190 m->lambda_mode_ = (1 * q4 * q4) >> 7;
191 m->lambda_trellis_i4_ = (7 * q4 * q4) >> 3;
192 m->lambda_trellis_i16_ = (q16 * q16) >> 2;
193 m->lambda_trellis_uv_ = (quv *quv) << 1;
194 m->tlambda_ = (tlambda_scale * q4) >> 5;
199 //------------------------------------------------------------------------------
200 // Initialize filtering parameters
202 // Very small filter-strength values have close to no visual effect. So we can
203 // save a little decoding-CPU by turning filtering off for these.
204 #define FSTRENGTH_CUTOFF 3
206 static void SetupFilterStrength(VP8Encoder* const enc) {
208 const int level0 = enc->config_->filter_strength;
209 for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
210 // Segments with lower quantizer will be less filtered. TODO: tune (wrt SNS)
211 const int level = level0 * 256 * enc->dqm_[i].quant_ / 128;
212 const int f = level / (256 + enc->dqm_[i].beta_);
213 enc->dqm_[i].fstrength_ = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f;
215 // We record the initial strength (mainly for the case of 1-segment only).
216 enc->filter_hdr_.level_ = enc->dqm_[0].fstrength_;
217 enc->filter_hdr_.simple_ = (enc->config_->filter_type == 0);
218 enc->filter_hdr_.sharpness_ = enc->config_->filter_sharpness;
221 //------------------------------------------------------------------------------
223 // Note: if you change the values below, remember that the max range
224 // allowed by the syntax for DQ_UV is [-16,16].
225 #define MAX_DQ_UV (6)
226 #define MIN_DQ_UV (-4)
228 // We want to emulate jpeg-like behaviour where the expected "good" quality
229 // is around q=75. Internally, our "good" middle is around c=50. So we
230 // map accordingly using linear piece-wise function
231 static double QualityToCompression(double c) {
232 const double linear_c = (c < 0.75) ? c * (2. / 3.) : 2. * c - 1.;
233 // The file size roughly scales as pow(quantizer, 3.). Actually, the
234 // exponent is somewhere between 2.8 and 3.2, but we're mostly interested
235 // in the mid-quant range. So we scale the compressibility inversely to
236 // this power-law: quant ~= compression ^ 1/3. This law holds well for
237 // low quant. Finer modelling for high-quant would make use of kAcTable[]
239 const double v = pow(linear_c, 1 / 3.);
243 static double QualityToJPEGCompression(double c, double alpha) {
244 // We map the complexity 'alpha' and quality setting 'c' to a compression
245 // exponent empirically matched to the compression curve of libjpeg6b.
246 // On average, the WebP output size will be roughly similar to that of a
247 // JPEG file compressed with same quality factor.
248 const double amin = 0.30;
249 const double amax = 0.85;
250 const double exp_min = 0.4;
251 const double exp_max = 0.9;
252 const double slope = (exp_min - exp_max) / (amax - amin);
253 // Linearly interpolate 'expn' from exp_min to exp_max
254 // in the [amin, amax] range.
255 const double expn = (alpha > amax) ? exp_min
256 : (alpha < amin) ? exp_max
257 : exp_max + slope * (alpha - amin);
258 const double v = pow(c, expn);
262 static int SegmentsAreEquivalent(const VP8SegmentInfo* const S1,
263 const VP8SegmentInfo* const S2) {
264 return (S1->quant_ == S2->quant_) && (S1->fstrength_ == S2->fstrength_);
267 static void SimplifySegments(VP8Encoder* const enc) {
268 int map[NUM_MB_SEGMENTS] = { 0, 1, 2, 3 };
269 const int num_segments = enc->segment_hdr_.num_segments_;
270 int num_final_segments = 1;
272 for (s1 = 1; s1 < num_segments; ++s1) { // find similar segments
273 const VP8SegmentInfo* const S1 = &enc->dqm_[s1];
275 // check if we already have similar segment
276 for (s2 = 0; s2 < num_final_segments; ++s2) {
277 const VP8SegmentInfo* const S2 = &enc->dqm_[s2];
278 if (SegmentsAreEquivalent(S1, S2)) {
285 if (num_final_segments != s1) {
286 enc->dqm_[num_final_segments] = enc->dqm_[s1];
288 ++num_final_segments;
291 if (num_final_segments < num_segments) { // Remap
292 int i = enc->mb_w_ * enc->mb_h_;
293 while (i-- > 0) enc->mb_info_[i].segment_ = map[enc->mb_info_[i].segment_];
294 enc->segment_hdr_.num_segments_ = num_final_segments;
295 // Replicate the trailing segment infos (it's mostly cosmetics)
296 for (i = num_final_segments; i < num_segments; ++i) {
297 enc->dqm_[i] = enc->dqm_[num_final_segments - 1];
302 void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
304 int dq_uv_ac, dq_uv_dc;
305 const int num_segments = enc->segment_hdr_.num_segments_;
306 const double amp = SNS_TO_DQ * enc->config_->sns_strength / 100. / 128.;
307 const double Q = quality / 100.;
308 const double c_base = enc->config_->emulate_jpeg_size ?
309 QualityToJPEGCompression(Q, enc->alpha_ / 255.) :
310 QualityToCompression(Q);
311 for (i = 0; i < num_segments; ++i) {
312 // We modulate the base coefficient to accommodate for the quantization
313 // susceptibility and allow denser segments to be quantized more.
314 const double expn = 1. - amp * enc->dqm_[i].alpha_;
315 const double c = pow(c_base, expn);
316 const int q = (int)(127. * (1. - c));
318 enc->dqm_[i].quant_ = clip(q, 0, 127);
321 // purely indicative in the bitstream (except for the 1-segment case)
322 enc->base_quant_ = enc->dqm_[0].quant_;
324 // fill-in values for the unused segments (required by the syntax)
325 for (i = num_segments; i < NUM_MB_SEGMENTS; ++i) {
326 enc->dqm_[i].quant_ = enc->base_quant_;
329 // uv_alpha_ is normally spread around ~60. The useful range is
330 // typically ~30 (quite bad) to ~100 (ok to decimate UV more).
331 // We map it to the safe maximal range of MAX/MIN_DQ_UV for dq_uv.
332 dq_uv_ac = (enc->uv_alpha_ - MID_ALPHA) * (MAX_DQ_UV - MIN_DQ_UV)
333 / (MAX_ALPHA - MIN_ALPHA);
334 // we rescale by the user-defined strength of adaptation
335 dq_uv_ac = dq_uv_ac * enc->config_->sns_strength / 100;
337 dq_uv_ac = clip(dq_uv_ac, MIN_DQ_UV, MAX_DQ_UV);
338 // We also boost the dc-uv-quant a little, based on sns-strength, since
339 // U/V channels are quite more reactive to high quants (flat DC-blocks
340 // tend to appear, and are displeasant).
341 dq_uv_dc = -4 * enc->config_->sns_strength / 100;
342 dq_uv_dc = clip(dq_uv_dc, -15, 15); // 4bit-signed max allowed
344 enc->dq_y1_dc_ = 0; // TODO(skal): dq-lum
347 enc->dq_uv_dc_ = dq_uv_dc;
348 enc->dq_uv_ac_ = dq_uv_ac;
350 SetupFilterStrength(enc); // initialize segments' filtering, eventually
352 if (num_segments > 1) SimplifySegments(enc);
354 SetupMatrices(enc); // finalize quantization matrices
357 //------------------------------------------------------------------------------
358 // Form the predictions in cache
360 // Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index
361 const int VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 };
362 const int VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 };
364 // Must be indexed using {B_DC_PRED -> B_HU_PRED} as index
365 const int VP8I4ModeOffsets[NUM_BMODES] = {
366 I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4
369 void VP8MakeLuma16Preds(const VP8EncIterator* const it) {
370 const VP8Encoder* const enc = it->enc_;
371 const uint8_t* const left = it->x_ ? enc->y_left_ : NULL;
372 const uint8_t* const top = it->y_ ? enc->y_top_ + it->x_ * 16 : NULL;
373 VP8EncPredLuma16(it->yuv_p_, left, top);
376 void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
377 const VP8Encoder* const enc = it->enc_;
378 const uint8_t* const left = it->x_ ? enc->u_left_ : NULL;
379 const uint8_t* const top = it->y_ ? enc->uv_top_ + it->x_ * 16 : NULL;
380 VP8EncPredChroma8(it->yuv_p_, left, top);
383 void VP8MakeIntra4Preds(const VP8EncIterator* const it) {
384 VP8EncPredLuma4(it->yuv_p_, it->i4_top_);
387 //------------------------------------------------------------------------------
401 const int VP8Scan[16 + 4 + 4] = {
403 0 + 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS,
404 0 + 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS,
405 0 + 8 * BPS, 4 + 8 * BPS, 8 + 8 * BPS, 12 + 8 * BPS,
406 0 + 12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
408 0 + 0 * BPS, 4 + 0 * BPS, 0 + 4 * BPS, 4 + 4 * BPS, // U
409 8 + 0 * BPS, 12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS // V
412 //------------------------------------------------------------------------------
413 // Distortion measurement
415 static const uint16_t kWeightY[16] = {
416 38, 32, 20, 9, 32, 28, 17, 7, 20, 17, 10, 4, 9, 7, 4, 2
419 static const uint16_t kWeightTrellis[16] = {
// Flat weights: trellis distortion is unweighted here (each squared error is
// multiplied by 16 in TrellisQuantizeBlock()). NOTE(review): the original
// file guards an alternative, frequency-weighted table behind a compile-time
// flag that is not visible in this excerpt — confirm against upstream.
421 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
430 // Init/Copy the common fields in score.
431 static void InitScore(VP8ModeScore* const rd) {
436 rd->score = MAX_COST;
439 static void CopyScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
443 dst->nz = src->nz; // note that nz is not accumulated, but just copied.
444 dst->score = src->score;
447 static void AddScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
451 dst->nz |= src->nz; // here, new nz bits are accumulated.
452 dst->score += src->score;
455 //------------------------------------------------------------------------------
456 // Performs trellis-optimized quantization.
// NOTE(review): the 'typedef struct {' opener, the 'int level;' field (it is
// read later as node->level in TrellisQuantizeBlock) and the closing
// '} Node;' are missing from this excerpt — restore from upstream.
461 int prev; // best previous
463 int sign; // sign of coeff_i
464 score_t cost; // bit cost
465 score_t error; // distortion = sum of (|coeff_i| - level_i * Q_i)^2
466 int ctx; // context (only depends on 'level'. Could be spared.)
469 // If a coefficient was quantized to a value Q (using a neutral bias),
470 // we test all alternate possibilities between [Q-MIN_DELTA, Q+MAX_DELTA]
471 // We don't test negative values though.
472 #define MIN_DELTA 0 // how much lower level to try
473 #define MAX_DELTA 1 // how much higher
474 #define NUM_NODES (MIN_DELTA + 1 + MAX_DELTA)
475 #define NODE(n, l) (nodes[(n) + 1][(l) + MIN_DELTA])
477 static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
478 // TODO: incorporate the "* 256" in the tables?
479 rd->score = rd->R * lambda + 256 * (rd->D + rd->SD);
482 static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
483 score_t distortion) {
484 return rate * lambda + 256 * distortion;
// Performs trellis-optimized quantization of one block of 16 coefficients:
// for each coefficient it tries the neutral-bias level and nearby levels,
// and keeps the path (including the end-of-block position) with the best
// rate-distortion score. Writes quantized levels to out[], dequantized
// coefficients back to in[], and returns whether any level is non-zero.
// NOTE(review): this excerpt is missing many lines of the original function
// (declarations of 'n', 'm', 'p', 'nz', 'cost', 'score', 'max_error',
// 'best_score', 'level0', the 'lambda' parameter, several closing braces and
// the final return) — restore them from upstream before compiling. Only
// comments were added here.
487 static int TrellisQuantizeBlock(const VP8EncIterator* const it,
488 int16_t in[16], int16_t out[16],
489 int ctx0, int coeff_type,
490 const VP8Matrix* const mtx,
// 'last_costs': probabilities for the end-of-block decision; 'costs':
// per-level bit-cost tables. Both are indexed by coefficient band.
492 ProbaArray* const last_costs = it->enc_->proba_.coeffs_[coeff_type];
493 CostArray* const costs = it->enc_->proba_.level_cost_[coeff_type];
// For luma-AC of an i16 macroblock (coeff_type == 0), coefficient 0 is the
// DC handled by the separate WHT pass, so scanning starts at position 1.
494 const int first = (coeff_type == 0) ? 1 : 0;
495 Node nodes[17][NUM_NODES];
496 int best_path[3] = {-1, -1, -1}; // store best-last/best-level/best-previous
499 int last = first - 1;
505 const int thresh = mtx->q_[1] * mtx->q_[1] / 4;
506 const int last_proba = last_costs[VP8EncBands[first]][ctx0][0];
508 // compute maximal distortion.
510 for (n = first; n < 16; ++n) {
511 const int j = kZigzag[n];
512 const int err = in[j] * in[j];
513 max_error += kWeightTrellis[j] * err;
514 if (err > thresh) last = n;
516 // we don't need to go inspect up to n = 16 coeffs. We can just go up
517 // to last + 1 (inclusive) without losing much.
518 if (last < 15) ++last;
520 // compute 'skip' score. This is the max score one can do.
521 cost = VP8BitCost(0, last_proba);
522 best_score = RDScoreTrellis(lambda, cost, max_error);
524 // initialize source node.
526 for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
528 NODE(n, m).error = max_error;
529 NODE(n, m).ctx = ctx0;
// Traverse trellis: for each scan position, evaluate candidate levels.
534 for (n = first; n <= last; ++n) {
535 const int j = kZigzag[n];
536 const int Q = mtx->q_[j];
537 const int iQ = mtx->iq_[j];
538 const int B = BIAS(0x00); // neutral bias
539 // note: it's important to take sign of the _original_ coeff,
540 // so we don't have to consider level < 0 afterward.
541 const int sign = (in[j] < 0);
542 int coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
544 if (coeff0 > 2047) coeff0 = 2047;
546 level0 = QUANTDIV(coeff0, iQ, B);
547 // test all alternate level values around level0.
548 for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
549 Node* const cur = &NODE(n, m);
550 int delta_error, new_error;
551 score_t cur_score = MAX_COST;
552 int level = level0 + m;
557 cur->ctx = (level == 0) ? 0 : (level == 1) ? 1 : 2;
558 if (level >= 2048 || level < 0) { // node is dead?
559 cur->cost = MAX_COST;
562 last_proba = last_costs[VP8EncBands[n + 1]][cur->ctx][0];
564 // Compute delta_error = how much coding this level will
565 // subtract as distortion to max_error
566 new_error = coeff0 - level * Q;
568 kWeightTrellis[j] * (coeff0 * coeff0 - new_error * new_error);
570 // Inspect all possible non-dead predecessors. Retain only the best one.
571 for (p = -MIN_DELTA; p <= MAX_DELTA; ++p) {
572 const Node* const prev = &NODE(n - 1, p);
573 const int prev_ctx = prev->ctx;
574 const uint16_t* const tcost = costs[VP8EncBands[n]][prev_ctx];
575 const score_t total_error = prev->error - delta_error;
576 score_t cost, base_cost, score;
578 if (prev->cost >= MAX_COST) { // dead node?
582 // Base cost of both terminal/non-terminal
583 base_cost = prev->cost + VP8LevelCost(tcost, level);
585 // Examine node assuming it's a non-terminal one.
587 if (level && n < 15) {
588 cost += VP8BitCost(1, last_proba);
590 score = RDScoreTrellis(lambda, cost, total_error);
591 if (score < cur_score) {
594 cur->error = total_error;
598 // Now, record best terminal node (and thus best entry in the graph).
601 if (n < 15) cost += VP8BitCost(0, last_proba);
602 score = RDScoreTrellis(lambda, cost, total_error);
603 if (score < best_score) {
605 best_path[0] = n; // best eob position
606 best_path[1] = m; // best level
607 best_path[2] = p; // best predecessor
// Default outcome: fully zero the block (the 'skip' case), then fill in the
// retained levels by walking the best path backward.
615 memset(in + first, 0, (16 - first) * sizeof(*in));
616 memset(out + first, 0, (16 - first) * sizeof(*out));
617 if (best_path[0] == -1) {
621 // Unwind the best path.
622 // Note: best-prev on terminal node is not necessarily equal to the
623 // best_prev for non-terminal. So we patch best_path[2] in.
625 best_node = best_path[1];
626 NODE(n, best_node).prev = best_path[2]; // force best-prev for terminal
// Emit levels in scan order and write back the dequantized coefficients.
629 for (; n >= first; --n) {
630 const Node* const node = &NODE(n, best_node);
631 const int j = kZigzag[n];
632 out[n] = node->sign ? -node->level : node->level;
633 nz |= (node->level != 0);
634 in[j] = out[n] * mtx->q_[j];
635 best_node = node->prev;
642 //------------------------------------------------------------------------------
643 // Performs: difference, transform, quantize, back-transform, add
644 // all at once. Output is the reconstructed block in *yuv_out, and the
645 // quantized levels in *levels.
647 static int ReconstructIntra16(VP8EncIterator* const it,
648 VP8ModeScore* const rd,
649 uint8_t* const yuv_out,
651 const VP8Encoder* const enc = it->enc_;
652 const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
653 const uint8_t* const src = it->yuv_in_ + Y_OFF;
654 const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
657 int16_t tmp[16][16], dc_tmp[16];
659 for (n = 0; n < 16; ++n) {
660 VP8FTransform(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
662 VP8FTransformWHT(tmp[0], dc_tmp);
663 nz |= VP8EncQuantizeBlock(dc_tmp, rd->y_dc_levels, 0, &dqm->y2_) << 24;
665 if (DO_TRELLIS_I16 && it->do_trellis_) {
667 VP8IteratorNzToBytes(it);
668 for (y = 0, n = 0; y < 4; ++y) {
669 for (x = 0; x < 4; ++x, ++n) {
670 const int ctx = it->top_nz_[x] + it->left_nz_[y];
672 TrellisQuantizeBlock(it, tmp[n], rd->y_ac_levels[n], ctx, 0,
673 &dqm->y1_, dqm->lambda_trellis_i16_);
674 it->top_nz_[x] = it->left_nz_[y] = non_zero;
679 for (n = 0; n < 16; ++n) {
680 nz |= VP8EncQuantizeBlock(tmp[n], rd->y_ac_levels[n], 1, &dqm->y1_) << n;
685 VP8ITransformWHT(dc_tmp, tmp[0]);
686 for (n = 0; n < 16; n += 2) {
687 VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n], 1);
693 static int ReconstructIntra4(VP8EncIterator* const it,
695 const uint8_t* const src,
696 uint8_t* const yuv_out,
698 const VP8Encoder* const enc = it->enc_;
699 const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
700 const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
704 VP8FTransform(src, ref, tmp);
705 if (DO_TRELLIS_I4 && it->do_trellis_) {
706 const int x = it->i4_ & 3, y = it->i4_ >> 2;
707 const int ctx = it->top_nz_[x] + it->left_nz_[y];
708 nz = TrellisQuantizeBlock(it, tmp, levels, ctx, 3, &dqm->y1_,
709 dqm->lambda_trellis_i4_);
711 nz = VP8EncQuantizeBlock(tmp, levels, 0, &dqm->y1_);
713 VP8ITransform(ref, tmp, yuv_out, 0);
717 static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
718 uint8_t* const yuv_out, int mode) {
719 const VP8Encoder* const enc = it->enc_;
720 const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
721 const uint8_t* const src = it->yuv_in_ + U_OFF;
722 const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
727 for (n = 0; n < 8; ++n) {
728 VP8FTransform(src + VP8Scan[16 + n], ref + VP8Scan[16 + n], tmp[n]);
730 if (DO_TRELLIS_UV && it->do_trellis_) {
732 for (ch = 0, n = 0; ch <= 2; ch += 2) {
733 for (y = 0; y < 2; ++y) {
734 for (x = 0; x < 2; ++x, ++n) {
735 const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
737 TrellisQuantizeBlock(it, tmp[n], rd->uv_levels[n], ctx, 2,
738 &dqm->uv_, dqm->lambda_trellis_uv_);
739 it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = non_zero;
745 for (n = 0; n < 8; ++n) {
746 nz |= VP8EncQuantizeBlock(tmp[n], rd->uv_levels[n], 0, &dqm->uv_) << n;
750 for (n = 0; n < 8; n += 2) {
751 VP8ITransform(ref + VP8Scan[16 + n], tmp[n], yuv_out + VP8Scan[16 + n], 1);
756 //------------------------------------------------------------------------------
757 // RD-opt decision. Reconstruct each modes, evalue distortion and bit-cost.
758 // Pick the mode is lower RD-cost = Rate + lamba * Distortion.
760 static void SwapPtr(uint8_t** a, uint8_t** b) {
761 uint8_t* const tmp = *a;
766 static void SwapOut(VP8EncIterator* const it) {
767 SwapPtr(&it->yuv_out_, &it->yuv_out2_);
770 static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* const rd) {
771 const VP8Encoder* const enc = it->enc_;
772 const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
773 const int lambda = dqm->lambda_i16_;
774 const int tlambda = dqm->tlambda_;
775 const uint8_t* const src = it->yuv_in_ + Y_OFF;
780 for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
781 uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF; // scratch buffer
785 nz = ReconstructIntra16(it, &rd16, tmp_dst, mode);
788 rd16.D = VP8SSE16x16(src, tmp_dst);
789 rd16.SD = tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY))
791 rd16.R = VP8GetCostLuma16(it, &rd16);
792 rd16.R += VP8FixedCostsI16[mode];
794 // Since we always examine Intra16 first, we can overwrite *rd directly.
795 SetRDScore(lambda, &rd16);
796 if (mode == 0 || rd16.score < rd->score) {
797 CopyScore(rd, &rd16);
800 memcpy(rd->y_ac_levels, rd16.y_ac_levels, sizeof(rd16.y_ac_levels));
801 memcpy(rd->y_dc_levels, rd16.y_dc_levels, sizeof(rd16.y_dc_levels));
805 SetRDScore(dqm->lambda_mode_, rd); // finalize score for mode decision.
806 VP8SetIntra16Mode(it, rd->mode_i16);
809 //------------------------------------------------------------------------------
811 // return the cost array corresponding to the surrounding prediction modes.
812 static const uint16_t* GetCostModeI4(VP8EncIterator* const it,
813 const uint8_t modes[16]) {
814 const int preds_w = it->enc_->preds_w_;
815 const int x = (it->i4_ & 3), y = it->i4_ >> 2;
816 const int left = (x == 0) ? it->preds_[y * preds_w - 1] : modes[it->i4_ - 1];
817 const int top = (y == 0) ? it->preds_[-preds_w + x] : modes[it->i4_ - 4];
818 return VP8FixedCostsI4[top][left];
// Evaluates the ten intra4 modes for each of the 16 sub-blocks, accumulating
// the RD-score; bails out early (returning 0 = keep intra16) as soon as the
// accumulated score exceeds the intra16 score or the i4 header-bit budget.
// Returns 1 when intra4x4 wins over intra16x16.
// NOTE(review): this excerpt is missing several lines of the original
// function (locals such as 'mode', 'best_mode', 'rd_i4', 'rd_tmp', the
// InitScore()/early-return bodies and various closing braces) — restore them
// from upstream before compiling. Only comments were added here.
821 static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
822 const VP8Encoder* const enc = it->enc_;
823 const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
824 const int lambda = dqm->lambda_i4_;
825 const int tlambda = dqm->tlambda_;
826 const uint8_t* const src0 = it->yuv_in_ + Y_OFF;
827 uint8_t* const best_blocks = it->yuv_out2_ + Y_OFF;
828 int total_header_bits = 0;
829 VP8ModeScore rd_best;
// A zero header-bit budget disables intra4 evaluation entirely.
831 if (enc->max_i4_header_bits_ == 0) {
836 rd_best.score = 211; // '211' is the value of VP8BitCost(0, 145)
837 VP8IteratorStartI4(it);
// Per-sub-block loop: try all ten modes for the current 4x4 block.
842 const uint8_t* const src = src0 + VP8Scan[it->i4_];
843 const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
844 uint8_t* best_block = best_blocks + VP8Scan[it->i4_];
845 uint8_t* tmp_dst = it->yuv_p_ + I4TMP; // scratch buffer.
848 VP8MakeIntra4Preds(it);
849 for (mode = 0; mode < NUM_BMODES; ++mode) {
851 int16_t tmp_levels[16];
855 ReconstructIntra4(it, tmp_levels, src, tmp_dst, mode) << it->i4_;
858 rd_tmp.D = VP8SSE4x4(src, tmp_dst);
860 tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
862 rd_tmp.R = VP8GetCostLuma4(it, tmp_levels);
863 rd_tmp.R += mode_costs[mode];
865 SetRDScore(lambda, &rd_tmp);
866 if (best_mode < 0 || rd_tmp.score < rd_i4.score) {
867 CopyScore(&rd_i4, &rd_tmp);
// Keep the winning reconstruction by swapping scratch and best buffers.
869 SwapPtr(&tmp_dst, &best_block);
870 memcpy(rd_best.y_ac_levels[it->i4_], tmp_levels, sizeof(tmp_levels));
873 SetRDScore(dqm->lambda_mode_, &rd_i4);
874 AddScore(&rd_best, &rd_i4);
875 total_header_bits += mode_costs[best_mode];
// Early-out: intra4 already costs more than intra16, or budget exceeded.
876 if (rd_best.score >= rd->score ||
877 total_header_bits > enc->max_i4_header_bits_) {
880 // Copy selected samples if not in the right place already.
881 if (best_block != best_blocks + VP8Scan[it->i4_])
882 VP8Copy4x4(best_block, best_blocks + VP8Scan[it->i4_]);
883 rd->modes_i4[it->i4_] = best_mode;
884 it->top_nz_[it->i4_ & 3] = it->left_nz_[it->i4_ >> 2] = (rd_i4.nz ? 1 : 0);
885 } while (VP8IteratorRotateI4(it, best_blocks));
888 CopyScore(rd, &rd_best);
889 VP8SetIntra4Mode(it, rd->modes_i4);
891 memcpy(rd->y_ac_levels, rd_best.y_ac_levels, sizeof(rd->y_ac_levels));
892 return 1; // select intra4x4 over intra16x16
895 //------------------------------------------------------------------------------
897 static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
898 const VP8Encoder* const enc = it->enc_;
899 const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
900 const int lambda = dqm->lambda_uv_;
901 const uint8_t* const src = it->yuv_in_ + U_OFF;
902 uint8_t* const tmp_dst = it->yuv_out2_ + U_OFF; // scratch buffer
903 uint8_t* const dst0 = it->yuv_out_ + U_OFF;
904 VP8ModeScore rd_best;
909 for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
913 rd_uv.nz = ReconstructUV(it, &rd_uv, tmp_dst, mode);
916 rd_uv.D = VP8SSE16x8(src, tmp_dst);
917 rd_uv.SD = 0; // TODO: should we call TDisto? it tends to flatten areas.
918 rd_uv.R = VP8GetCostUV(it, &rd_uv);
919 rd_uv.R += VP8FixedCostsUV[mode];
921 SetRDScore(lambda, &rd_uv);
922 if (mode == 0 || rd_uv.score < rd_best.score) {
923 CopyScore(&rd_best, &rd_uv);
925 memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels));
926 memcpy(dst0, tmp_dst, UV_SIZE); // TODO: SwapUVOut() ?
929 VP8SetIntraUVMode(it, rd->mode_uv);
930 AddScore(rd, &rd_best);
933 //------------------------------------------------------------------------------
934 // Final reconstruction and quantization.
936 static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
937 const VP8Encoder* const enc = it->enc_;
938 const int is_i16 = (it->mb_->type_ == 1);
942 nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF, it->preds_[0]);
944 VP8IteratorStartI4(it);
947 it->preds_[(it->i4_ & 3) + (it->i4_ >> 2) * enc->preds_w_];
948 const uint8_t* const src = it->yuv_in_ + Y_OFF + VP8Scan[it->i4_];
949 uint8_t* const dst = it->yuv_out_ + Y_OFF + VP8Scan[it->i4_];
950 VP8MakeIntra4Preds(it);
951 nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
952 src, dst, mode) << it->i4_;
953 } while (VP8IteratorRotateI4(it, it->yuv_out_ + Y_OFF));
956 nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF, it->mb_->uv_mode_);
960 // Refine intra16/intra4 sub-modes based on distortion only (not rate).
961 static void DistoRefine(VP8EncIterator* const it, int try_both_i4_i16) {
962 const int is_i16 = (it->mb_->type_ == 1);
963 score_t best_score = MAX_COST;
965 if (try_both_i4_i16 || is_i16) {
968 for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
969 const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
970 const uint8_t* const src = it->yuv_in_ + Y_OFF;
971 const score_t score = VP8SSE16x16(src, ref);
972 if (score < best_score) {
977 VP8SetIntra16Mode(it, best_mode);
979 if (try_both_i4_i16 || !is_i16) {
980 uint8_t modes_i4[16];
981 // We don't evaluate the rate here, but just account for it through a
982 // constant penalty (i4 mode usually needs more bits compared to i16).
983 score_t score_i4 = (score_t)I4_PENALTY;
985 VP8IteratorStartI4(it);
988 int best_sub_mode = -1;
989 score_t best_sub_score = MAX_COST;
990 const uint8_t* const src = it->yuv_in_ + Y_OFF + VP8Scan[it->i4_];
992 // TODO(skal): we don't really need the prediction pixels here,
993 // but just the distortion against 'src'.
994 VP8MakeIntra4Preds(it);
995 for (mode = 0; mode < NUM_BMODES; ++mode) {
996 const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
997 const score_t score = VP8SSE4x4(src, ref);
998 if (score < best_sub_score) {
999 best_sub_mode = mode;
1000 best_sub_score = score;
1003 modes_i4[it->i4_] = best_sub_mode;
1004 score_i4 += best_sub_score;
1005 if (score_i4 >= best_score) break;
1006 } while (VP8IteratorRotateI4(it, it->yuv_in_ + Y_OFF));
1007 if (score_i4 < best_score) {
1008 VP8SetIntra4Mode(it, modes_i4);
1013 //------------------------------------------------------------------------------
1016 int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
1017 VP8RDLevel rd_opt) {
1019 const int method = it->enc_->method_;
1023 // We can perform predictions for Luma16x16 and Chroma8x8 already.
1024 // Luma4x4 predictions needs to be done as-we-go.
1025 VP8MakeLuma16Preds(it);
1026 VP8MakeChroma8Preds(it);
1028 if (rd_opt > RD_OPT_NONE) {
1029 it->do_trellis_ = (rd_opt >= RD_OPT_TRELLIS_ALL);
1030 PickBestIntra16(it, rd);
1032 PickBestIntra4(it, rd);
1035 if (rd_opt == RD_OPT_TRELLIS) { // finish off with trellis-optim now
1036 it->do_trellis_ = 1;
1037 SimpleQuantize(it, rd);
1040 // For method == 2, pick the best intra4/intra16 based on SSE (~tad slower).
1041 // For method <= 1, we refine intra4 or intra16 (but don't re-examine mode).
1042 DistoRefine(it, (method >= 2));
1043 SimpleQuantize(it, rd);
1045 is_skipped = (rd->nz == 0);
1046 VP8SetSkip(it, is_skipped);
1050 #if defined(__cplusplus) || defined(c_plusplus)