src/third_party/libwebp/dsp/enc.c

   1 // Copyright 2011 Google Inc. All Rights Reserved.
   2 //
   3 // Use of this source code is governed by a BSD-style license
   4 // that can be found in the COPYING file in the root of the source
   5 // tree. An additional intellectual property rights grant can be found
   6 // in the file PATENTS. All contributing project authors may
   7 // be found in the AUTHORS file in the root of the source tree.
   8 // -----------------------------------------------------------------------------
   9 //
  10 // Speed-critical encoding functions.
  11 //
  12 // Author: Skal (pascal.massimino@gmail.com)
  13
  14 #include <stdlib.h>  // for abs()
  15 #include "./dsp.h"
  16 #include "../enc/vp8enci.h"
  17
  18 #if defined(__cplusplus) || defined(c_plusplus)
  19 extern "C" {
  20 #endif
  21
  22 static WEBP_INLINE uint8_t clip_8b(int v) {
  23   return (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
  24 }
  25
  26 static WEBP_INLINE int clip_max(int v, int max) {
  27   return (v > max) ? max : v;
  28 }
  29
  30 //------------------------------------------------------------------------------
  31 // Compute susceptibility based on DCT-coeff histograms:
  32 // the higher, the "easier" the macroblock is to compress.
  33
  34 const int VP8DspScan[16 + 4 + 4] = {
  35   // Luma
  36   0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
  37   0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
  38   0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
  39   0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
  40
  41   0 + 0 * BPS,   4 + 0 * BPS, 0 + 4 * BPS,  4 + 4 * BPS,    // U
  42   8 + 0 * BPS,  12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS     // V
  43 };
  44
  45 static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
  46                              int start_block, int end_block,
  47                              VP8Histogram* const histo) {
  48   int j;
  49   for (j = start_block; j < end_block; ++j) {
  50     int k;
  51     int16_t out[16];
  52
  53     VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
  54
  55     // Convert coefficients to bin.
  56     for (k = 0; k < 16; ++k) {
  57       const int v = abs(out[k]) >> 3;  // TODO(skal): add rounding?
  58       const int clipped_value = clip_max(v, MAX_COEFF_THRESH);
  59       histo->distribution[clipped_value]++;
  60     }
  61   }
  62 }
  63
  64 //------------------------------------------------------------------------------
  65 // run-time tables (~4k)
  66
  67 static uint8_t clip1[255 + 510 + 1];    // clips [-255,510] to [0,255]
  68
  69 // We declare this variable 'volatile' to prevent instruction reordering
  70 // and make sure it's set to true _last_ (so as to be thread-safe)
  71 static volatile int tables_ok = 0;
  72
  73 static void InitTables(void) {
  74   if (!tables_ok) {
  75     int i;
  76     for (i = -255; i <= 255 + 255; ++i) {
  77       clip1[255 + i] = clip_8b(i);
  78     }
  79     tables_ok = 1;
  80   }
  81 }
  82
  83
  84 //------------------------------------------------------------------------------
  85 // Transforms (Paragraph 14.4)
  86
  87 #define STORE(x, y, v) \
  88   dst[(x) + (y) * BPS] = clip_8b(ref[(x) + (y) * BPS] + ((v) >> 3))
  89
  90 static const int kC1 = 20091 + (1 << 16);
  91 static const int kC2 = 35468;
  92 #define MUL(a, b) (((a) * (b)) >> 16)
  93
  94 static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
  95                                       uint8_t* dst) {
  96   int C[4 * 4], *tmp;
  97   int i;
  98   tmp = C;
  99   for (i = 0; i < 4; ++i) {    // vertical pass
 100     const int a = in[0] + in[8];
 101     const int b = in[0] - in[8];
 102     const int c = MUL(in[4], kC2) - MUL(in[12], kC1);
 103     const int d = MUL(in[4], kC1) + MUL(in[12], kC2);
 104     tmp[0] = a + d;
 105     tmp[1] = b + c;
 106     tmp[2] = b - c;
 107     tmp[3] = a - d;
 108     tmp += 4;
 109     in++;
 110   }
 111
 112   tmp = C;
 113   for (i = 0; i < 4; ++i) {    // horizontal pass
 114     const int dc = tmp[0] + 4;
 115     const int a =  dc +  tmp[8];
 116     const int b =  dc -  tmp[8];
 117     const int c = MUL(tmp[4], kC2) - MUL(tmp[12], kC1);
 118     const int d = MUL(tmp[4], kC1) + MUL(tmp[12], kC2);
 119     STORE(0, i, a + d);
 120     STORE(1, i, b + c);
 121     STORE(2, i, b - c);
 122     STORE(3, i, a - d);
 123     tmp++;
 124   }
 125 }
 126
 127 static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
 128                        int do_two) {
 129   ITransformOne(ref, in, dst);
 130   if (do_two) {
 131     ITransformOne(ref + 4, in + 16, dst + 4);
 132   }
 133 }
 134
 135 static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
 136   int i;
 137   int tmp[16];
 138   for (i = 0; i < 4; ++i, src += BPS, ref += BPS) {
 139     const int d0 = src[0] - ref[0];   // 9bit dynamic range ([-255,255])
 140     const int d1 = src[1] - ref[1];
 141     const int d2 = src[2] - ref[2];
 142     const int d3 = src[3] - ref[3];
 143     const int a0 = (d0 + d3);         // 10b                      [-510,510]
 144     const int a1 = (d1 + d2);
 145     const int a2 = (d1 - d2);
 146     const int a3 = (d0 - d3);
 147     tmp[0 + i * 4] = (a0 + a1) * 8;   // 14b                      [-8160,8160]
 148     tmp[1 + i * 4] = (a2 * 2217 + a3 * 5352 + 1812) >> 9;      // [-7536,7542]
 149     tmp[2 + i * 4] = (a0 - a1) * 8;
 150     tmp[3 + i * 4] = (a3 * 2217 - a2 * 5352 +  937) >> 9;
 151   }
 152   for (i = 0; i < 4; ++i) {
 153     const int a0 = (tmp[0 + i] + tmp[12 + i]);  // 15b
 154     const int a1 = (tmp[4 + i] + tmp[ 8 + i]);
 155     const int a2 = (tmp[4 + i] - tmp[ 8 + i]);
 156     const int a3 = (tmp[0 + i] - tmp[12 + i]);
 157     out[0 + i] = (a0 + a1 + 7) >> 4;            // 12b
 158     out[4 + i] = ((a2 * 2217 + a3 * 5352 + 12000) >> 16) + (a3 != 0);
 159     out[8 + i] = (a0 - a1 + 7) >> 4;
 160     out[12+ i] = ((a3 * 2217 - a2 * 5352 + 51000) >> 16);
 161   }
 162 }
 163
 164 static void ITransformWHT(const int16_t* in, int16_t* out) {
 165   int tmp[16];
 166   int i;
 167   for (i = 0; i < 4; ++i) {
 168     const int a0 = in[0 + i] + in[12 + i];
 169     const int a1 = in[4 + i] + in[ 8 + i];
 170     const int a2 = in[4 + i] - in[ 8 + i];
 171     const int a3 = in[0 + i] - in[12 + i];
 172     tmp[0  + i] = a0 + a1;
 173     tmp[8  + i] = a0 - a1;
 174     tmp[4  + i] = a3 + a2;
 175     tmp[12 + i] = a3 - a2;
 176   }
 177   for (i = 0; i < 4; ++i) {
 178     const int dc = tmp[0 + i * 4] + 3;    // w/ rounder
 179     const int a0 = dc             + tmp[3 + i * 4];
 180     const int a1 = tmp[1 + i * 4] + tmp[2 + i * 4];
 181     const int a2 = tmp[1 + i * 4] - tmp[2 + i * 4];
 182     const int a3 = dc             - tmp[3 + i * 4];
 183     out[ 0] = (a0 + a1) >> 3;
 184     out[16] = (a3 + a2) >> 3;
 185     out[32] = (a0 - a1) >> 3;
 186     out[48] = (a3 - a2) >> 3;
 187     out += 64;
 188   }
 189 }
 190
 191 static void FTransformWHT(const int16_t* in, int16_t* out) {
 192   // input is 12b signed
 193   int16_t tmp[16];
 194   int i;
 195   for (i = 0; i < 4; ++i, in += 64) {
 196     const int a0 = (in[0 * 16] + in[2 * 16]);  // 13b
 197     const int a1 = (in[1 * 16] + in[3 * 16]);
 198     const int a2 = (in[1 * 16] - in[3 * 16]);
 199     const int a3 = (in[0 * 16] - in[2 * 16]);
 200     tmp[0 + i * 4] = a0 + a1;   // 14b
 201     tmp[1 + i * 4] = a3 + a2;
 202     tmp[2 + i * 4] = a3 - a2;
 203     tmp[3 + i * 4] = a0 - a1;
 204   }
 205   for (i = 0; i < 4; ++i) {
 206     const int a0 = (tmp[0 + i] + tmp[8 + i]);  // 15b
 207     const int a1 = (tmp[4 + i] + tmp[12+ i]);
 208     const int a2 = (tmp[4 + i] - tmp[12+ i]);
 209     const int a3 = (tmp[0 + i] - tmp[8 + i]);
 210     const int b0 = a0 + a1;    // 16b
 211     const int b1 = a3 + a2;
 212     const int b2 = a3 - a2;
 213     const int b3 = a0 - a1;
 214     out[ 0 + i] = b0 >> 1;     // 15b
 215     out[ 4 + i] = b1 >> 1;
 216     out[ 8 + i] = b2 >> 1;
 217     out[12 + i] = b3 >> 1;
 218   }
 219 }
 220
 221 #undef MUL
 222 #undef STORE
 223
 224 //------------------------------------------------------------------------------
 225 // Intra predictions
 226
 227 #define DST(x, y) dst[(x) + (y) * BPS]
 228
 229 static WEBP_INLINE void Fill(uint8_t* dst, int value, int size) {
 230   int j;
 231   for (j = 0; j < size; ++j) {
 232     memset(dst + j * BPS, value, size);
 233   }
 234 }
 235
 236 static WEBP_INLINE void VerticalPred(uint8_t* dst,
 237                                      const uint8_t* top, int size) {
 238   int j;
 239   if (top) {
 240     for (j = 0; j < size; ++j) memcpy(dst + j * BPS, top, size);
 241   } else {
 242     Fill(dst, 127, size);
 243   }
 244 }
 245
 246 static WEBP_INLINE void HorizontalPred(uint8_t* dst,
 247                                        const uint8_t* left, int size) {
 248   if (left) {
 249     int j;
 250     for (j = 0; j < size; ++j) {
 251       memset(dst + j * BPS, left[j], size);
 252     }
 253   } else {
 254     Fill(dst, 129, size);
 255   }
 256 }
 257
 258 static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left,
 259                                    const uint8_t* top, int size) {
 260   int y;
 261   if (left) {
 262     if (top) {
 263       const uint8_t* const clip = clip1 + 255 - left[-1];
 264       for (y = 0; y < size; ++y) {
 265         const uint8_t* const clip_table = clip + left[y];
 266         int x;
 267         for (x = 0; x < size; ++x) {
 268           dst[x] = clip_table[top[x]];
 269         }
 270         dst += BPS;
 271       }
 272     } else {
 273       HorizontalPred(dst, left, size);
 274     }
 275   } else {
 276     // true motion without left samples (hence: with default 129 value)
 277     // is equivalent to VE prediction where you just copy the top samples.
 278     // Note that if top samples are not available, the default value is
 279     // then 129, and not 127 as in the VerticalPred case.
 280     if (top) {
 281       VerticalPred(dst, top, size);
 282     } else {
 283       Fill(dst, 129, size);
 284     }
 285   }
 286 }
 287
 288 static WEBP_INLINE void DCMode(uint8_t* dst, const uint8_t* left,
 289                                const uint8_t* top,
 290                                int size, int round, int shift) {
 291   int DC = 0;
 292   int j;
 293   if (top) {
 294     for (j = 0; j < size; ++j) DC += top[j];
 295     if (left) {   // top and left present
 296       for (j = 0; j < size; ++j) DC += left[j];
 297     } else {      // top, but no left
 298       DC += DC;
 299     }
 300     DC = (DC + round) >> shift;
 301   } else if (left) {   // left but no top
 302     for (j = 0; j < size; ++j) DC += left[j];
 303     DC += DC;
 304     DC = (DC + round) >> shift;
 305   } else {   // no top, no left, nothing.
 306     DC = 0x80;
 307   }
 308   Fill(dst, DC, size);
 309 }
 310
 311 //------------------------------------------------------------------------------
 312 // Chroma 8x8 prediction (paragraph 12.2)
 313
 314 static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
 315                              const uint8_t* top) {
 316   // U block
 317   DCMode(C8DC8 + dst, left, top, 8, 8, 4);
 318   VerticalPred(C8VE8 + dst, top, 8);
 319   HorizontalPred(C8HE8 + dst, left, 8);
 320   TrueMotion(C8TM8 + dst, left, top, 8);
 321   // V block
 322   dst += 8;
 323   if (top) top += 8;
 324   if (left) left += 16;
 325   DCMode(C8DC8 + dst, left, top, 8, 8, 4);
 326   VerticalPred(C8VE8 + dst, top, 8);
 327   HorizontalPred(C8HE8 + dst, left, 8);
 328   TrueMotion(C8TM8 + dst, left, top, 8);
 329 }
 330
 331 //------------------------------------------------------------------------------
 332 // luma 16x16 prediction (paragraph 12.3)
 333
 334 static void Intra16Preds(uint8_t* dst,
 335                          const uint8_t* left, const uint8_t* top) {
 336   DCMode(I16DC16 + dst, left, top, 16, 16, 5);
 337   VerticalPred(I16VE16 + dst, top, 16);
 338   HorizontalPred(I16HE16 + dst, left, 16);
 339   TrueMotion(I16TM16 + dst, left, top, 16);
 340 }
 341
 342 //------------------------------------------------------------------------------
 343 // luma 4x4 prediction
 344
 345 #define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
 346 #define AVG2(a, b) (((a) + (b) + 1) >> 1)
 347
 348 static void VE4(uint8_t* dst, const uint8_t* top) {    // vertical
 349   const uint8_t vals[4] = {
 350     AVG3(top[-1], top[0], top[1]),
 351     AVG3(top[ 0], top[1], top[2]),
 352     AVG3(top[ 1], top[2], top[3]),
 353     AVG3(top[ 2], top[3], top[4])
 354   };
 355   int i;
 356   for (i = 0; i < 4; ++i) {
 357     memcpy(dst + i * BPS, vals, 4);
 358   }
 359 }
 360
 361 static void HE4(uint8_t* dst, const uint8_t* top) {    // horizontal
 362   const int X = top[-1];
 363   const int I = top[-2];
 364   const int J = top[-3];
 365   const int K = top[-4];
 366   const int L = top[-5];
 367   *(uint32_t*)(dst + 0 * BPS) = 0x01010101U * AVG3(X, I, J);
 368   *(uint32_t*)(dst + 1 * BPS) = 0x01010101U * AVG3(I, J, K);
 369   *(uint32_t*)(dst + 2 * BPS) = 0x01010101U * AVG3(J, K, L);
 370   *(uint32_t*)(dst + 3 * BPS) = 0x01010101U * AVG3(K, L, L);
 371 }
 372
 373 static void DC4(uint8_t* dst, const uint8_t* top) {
 374   uint32_t dc = 4;
 375   int i;
 376   for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i];
 377   Fill(dst, dc >> 3, 4);
 378 }
 379
 380 static void RD4(uint8_t* dst, const uint8_t* top) {
 381   const int X = top[-1];
 382   const int I = top[-2];
 383   const int J = top[-3];
 384   const int K = top[-4];
 385   const int L = top[-5];
 386   const int A = top[0];
 387   const int B = top[1];
 388   const int C = top[2];
 389   const int D = top[3];
 390   DST(0, 3)                                     = AVG3(J, K, L);
 391   DST(0, 2) = DST(1, 3)                         = AVG3(I, J, K);
 392   DST(0, 1) = DST(1, 2) = DST(2, 3)             = AVG3(X, I, J);
 393   DST(0, 0) = DST(1, 1) = DST(2, 2) = DST(3, 3) = AVG3(A, X, I);
 394   DST(1, 0) = DST(2, 1) = DST(3, 2)             = AVG3(B, A, X);
 395   DST(2, 0) = DST(3, 1)                         = AVG3(C, B, A);
 396   DST(3, 0)                                     = AVG3(D, C, B);
 397 }
 398
 399 static void LD4(uint8_t* dst, const uint8_t* top) {
 400   const int A = top[0];
 401   const int B = top[1];
 402   const int C = top[2];
 403   const int D = top[3];
 404   const int E = top[4];
 405   const int F = top[5];
 406   const int G = top[6];
 407   const int H = top[7];
 408   DST(0, 0)                                     = AVG3(A, B, C);
 409   DST(1, 0) = DST(0, 1)                         = AVG3(B, C, D);
 410   DST(2, 0) = DST(1, 1) = DST(0, 2)             = AVG3(C, D, E);
 411   DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F);
 412   DST(3, 1) = DST(2, 2) = DST(1, 3)             = AVG3(E, F, G);
 413   DST(3, 2) = DST(2, 3)                         = AVG3(F, G, H);
 414   DST(3, 3)                                     = AVG3(G, H, H);
 415 }
 416
 417 static void VR4(uint8_t* dst, const uint8_t* top) {
 418   const int X = top[-1];
 419   const int I = top[-2];
 420   const int J = top[-3];
 421   const int K = top[-4];
 422   const int A = top[0];
 423   const int B = top[1];
 424   const int C = top[2];
 425   const int D = top[3];
 426   DST(0, 0) = DST(1, 2) = AVG2(X, A);
 427   DST(1, 0) = DST(2, 2) = AVG2(A, B);
 428   DST(2, 0) = DST(3, 2) = AVG2(B, C);
 429   DST(3, 0)             = AVG2(C, D);
 430
 431   DST(0, 3) =             AVG3(K, J, I);
 432   DST(0, 2) =             AVG3(J, I, X);
 433   DST(0, 1) = DST(1, 3) = AVG3(I, X, A);
 434   DST(1, 1) = DST(2, 3) = AVG3(X, A, B);
 435   DST(2, 1) = DST(3, 3) = AVG3(A, B, C);
 436   DST(3, 1) =             AVG3(B, C, D);
 437 }
 438
 439 static void VL4(uint8_t* dst, const uint8_t* top) {
 440   const int A = top[0];
 441   const int B = top[1];
 442   const int C = top[2];
 443   const int D = top[3];
 444   const int E = top[4];
 445   const int F = top[5];
 446   const int G = top[6];
 447   const int H = top[7];
 448   DST(0, 0) =             AVG2(A, B);
 449   DST(1, 0) = DST(0, 2) = AVG2(B, C);
 450   DST(2, 0) = DST(1, 2) = AVG2(C, D);
 451   DST(3, 0) = DST(2, 2) = AVG2(D, E);
 452
 453   DST(0, 1) =             AVG3(A, B, C);
 454   DST(1, 1) = DST(0, 3) = AVG3(B, C, D);
 455   DST(2, 1) = DST(1, 3) = AVG3(C, D, E);
 456   DST(3, 1) = DST(2, 3) = AVG3(D, E, F);
 457               DST(3, 2) = AVG3(E, F, G);
 458               DST(3, 3) = AVG3(F, G, H);
 459 }
 460
 461 static void HU4(uint8_t* dst, const uint8_t* top) {
 462   const int I = top[-2];
 463   const int J = top[-3];
 464   const int K = top[-4];
 465   const int L = top[-5];
 466   DST(0, 0) =             AVG2(I, J);
 467   DST(2, 0) = DST(0, 1) = AVG2(J, K);
 468   DST(2, 1) = DST(0, 2) = AVG2(K, L);
 469   DST(1, 0) =             AVG3(I, J, K);
 470   DST(3, 0) = DST(1, 1) = AVG3(J, K, L);
 471   DST(3, 1) = DST(1, 2) = AVG3(K, L, L);
 472   DST(3, 2) = DST(2, 2) =
 473   DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
 474 }
 475
 476 static void HD4(uint8_t* dst, const uint8_t* top) {
 477   const int X = top[-1];
 478   const int I = top[-2];
 479   const int J = top[-3];
 480   const int K = top[-4];
 481   const int L = top[-5];
 482   const int A = top[0];
 483   const int B = top[1];
 484   const int C = top[2];
 485
 486   DST(0, 0) = DST(2, 1) = AVG2(I, X);
 487   DST(0, 1) = DST(2, 2) = AVG2(J, I);
 488   DST(0, 2) = DST(2, 3) = AVG2(K, J);
 489   DST(0, 3)             = AVG2(L, K);
 490
 491   DST(3, 0)             = AVG3(A, B, C);
 492   DST(2, 0)             = AVG3(X, A, B);
 493   DST(1, 0) = DST(3, 1) = AVG3(I, X, A);
 494   DST(1, 1) = DST(3, 2) = AVG3(J, I, X);
 495   DST(1, 2) = DST(3, 3) = AVG3(K, J, I);
 496   DST(1, 3)             = AVG3(L, K, J);
 497 }
 498
 499 static void TM4(uint8_t* dst, const uint8_t* top) {
 500   int x, y;
 501   const uint8_t* const clip = clip1 + 255 - top[-1];
 502   for (y = 0; y < 4; ++y) {
 503     const uint8_t* const clip_table = clip + top[-2 - y];
 504     for (x = 0; x < 4; ++x) {
 505       dst[x] = clip_table[top[x]];
 506     }
 507     dst += BPS;
 508   }
 509 }
 510
 511 #undef DST
 512 #undef AVG3
 513 #undef AVG2
 514
 515 // Left samples are top[-5 .. -2], top_left is top[-1], top are
 516 // located at top[0..3], and top right is top[4..7]
 517 static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
 518   DC4(I4DC4 + dst, top);
 519   TM4(I4TM4 + dst, top);
 520   VE4(I4VE4 + dst, top);
 521   HE4(I4HE4 + dst, top);
 522   RD4(I4RD4 + dst, top);
 523   VR4(I4VR4 + dst, top);
 524   LD4(I4LD4 + dst, top);
 525   VL4(I4VL4 + dst, top);
 526   HD4(I4HD4 + dst, top);
 527   HU4(I4HU4 + dst, top);
 528 }
 529
 530 //------------------------------------------------------------------------------
 531 // Metric
 532
 533 static WEBP_INLINE int GetSSE(const uint8_t* a, const uint8_t* b,
 534                               int w, int h) {
 535   int count = 0;
 536   int y, x;
 537   for (y = 0; y < h; ++y) {
 538     for (x = 0; x < w; ++x) {
 539       const int diff = (int)a[x] - b[x];
 540       count += diff * diff;
 541     }
 542     a += BPS;
 543     b += BPS;
 544   }
 545   return count;
 546 }
 547
 548 static int SSE16x16(const uint8_t* a, const uint8_t* b) {
 549   return GetSSE(a, b, 16, 16);
 550 }
 551 static int SSE16x8(const uint8_t* a, const uint8_t* b) {
 552   return GetSSE(a, b, 16, 8);
 553 }
 554 static int SSE8x8(const uint8_t* a, const uint8_t* b) {
 555   return GetSSE(a, b, 8, 8);
 556 }
 557 static int SSE4x4(const uint8_t* a, const uint8_t* b) {
 558   return GetSSE(a, b, 4, 4);
 559 }
 560
 561 //------------------------------------------------------------------------------
 562 // Texture distortion
 563 //
 564 // We try to match the spectral content (weighted) between source and
 565 // reconstructed samples.
 566
 567 // Hadamard transform
 568 // Returns the weighted sum of the absolute value of transformed coefficients.
 569 static int TTransform(const uint8_t* in, const uint16_t* w) {
 570   int sum = 0;
 571   int tmp[16];
 572   int i;
 573   // horizontal pass
 574   for (i = 0; i < 4; ++i, in += BPS) {
 575     const int a0 = in[0] + in[2];
 576     const int a1 = in[1] + in[3];
 577     const int a2 = in[1] - in[3];
 578     const int a3 = in[0] - in[2];
 579     tmp[0 + i * 4] = a0 + a1;
 580     tmp[1 + i * 4] = a3 + a2;
 581     tmp[2 + i * 4] = a3 - a2;
 582     tmp[3 + i * 4] = a0 - a1;
 583   }
 584   // vertical pass
 585   for (i = 0; i < 4; ++i, ++w) {
 586     const int a0 = tmp[0 + i] + tmp[8 + i];
 587     const int a1 = tmp[4 + i] + tmp[12+ i];
 588     const int a2 = tmp[4 + i] - tmp[12+ i];
 589     const int a3 = tmp[0 + i] - tmp[8 + i];
 590     const int b0 = a0 + a1;
 591     const int b1 = a3 + a2;
 592     const int b2 = a3 - a2;
 593     const int b3 = a0 - a1;
 594
 595     sum += w[ 0] * abs(b0);
 596     sum += w[ 4] * abs(b1);
 597     sum += w[ 8] * abs(b2);
 598     sum += w[12] * abs(b3);
 599   }
 600   return sum;
 601 }
 602
 603 static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
 604                     const uint16_t* const w) {
 605   const int sum1 = TTransform(a, w);
 606   const int sum2 = TTransform(b, w);
 607   return abs(sum2 - sum1) >> 5;
 608 }
 609
 610 static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
 611                       const uint16_t* const w) {
 612   int D = 0;
 613   int x, y;
 614   for (y = 0; y < 16 * BPS; y += 4 * BPS) {
 615     for (x = 0; x < 16; x += 4) {
 616       D += Disto4x4(a + x + y, b + x + y, w);
 617     }
 618   }
 619   return D;
 620 }
 621
 622 //------------------------------------------------------------------------------
 623 // Quantization
 624 //
 625
 626 static const uint8_t kZigzag[16] = {
 627   0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
 628 };
 629
 630 // Simple quantization
 631 static int QuantizeBlock(int16_t in[16], int16_t out[16],
 632                          int n, const VP8Matrix* const mtx) {
 633   int last = -1;
 634   for (; n < 16; ++n) {
 635     const int j = kZigzag[n];
 636     const int sign = (in[j] < 0);
 637     const int coeff = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
 638     if (coeff > mtx->zthresh_[j]) {
 639       const int Q = mtx->q_[j];
 640       const int iQ = mtx->iq_[j];
 641       const int B = mtx->bias_[j];
 642       out[n] = QUANTDIV(coeff, iQ, B);
 643       if (out[n] > MAX_LEVEL) out[n] = MAX_LEVEL;
 644       if (sign) out[n] = -out[n];
 645       in[j] = out[n] * Q;
 646       if (out[n]) last = n;
 647     } else {
 648       out[n] = 0;
 649       in[j] = 0;
 650     }
 651   }
 652   return (last >= 0);
 653 }
 654
 655 //------------------------------------------------------------------------------
 656 // Block copy
 657
 658 static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int size) {
 659   int y;
 660   for (y = 0; y < size; ++y) {
 661     memcpy(dst, src, size);
 662     src += BPS;
 663     dst += BPS;
 664   }
 665 }
 666
 667 static void Copy4x4(const uint8_t* src, uint8_t* dst) { Copy(src, dst, 4); }
 668
 669 //------------------------------------------------------------------------------
 670 // Initialization
 671
 672 // Speed-critical function pointers. We have to initialize them to the default
 673 // implementations within VP8EncDspInit().
 674 VP8CHisto VP8CollectHistogram;
 675 VP8Idct VP8ITransform;
 676 VP8Fdct VP8FTransform;
 677 VP8WHT VP8ITransformWHT;
 678 VP8WHT VP8FTransformWHT;
 679 VP8Intra4Preds VP8EncPredLuma4;
 680 VP8IntraPreds VP8EncPredLuma16;
 681 VP8IntraPreds VP8EncPredChroma8;
 682 VP8Metric VP8SSE16x16;
 683 VP8Metric VP8SSE8x8;
 684 VP8Metric VP8SSE16x8;
 685 VP8Metric VP8SSE4x4;
 686 VP8WMetric VP8TDisto4x4;
 687 VP8WMetric VP8TDisto16x16;
 688 VP8QuantizeBlock VP8EncQuantizeBlock;
 689 VP8BlockCopy VP8Copy4x4;
 690
 691 extern void VP8EncDspInitSSE2(void);
 692 extern void VP8EncDspInitNEON(void);
 693
 694 void VP8EncDspInit(void) {
 695   InitTables();
 696
 697   // default C implementations
 698   VP8CollectHistogram = CollectHistogram;
 699   VP8ITransform = ITransform;
 700   VP8FTransform = FTransform;
 701   VP8ITransformWHT = ITransformWHT;
 702   VP8FTransformWHT = FTransformWHT;
 703   VP8EncPredLuma4 = Intra4Preds;
 704   VP8EncPredLuma16 = Intra16Preds;
 705   VP8EncPredChroma8 = IntraChromaPreds;
 706   VP8SSE16x16 = SSE16x16;
 707   VP8SSE8x8 = SSE8x8;
 708   VP8SSE16x8 = SSE16x8;
 709   VP8SSE4x4 = SSE4x4;
 710   VP8TDisto4x4 = Disto4x4;
 711   VP8TDisto16x16 = Disto16x16;
 712   VP8EncQuantizeBlock = QuantizeBlock;
 713   VP8Copy4x4 = Copy4x4;
 714
 715   // If defined, use CPUInfo() to overwrite some pointers with faster versions.
 716   if (VP8GetCPUInfo) {
 717 #if defined(WEBP_USE_SSE2)
 718     if (VP8GetCPUInfo(kSSE2)) {
 719       VP8EncDspInitSSE2();
 720     }
 721 #elif defined(WEBP_USE_NEON)
 722     if (VP8GetCPUInfo(kNEON)) {
 723       VP8EncDspInitNEON();
 724     }
 725 #endif
 726   }
 727 }
 728
 729 #if defined(__cplusplus) || defined(c_plusplus)
 730 }    // extern "C"
 731 #endif