src/third_party/libwebp/dsp/lossless.c

   1 // Copyright 2012 Google Inc. All Rights Reserved.
   2 //
   3 // Use of this source code is governed by a BSD-style license
   4 // that can be found in the COPYING file in the root of the source
   5 // tree. An additional intellectual property rights grant can be found
   6 // in the file PATENTS. All contributing project authors may
   7 // be found in the AUTHORS file in the root of the source tree.
   8 // -----------------------------------------------------------------------------
   9 //
  10 // Image transforms and color space conversion methods for lossless decoder.
  11 //
  12 // Authors: Vikas Arora (vikaas.arora@gmail.com)
  13 //          Jyrki Alakuijala (jyrki@google.com)
  14 //          Urvang Joshi (urvang@google.com)
  15
  16 #include "./dsp.h"
  17
  18 #include <math.h>
  19 #include <stdlib.h>
  20 #include "../dec/vp8li.h"
  21 #include "../utils/endian_inl.h"
  22 #include "./lossless.h"
  23 #include "./yuv.h"
  24
  25 #define MAX_DIFF_COST (1e30f)
  26
  27 // lookup table for small values of log2(int)
  28 const float kLog2Table[LOG_LOOKUP_IDX_MAX] = {
  29   0.0000000000000000f, 0.0000000000000000f,
  30   1.0000000000000000f, 1.5849625007211560f,
  31   2.0000000000000000f, 2.3219280948873621f,
  32   2.5849625007211560f, 2.8073549220576041f,
  33   3.0000000000000000f, 3.1699250014423121f,
  34   3.3219280948873621f, 3.4594316186372973f,
  35   3.5849625007211560f, 3.7004397181410921f,
  36   3.8073549220576041f, 3.9068905956085187f,
  37   4.0000000000000000f, 4.0874628412503390f,
  38   4.1699250014423121f, 4.2479275134435852f,
  39   4.3219280948873626f, 4.3923174227787606f,
  40   4.4594316186372973f, 4.5235619560570130f,
  41   4.5849625007211560f, 4.6438561897747243f,
  42   4.7004397181410917f, 4.7548875021634682f,
  43   4.8073549220576037f, 4.8579809951275718f,
  44   4.9068905956085187f, 4.9541963103868749f,
  45   5.0000000000000000f, 5.0443941193584533f,
  46   5.0874628412503390f, 5.1292830169449663f,
  47   5.1699250014423121f, 5.2094533656289501f,
  48   5.2479275134435852f, 5.2854022188622487f,
  49   5.3219280948873626f, 5.3575520046180837f,
  50   5.3923174227787606f, 5.4262647547020979f,
  51   5.4594316186372973f, 5.4918530963296747f,
  52   5.5235619560570130f, 5.5545888516776376f,
  53   5.5849625007211560f, 5.6147098441152083f,
  54   5.6438561897747243f, 5.6724253419714951f,
  55   5.7004397181410917f, 5.7279204545631987f,
  56   5.7548875021634682f, 5.7813597135246599f,
  57   5.8073549220576037f, 5.8328900141647412f,
  58   5.8579809951275718f, 5.8826430493618415f,
  59   5.9068905956085187f, 5.9307373375628866f,
  60   5.9541963103868749f, 5.9772799234999167f,
  61   6.0000000000000000f, 6.0223678130284543f,
  62   6.0443941193584533f, 6.0660891904577720f,
  63   6.0874628412503390f, 6.1085244567781691f,
  64   6.1292830169449663f, 6.1497471195046822f,
  65   6.1699250014423121f, 6.1898245588800175f,
  66   6.2094533656289501f, 6.2288186904958804f,
  67   6.2479275134435852f, 6.2667865406949010f,
  68   6.2854022188622487f, 6.3037807481771030f,
  69   6.3219280948873626f, 6.3398500028846243f,
  70   6.3575520046180837f, 6.3750394313469245f,
  71   6.3923174227787606f, 6.4093909361377017f,
  72   6.4262647547020979f, 6.4429434958487279f,
  73   6.4594316186372973f, 6.4757334309663976f,
  74   6.4918530963296747f, 6.5077946401986963f,
  75   6.5235619560570130f, 6.5391588111080309f,
  76   6.5545888516776376f, 6.5698556083309478f,
  77   6.5849625007211560f, 6.5999128421871278f,
  78   6.6147098441152083f, 6.6293566200796094f,
  79   6.6438561897747243f, 6.6582114827517946f,
  80   6.6724253419714951f, 6.6865005271832185f,
  81   6.7004397181410917f, 6.7142455176661224f,
  82   6.7279204545631987f, 6.7414669864011464f,
  83   6.7548875021634682f, 6.7681843247769259f,
  84   6.7813597135246599f, 6.7944158663501061f,
  85   6.8073549220576037f, 6.8201789624151878f,
  86   6.8328900141647412f, 6.8454900509443747f,
  87   6.8579809951275718f, 6.8703647195834047f,
  88   6.8826430493618415f, 6.8948177633079437f,
  89   6.9068905956085187f, 6.9188632372745946f,
  90   6.9307373375628866f, 6.9425145053392398f,
  91   6.9541963103868749f, 6.9657842846620869f,
  92   6.9772799234999167f, 6.9886846867721654f,
  93   7.0000000000000000f, 7.0112272554232539f,
  94   7.0223678130284543f, 7.0334230015374501f,
  95   7.0443941193584533f, 7.0552824355011898f,
  96   7.0660891904577720f, 7.0768155970508308f,
  97   7.0874628412503390f, 7.0980320829605263f,
  98   7.1085244567781691f, 7.1189410727235076f,
  99   7.1292830169449663f, 7.1395513523987936f,
 100   7.1497471195046822f, 7.1598713367783890f,
 101   7.1699250014423121f, 7.1799090900149344f,
 102   7.1898245588800175f, 7.1996723448363644f,
 103   7.2094533656289501f, 7.2191685204621611f,
 104   7.2288186904958804f, 7.2384047393250785f,
 105   7.2479275134435852f, 7.2573878426926521f,
 106   7.2667865406949010f, 7.2761244052742375f,
 107   7.2854022188622487f, 7.2946207488916270f,
 108   7.3037807481771030f, 7.3128829552843557f,
 109   7.3219280948873626f, 7.3309168781146167f,
 110   7.3398500028846243f, 7.3487281542310771f,
 111   7.3575520046180837f, 7.3663222142458160f,
 112   7.3750394313469245f, 7.3837042924740519f,
 113   7.3923174227787606f, 7.4008794362821843f,
 114   7.4093909361377017f, 7.4178525148858982f,
 115   7.4262647547020979f, 7.4346282276367245f,
 116   7.4429434958487279f, 7.4512111118323289f,
 117   7.4594316186372973f, 7.4676055500829976f,
 118   7.4757334309663976f, 7.4838157772642563f,
 119   7.4918530963296747f, 7.4998458870832056f,
 120   7.5077946401986963f, 7.5156998382840427f,
 121   7.5235619560570130f, 7.5313814605163118f,
 122   7.5391588111080309f, 7.5468944598876364f,
 123   7.5545888516776376f, 7.5622424242210728f,
 124   7.5698556083309478f, 7.5774288280357486f,
 125   7.5849625007211560f, 7.5924570372680806f,
 126   7.5999128421871278f, 7.6073303137496104f,
 127   7.6147098441152083f, 7.6220518194563764f,
 128   7.6293566200796094f, 7.6366246205436487f,
 129   7.6438561897747243f, 7.6510516911789281f,
 130   7.6582114827517946f, 7.6653359171851764f,
 131   7.6724253419714951f, 7.6794800995054464f,
 132   7.6865005271832185f, 7.6934869574993252f,
 133   7.7004397181410917f, 7.7073591320808825f,
 134   7.7142455176661224f, 7.7210991887071855f,
 135   7.7279204545631987f, 7.7347096202258383f,
 136   7.7414669864011464f, 7.7481928495894605f,
 137   7.7548875021634682f, 7.7615512324444795f,
 138   7.7681843247769259f, 7.7747870596011736f,
 139   7.7813597135246599f, 7.7879025593914317f,
 140   7.7944158663501061f, 7.8008998999203047f,
 141   7.8073549220576037f, 7.8137811912170374f,
 142   7.8201789624151878f, 7.8265484872909150f,
 143   7.8328900141647412f, 7.8392037880969436f,
 144   7.8454900509443747f, 7.8517490414160571f,
 145   7.8579809951275718f, 7.8641861446542797f,
 146   7.8703647195834047f, 7.8765169465649993f,
 147   7.8826430493618415f, 7.8887432488982591f,
 148   7.8948177633079437f, 7.9008668079807486f,
 149   7.9068905956085187f, 7.9128893362299619f,
 150   7.9188632372745946f, 7.9248125036057812f,
 151   7.9307373375628866f, 7.9366379390025709f,
 152   7.9425145053392398f, 7.9483672315846778f,
 153   7.9541963103868749f, 7.9600019320680805f,
 154   7.9657842846620869f, 7.9715435539507719f,
 155   7.9772799234999167f, 7.9829935746943103f,
 156   7.9886846867721654f, 7.9943534368588577f
 157 };
 158
 159 const float kSLog2Table[LOG_LOOKUP_IDX_MAX] = {
 160   0.00000000f,    0.00000000f,  2.00000000f,   4.75488750f,
 161   8.00000000f,   11.60964047f,  15.50977500f,  19.65148445f,
 162   24.00000000f,  28.52932501f,  33.21928095f,  38.05374781f,
 163   43.01955001f,  48.10571634f,  53.30296891f,  58.60335893f,
 164   64.00000000f,  69.48686830f,  75.05865003f,  80.71062276f,
 165   86.43856190f,  92.23866588f,  98.10749561f,  104.04192499f,
 166   110.03910002f, 116.09640474f, 122.21143267f, 128.38196256f,
 167   134.60593782f, 140.88144886f, 147.20671787f, 153.58008562f,
 168   160.00000000f, 166.46500594f, 172.97373660f, 179.52490559f,
 169   186.11730005f, 192.74977453f, 199.42124551f, 206.13068654f,
 170   212.87712380f, 219.65963219f, 226.47733176f, 233.32938445f,
 171   240.21499122f, 247.13338933f, 254.08384998f, 261.06567603f,
 172   268.07820003f, 275.12078236f, 282.19280949f, 289.29369244f,
 173   296.42286534f, 303.57978409f, 310.76392512f, 317.97478424f,
 174   325.21187564f, 332.47473081f, 339.76289772f, 347.07593991f,
 175   354.41343574f, 361.77497759f, 369.16017124f, 376.56863518f,
 176   384.00000000f, 391.45390785f, 398.93001188f, 406.42797576f,
 177   413.94747321f, 421.48818752f, 429.04981119f, 436.63204548f,
 178   444.23460010f, 451.85719280f, 459.49954906f, 467.16140179f,
 179   474.84249102f, 482.54256363f, 490.26137307f, 497.99867911f,
 180   505.75424759f, 513.52785023f, 521.31926438f, 529.12827280f,
 181   536.95466351f, 544.79822957f, 552.65876890f, 560.53608414f,
 182   568.42998244f, 576.34027536f, 584.26677867f, 592.20931226f,
 183   600.16769996f, 608.14176943f, 616.13135206f, 624.13628279f,
 184   632.15640007f, 640.19154569f, 648.24156472f, 656.30630539f,
 185   664.38561898f, 672.47935976f, 680.58738488f, 688.70955430f,
 186   696.84573069f, 704.99577935f, 713.15956818f, 721.33696754f,
 187   729.52785023f, 737.73209140f, 745.94956849f, 754.18016116f,
 188   762.42375127f, 770.68022275f, 778.94946161f, 787.23135586f,
 189   795.52579543f, 803.83267219f, 812.15187982f, 820.48331383f,
 190   828.82687147f, 837.18245171f, 845.54995518f, 853.92928416f,
 191   862.32034249f, 870.72303558f, 879.13727036f, 887.56295522f,
 192   896.00000000f, 904.44831595f, 912.90781569f, 921.37841320f,
 193   929.86002376f, 938.35256392f, 946.85595152f, 955.37010560f,
 194   963.89494641f, 972.43039537f, 980.97637504f, 989.53280911f,
 195   998.09962237f, 1006.67674069f, 1015.26409097f, 1023.86160116f,
 196   1032.46920021f, 1041.08681805f, 1049.71438560f, 1058.35183469f,
 197   1066.99909811f, 1075.65610955f, 1084.32280357f, 1092.99911564f,
 198   1101.68498204f, 1110.38033993f, 1119.08512727f, 1127.79928282f,
 199   1136.52274614f, 1145.25545758f, 1153.99735821f, 1162.74838989f,
 200   1171.50849518f, 1180.27761738f, 1189.05570047f, 1197.84268914f,
 201   1206.63852876f, 1215.44316535f, 1224.25654560f, 1233.07861684f,
 202   1241.90932703f, 1250.74862473f, 1259.59645914f, 1268.45278005f,
 203   1277.31753781f, 1286.19068338f, 1295.07216828f, 1303.96194457f,
 204   1312.85996488f, 1321.76618236f, 1330.68055071f, 1339.60302413f,
 205   1348.53355734f, 1357.47210556f, 1366.41862452f, 1375.37307041f,
 206   1384.33539991f, 1393.30557020f, 1402.28353887f, 1411.26926400f,
 207   1420.26270412f, 1429.26381818f, 1438.27256558f, 1447.28890615f,
 208   1456.31280014f, 1465.34420819f, 1474.38309138f, 1483.42941118f,
 209   1492.48312945f, 1501.54420843f, 1510.61261078f, 1519.68829949f,
 210   1528.77123795f, 1537.86138993f, 1546.95871952f, 1556.06319119f,
 211   1565.17476976f, 1574.29342040f, 1583.41910860f, 1592.55180020f,
 212   1601.69146137f, 1610.83805860f, 1619.99155871f, 1629.15192882f,
 213   1638.31913637f, 1647.49314911f, 1656.67393509f, 1665.86146266f,
 214   1675.05570047f, 1684.25661744f, 1693.46418280f, 1702.67836605f,
 215   1711.89913698f, 1721.12646563f, 1730.36032233f, 1739.60067768f,
 216   1748.84750254f, 1758.10076802f, 1767.36044551f, 1776.62650662f,
 217   1785.89892323f, 1795.17766747f, 1804.46271172f, 1813.75402857f,
 218   1823.05159087f, 1832.35537170f, 1841.66534438f, 1850.98148244f,
 219   1860.30375965f, 1869.63214999f, 1878.96662767f, 1888.30716711f,
 220   1897.65374295f, 1907.00633003f, 1916.36490342f, 1925.72943838f,
 221   1935.09991037f, 1944.47629506f, 1953.85856831f, 1963.24670620f,
 222   1972.64068498f, 1982.04048108f, 1991.44607117f, 2000.85743204f,
 223   2010.27454072f, 2019.69737440f, 2029.12591044f, 2038.56012640f
 224 };
 225
 226 const VP8LPrefixCode kPrefixEncodeCode[PREFIX_LOOKUP_IDX_MAX] = {
 227   { 0, 0}, { 0, 0}, { 1, 0}, { 2, 0}, { 3, 0}, { 4, 1}, { 4, 1}, { 5, 1},
 228   { 5, 1}, { 6, 2}, { 6, 2}, { 6, 2}, { 6, 2}, { 7, 2}, { 7, 2}, { 7, 2},
 229   { 7, 2}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3},
 230   { 8, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3},
 231   { 9, 3}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4},
 232   {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4},
 233   {10, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4},
 234   {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4},
 235   {11, 4}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5},
 236   {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5},
 237   {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5},
 238   {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5},
 239   {12, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5},
 240   {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5},
 241   {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5},
 242   {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5},
 243   {13, 5}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
 244   {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
 245   {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
 246   {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
 247   {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
 248   {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
 249   {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
 250   {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
 251   {14, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
 252   {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
 253   {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
 254   {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
 255   {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
 256   {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
 257   {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
 258   {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
 259   {15, 6}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
 260   {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
 261   {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
 262   {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
 263   {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
 264   {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
 265   {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
 266   {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
 267   {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
 268   {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
 269   {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
 270   {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
 271   {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
 272   {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
 273   {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
 274   {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
 275   {16, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
 276   {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
 277   {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
 278   {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
 279   {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
 280   {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
 281   {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
 282   {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
 283   {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
 284   {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
 285   {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
 286   {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
 287   {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
 288   {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
 289   {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
 290   {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
 291 };
 292
 293 const uint8_t kPrefixEncodeExtraBitsValue[PREFIX_LOOKUP_IDX_MAX] = {
 294    0,  0,  0,  0,  0,  0,  1,  0,  1,  0,  1,  2,  3,  0,  1,  2,  3,
 295    0,  1,  2,  3,  4,  5,  6,  7,  0,  1,  2,  3,  4,  5,  6,  7,
 296    0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
 297    0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
 298    0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
 299   16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
 300    0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
 301   16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
 302    0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
 303   16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
 304   32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
 305   48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
 306    0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
 307   16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
 308   32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
 309   48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
 310    0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
 311   16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
 312   32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
 313   48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
 314   64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
 315   80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
 316   96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
 317   112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126,
 318   127,
 319    0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
 320   16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
 321   32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
 322   48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
 323   64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
 324   80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
 325   96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
 326   112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126
 327 };
 328
 329 // The threshold till approximate version of log_2 can be used.
 330 // Practically, we can get rid of the call to log() as the two values match to
 331 // very high degree (the ratio of these two is 0.99999x).
 332 // Keeping a high threshold for now.
 333 #define APPROX_LOG_WITH_CORRECTION_MAX  65536
 334 #define APPROX_LOG_MAX                   4096
 335 #define LOG_2_RECIPROCAL 1.44269504088896338700465094007086
 336 static float FastSLog2Slow(uint32_t v) {
 337   assert(v >= LOG_LOOKUP_IDX_MAX);
 338   if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
 339     int log_cnt = 0;
 340     uint32_t y = 1;
 341     int correction = 0;
 342     const float v_f = (float)v;
 343     const uint32_t orig_v = v;
 344     do {
 345       ++log_cnt;
 346       v = v >> 1;
 347       y = y << 1;
 348     } while (v >= LOG_LOOKUP_IDX_MAX);
 349     // vf = (2^log_cnt) * Xf; where y = 2^log_cnt and Xf < 256
 350     // Xf = floor(Xf) * (1 + (v % y) / v)
 351     // log2(Xf) = log2(floor(Xf)) + log2(1 + (v % y) / v)
 352     // The correction factor: log(1 + d) ~ d; for very small d values, so
 353     // log2(1 + (v % y) / v) ~ LOG_2_RECIPROCAL * (v % y)/v
 354     // LOG_2_RECIPROCAL ~ 23/16
 355     correction = (23 * (orig_v & (y - 1))) >> 4;
 356     return v_f * (kLog2Table[v] + log_cnt) + correction;
 357   } else {
 358     return (float)(LOG_2_RECIPROCAL * v * log((double)v));
 359   }
 360 }
 361
 362 static float FastLog2Slow(uint32_t v) {
 363   assert(v >= LOG_LOOKUP_IDX_MAX);
 364   if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
 365     int log_cnt = 0;
 366     uint32_t y = 1;
 367     const uint32_t orig_v = v;
 368     double log_2;
 369     do {
 370       ++log_cnt;
 371       v = v >> 1;
 372       y = y << 1;
 373     } while (v >= LOG_LOOKUP_IDX_MAX);
 374     log_2 = kLog2Table[v] + log_cnt;
 375     if (orig_v >= APPROX_LOG_MAX) {
 376       // Since the division is still expensive, add this correction factor only
 377       // for large values of 'v'.
 378       const int correction = (23 * (orig_v & (y - 1))) >> 4;
 379       log_2 += (double)correction / orig_v;
 380     }
 381     return (float)log_2;
 382   } else {
 383     return (float)(LOG_2_RECIPROCAL * log((double)v));
 384   }
 385 }
 386
 387 //------------------------------------------------------------------------------
 388 // Image transforms.
 389
 390 // Mostly used to reduce code size + readability
 391 static WEBP_INLINE int GetMin(int a, int b) { return (a > b) ? b : a; }
 392
 393 // In-place sum of each component with mod 256.
 394 static WEBP_INLINE void AddPixelsEq(uint32_t* a, uint32_t b) {
 395   const uint32_t alpha_and_green = (*a & 0xff00ff00u) + (b & 0xff00ff00u);
 396   const uint32_t red_and_blue = (*a & 0x00ff00ffu) + (b & 0x00ff00ffu);
 397   *a = (alpha_and_green & 0xff00ff00u) | (red_and_blue & 0x00ff00ffu);
 398 }
 399
 400 static WEBP_INLINE uint32_t Average2(uint32_t a0, uint32_t a1) {
 401   return (((a0 ^ a1) & 0xfefefefeL) >> 1) + (a0 & a1);
 402 }
 403
 404 static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) {
 405   return Average2(Average2(a0, a2), a1);
 406 }
 407
 408 static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
 409                                      uint32_t a2, uint32_t a3) {
 410   return Average2(Average2(a0, a1), Average2(a2, a3));
 411 }
 412
 413 static WEBP_INLINE uint32_t Clip255(uint32_t a) {
 414   if (a < 256) {
 415     return a;
 416   }
 417   // return 0, when a is a negative integer.
 418   // return 255, when a is positive.
 419   return ~a >> 24;
 420 }
 421
 422 static WEBP_INLINE int AddSubtractComponentFull(int a, int b, int c) {
 423   return Clip255(a + b - c);
 424 }
 425
 426 static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
 427                                                    uint32_t c2) {
 428   const int a = AddSubtractComponentFull(c0 >> 24, c1 >> 24, c2 >> 24);
 429   const int r = AddSubtractComponentFull((c0 >> 16) & 0xff,
 430                                          (c1 >> 16) & 0xff,
 431                                          (c2 >> 16) & 0xff);
 432   const int g = AddSubtractComponentFull((c0 >> 8) & 0xff,
 433                                          (c1 >> 8) & 0xff,
 434                                          (c2 >> 8) & 0xff);
 435   const int b = AddSubtractComponentFull(c0 & 0xff, c1 & 0xff, c2 & 0xff);
 436   return ((uint32_t)a << 24) | (r << 16) | (g << 8) | b;
 437 }
 438
 439 static WEBP_INLINE int AddSubtractComponentHalf(int a, int b) {
 440   return Clip255(a + (a - b) / 2);
 441 }
 442
 443 static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
 444                                                    uint32_t c2) {
 445   const uint32_t ave = Average2(c0, c1);
 446   const int a = AddSubtractComponentHalf(ave >> 24, c2 >> 24);
 447   const int r = AddSubtractComponentHalf((ave >> 16) & 0xff, (c2 >> 16) & 0xff);
 448   const int g = AddSubtractComponentHalf((ave >> 8) & 0xff, (c2 >> 8) & 0xff);
 449   const int b = AddSubtractComponentHalf((ave >> 0) & 0xff, (c2 >> 0) & 0xff);
 450   return ((uint32_t)a << 24) | (r << 16) | (g << 8) | b;
 451 }
 452
 453 // gcc-4.9 on ARM generates incorrect code in Select() when Sub3() is inlined.
 454 #if defined(__arm__) && LOCAL_GCC_VERSION == 0x409
 455 # define LOCAL_INLINE __attribute__ ((noinline))
 456 #else
 457 # define LOCAL_INLINE WEBP_INLINE
 458 #endif
 459
 460 static LOCAL_INLINE int Sub3(int a, int b, int c) {
 461   const int pb = b - c;
 462   const int pa = a - c;
 463   return abs(pb) - abs(pa);
 464 }
 465
 466 #undef LOCAL_INLINE
 467
 468 static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
 469   const int pa_minus_pb =
 470       Sub3((a >> 24)       , (b >> 24)       , (c >> 24)       ) +
 471       Sub3((a >> 16) & 0xff, (b >> 16) & 0xff, (c >> 16) & 0xff) +
 472       Sub3((a >>  8) & 0xff, (b >>  8) & 0xff, (c >>  8) & 0xff) +
 473       Sub3((a      ) & 0xff, (b      ) & 0xff, (c      ) & 0xff);
 474   return (pa_minus_pb <= 0) ? a : b;
 475 }
 476
 477 //------------------------------------------------------------------------------
 478 // Predictors
 479
 480 static uint32_t Predictor0(uint32_t left, const uint32_t* const top) {
 481   (void)top;
 482   (void)left;
 483   return ARGB_BLACK;
 484 }
 485 static uint32_t Predictor1(uint32_t left, const uint32_t* const top) {
 486   (void)top;
 487   return left;
 488 }
 489 static uint32_t Predictor2(uint32_t left, const uint32_t* const top) {
 490   (void)left;
 491   return top[0];
 492 }
 493 static uint32_t Predictor3(uint32_t left, const uint32_t* const top) {
 494   (void)left;
 495   return top[1];
 496 }
 497 static uint32_t Predictor4(uint32_t left, const uint32_t* const top) {
 498   (void)left;
 499   return top[-1];
 500 }
 501 static uint32_t Predictor5(uint32_t left, const uint32_t* const top) {
 502   const uint32_t pred = Average3(left, top[0], top[1]);
 503   return pred;
 504 }
 505 static uint32_t Predictor6(uint32_t left, const uint32_t* const top) {
 506   const uint32_t pred = Average2(left, top[-1]);
 507   return pred;
 508 }
 509 static uint32_t Predictor7(uint32_t left, const uint32_t* const top) {
 510   const uint32_t pred = Average2(left, top[0]);
 511   return pred;
 512 }
 513 static uint32_t Predictor8(uint32_t left, const uint32_t* const top) {
 514   const uint32_t pred = Average2(top[-1], top[0]);
 515   (void)left;
 516   return pred;
 517 }
 518 static uint32_t Predictor9(uint32_t left, const uint32_t* const top) {
 519   const uint32_t pred = Average2(top[0], top[1]);
 520   (void)left;
 521   return pred;
 522 }
 523 static uint32_t Predictor10(uint32_t left, const uint32_t* const top) {
 524   const uint32_t pred = Average4(left, top[-1], top[0], top[1]);
 525   return pred;
 526 }
 527 static uint32_t Predictor11(uint32_t left, const uint32_t* const top) {
 528   const uint32_t pred = Select(top[0], left, top[-1]);
 529   return pred;
 530 }
 531 static uint32_t Predictor12(uint32_t left, const uint32_t* const top) {
 532   const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]);
 533   return pred;
 534 }
 535 static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
 536   const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]);
 537   return pred;
 538 }
 539
 540 static const VP8LPredictorFunc kPredictorsC[16] = {
 541   Predictor0, Predictor1, Predictor2, Predictor3,
 542   Predictor4, Predictor5, Predictor6, Predictor7,
 543   Predictor8, Predictor9, Predictor10, Predictor11,
 544   Predictor12, Predictor13,
 545   Predictor0, Predictor0    // <- padding security sentinels
 546 };
 547
 548 static float PredictionCostSpatial(const int counts[256], int weight_0,
 549                                    double exp_val) {
 550   const int significant_symbols = 256 >> 4;
 551   const double exp_decay_factor = 0.6;
 552   double bits = weight_0 * counts[0];
 553   int i;
 554   for (i = 1; i < significant_symbols; ++i) {
 555     bits += exp_val * (counts[i] + counts[256 - i]);
 556     exp_val *= exp_decay_factor;
 557   }
 558   return (float)(-0.1 * bits);
 559 }
 560
 561 // Compute the combined Shanon's entropy for distribution {X} and {X+Y}
 562 static float CombinedShannonEntropy(const int X[256], const int Y[256]) {
 563   int i;
 564   double retval = 0.;
 565   int sumX = 0, sumXY = 0;
 566   for (i = 0; i < 256; ++i) {
 567     const int x = X[i];
 568     const int xy = x + Y[i];
 569     if (x != 0) {
 570       sumX += x;
 571       retval -= VP8LFastSLog2(x);
 572       sumXY += xy;
 573       retval -= VP8LFastSLog2(xy);
 574     } else if (xy != 0) {
 575       sumXY += xy;
 576       retval -= VP8LFastSLog2(xy);
 577     }
 578   }
 579   retval += VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY);
 580   return (float)retval;
 581 }
 582
 583 static float PredictionCostSpatialHistogram(const int accumulated[4][256],
 584                                             const int tile[4][256]) {
 585   int i;
 586   double retval = 0;
 587   for (i = 0; i < 4; ++i) {
 588     const double kExpValue = 0.94;
 589     retval += PredictionCostSpatial(tile[i], 1, kExpValue);
 590     retval += CombinedShannonEntropy(tile[i], accumulated[i]);
 591   }
 592   return (float)retval;
 593 }
 594
 595 static WEBP_INLINE void UpdateHisto(int histo_argb[4][256], uint32_t argb) {
 596   ++histo_argb[0][argb >> 24];
 597   ++histo_argb[1][(argb >> 16) & 0xff];
 598   ++histo_argb[2][(argb >> 8) & 0xff];
 599   ++histo_argb[3][argb & 0xff];
 600 }
 601
 602 static int GetBestPredictorForTile(int width, int height,
 603                                    int tile_x, int tile_y, int bits,
 604                                    const int accumulated[4][256],
 605                                    const uint32_t* const argb_scratch) {
 606   const int kNumPredModes = 14;
 607   const int col_start = tile_x << bits;
 608   const int row_start = tile_y << bits;
 609   const int tile_size = 1 << bits;
 610   const int max_y = GetMin(tile_size, height - row_start);
 611   const int max_x = GetMin(tile_size, width - col_start);
 612   float best_diff = MAX_DIFF_COST;
 613   int best_mode = 0;
 614   int mode;
 615   for (mode = 0; mode < kNumPredModes; ++mode) {
 616     const uint32_t* current_row = argb_scratch;
 617     const VP8LPredictorFunc pred_func = VP8LPredictors[mode];
 618     float cur_diff;
 619     int y;
 620     int histo_argb[4][256];
 621     memset(histo_argb, 0, sizeof(histo_argb));
 622     for (y = 0; y < max_y; ++y) {
 623       int x;
 624       const int row = row_start + y;
 625       const uint32_t* const upper_row = current_row;
 626       current_row = upper_row + width;
 627       for (x = 0; x < max_x; ++x) {
 628         const int col = col_start + x;
 629         uint32_t predict;
 630         if (row == 0) {
 631           predict = (col == 0) ? ARGB_BLACK : current_row[col - 1];  // Left.
 632         } else if (col == 0) {
 633           predict = upper_row[col];  // Top.
 634         } else {
 635           predict = pred_func(current_row[col - 1], upper_row + col);
 636         }
 637         UpdateHisto(histo_argb, VP8LSubPixels(current_row[col], predict));
 638       }
 639     }
 640     cur_diff = PredictionCostSpatialHistogram(
 641         accumulated, (const int (*)[256])histo_argb);
 642     if (cur_diff < best_diff) {
 643       best_diff = cur_diff;
 644       best_mode = mode;
 645     }
 646   }
 647
 648   return best_mode;
 649 }
 650
 651 static void CopyTileWithPrediction(int width, int height,
 652                                    int tile_x, int tile_y, int bits, int mode,
 653                                    const uint32_t* const argb_scratch,
 654                                    uint32_t* const argb) {
 655   const int col_start = tile_x << bits;
 656   const int row_start = tile_y << bits;
 657   const int tile_size = 1 << bits;
 658   const int max_y = GetMin(tile_size, height - row_start);
 659   const int max_x = GetMin(tile_size, width - col_start);
 660   const VP8LPredictorFunc pred_func = VP8LPredictors[mode];
 661   const uint32_t* current_row = argb_scratch;
 662
 663   int y;
 664   for (y = 0; y < max_y; ++y) {
 665     int x;
 666     const int row = row_start + y;
 667     const uint32_t* const upper_row = current_row;
 668     current_row = upper_row + width;
 669     for (x = 0; x < max_x; ++x) {
 670       const int col = col_start + x;
 671       const int pix = row * width + col;
 672       uint32_t predict;
 673       if (row == 0) {
 674         predict = (col == 0) ? ARGB_BLACK : current_row[col - 1];  // Left.
 675       } else if (col == 0) {
 676         predict = upper_row[col];  // Top.
 677       } else {
 678         predict = pred_func(current_row[col - 1], upper_row + col);
 679       }
 680       argb[pix] = VP8LSubPixels(current_row[col], predict);
 681     }
 682   }
 683 }
 684
 685 void VP8LResidualImage(int width, int height, int bits,
 686                        uint32_t* const argb, uint32_t* const argb_scratch,
 687                        uint32_t* const image) {
 688   const int max_tile_size = 1 << bits;
 689   const int tiles_per_row = VP8LSubSampleSize(width, bits);
 690   const int tiles_per_col = VP8LSubSampleSize(height, bits);
 691   uint32_t* const upper_row = argb_scratch;
 692   uint32_t* const current_tile_rows = argb_scratch + width;
 693   int tile_y;
 694   int histo[4][256];
 695   memset(histo, 0, sizeof(histo));
 696   for (tile_y = 0; tile_y < tiles_per_col; ++tile_y) {
 697     const int tile_y_offset = tile_y * max_tile_size;
 698     const int this_tile_height =
 699         (tile_y < tiles_per_col - 1) ? max_tile_size : height - tile_y_offset;
 700     int tile_x;
 701     if (tile_y > 0) {
 702       memcpy(upper_row, current_tile_rows + (max_tile_size - 1) * width,
 703              width * sizeof(*upper_row));
 704     }
 705     memcpy(current_tile_rows, &argb[tile_y_offset * width],
 706            this_tile_height * width * sizeof(*current_tile_rows));
 707     for (tile_x = 0; tile_x < tiles_per_row; ++tile_x) {
 708       int pred;
 709       int y;
 710       const int tile_x_offset = tile_x * max_tile_size;
 711       int all_x_max = tile_x_offset + max_tile_size;
 712       if (all_x_max > width) {
 713         all_x_max = width;
 714       }
 715       pred = GetBestPredictorForTile(width, height, tile_x, tile_y, bits,
 716                                      (const int (*)[256])histo,
 717                                      argb_scratch);
 718       image[tile_y * tiles_per_row + tile_x] = 0xff000000u | (pred << 8);
 719       CopyTileWithPrediction(width, height, tile_x, tile_y, bits, pred,
 720                              argb_scratch, argb);
 721       for (y = 0; y < max_tile_size; ++y) {
 722         int ix;
 723         int all_x;
 724         int all_y = tile_y_offset + y;
 725         if (all_y >= height) {
 726           break;
 727         }
 728         ix = all_y * width + tile_x_offset;
 729         for (all_x = tile_x_offset; all_x < all_x_max; ++all_x, ++ix) {
 730           UpdateHisto(histo, argb[ix]);
 731         }
 732       }
 733     }
 734   }
 735 }
 736
 737 // Inverse prediction.
 738 static void PredictorInverseTransform(const VP8LTransform* const transform,
 739                                       int y_start, int y_end, uint32_t* data) {
 740   const int width = transform->xsize_;
 741   if (y_start == 0) {  // First Row follows the L (mode=1) mode.
 742     int x;
 743     const uint32_t pred0 = Predictor0(data[-1], NULL);
 744     AddPixelsEq(data, pred0);
 745     for (x = 1; x < width; ++x) {
 746       const uint32_t pred1 = Predictor1(data[x - 1], NULL);
 747       AddPixelsEq(data + x, pred1);
 748     }
 749     data += width;
 750     ++y_start;
 751   }
 752
 753   {
 754     int y = y_start;
 755     const int tile_width = 1 << transform->bits_;
 756     const int mask = tile_width - 1;
 757     const int safe_width = width & ~mask;
 758     const int tiles_per_row = VP8LSubSampleSize(width, transform->bits_);
 759     const uint32_t* pred_mode_base =
 760         transform->data_ + (y >> transform->bits_) * tiles_per_row;
 761
 762     while (y < y_end) {
 763       const uint32_t pred2 = Predictor2(data[-1], data - width);
 764       const uint32_t* pred_mode_src = pred_mode_base;
 765       VP8LPredictorFunc pred_func;
 766       int x = 1;
 767       int t = 1;
 768       // First pixel follows the T (mode=2) mode.
 769       AddPixelsEq(data, pred2);
 770       // .. the rest:
 771       while (x < safe_width) {
 772         pred_func = VP8LPredictors[((*pred_mode_src++) >> 8) & 0xf];
 773         for (; t < tile_width; ++t, ++x) {
 774           const uint32_t pred = pred_func(data[x - 1], data + x - width);
 775           AddPixelsEq(data + x, pred);
 776         }
 777         t = 0;
 778       }
 779       if (x < width) {
 780         pred_func = VP8LPredictors[((*pred_mode_src++) >> 8) & 0xf];
 781         for (; x < width; ++x) {
 782           const uint32_t pred = pred_func(data[x - 1], data + x - width);
 783           AddPixelsEq(data + x, pred);
 784         }
 785       }
 786       data += width;
 787       ++y;
 788       if ((y & mask) == 0) {   // Use the same mask, since tiles are squares.
 789         pred_mode_base += tiles_per_row;
 790       }
 791     }
 792   }
 793 }
 794
 795 void VP8LSubtractGreenFromBlueAndRed_C(uint32_t* argb_data, int num_pixels) {
 796   int i;
 797   for (i = 0; i < num_pixels; ++i) {
 798     const uint32_t argb = argb_data[i];
 799     const uint32_t green = (argb >> 8) & 0xff;
 800     const uint32_t new_r = (((argb >> 16) & 0xff) - green) & 0xff;
 801     const uint32_t new_b = ((argb & 0xff) - green) & 0xff;
 802     argb_data[i] = (argb & 0xff00ff00) | (new_r << 16) | new_b;
 803   }
 804 }
 805
 806 // Add green to blue and red channels (i.e. perform the inverse transform of
 807 // 'subtract green').
 808 void VP8LAddGreenToBlueAndRed_C(uint32_t* data, int num_pixels) {
 809   int i;
 810   for (i = 0; i < num_pixels; ++i) {
 811     const uint32_t argb = data[i];
 812     const uint32_t green = ((argb >> 8) & 0xff);
 813     uint32_t red_blue = (argb & 0x00ff00ffu);
 814     red_blue += (green << 16) | green;
 815     red_blue &= 0x00ff00ffu;
 816     data[i] = (argb & 0xff00ff00u) | red_blue;
 817   }
 818 }
 819
 820 static WEBP_INLINE void MultipliersClear(VP8LMultipliers* const m) {
 821   m->green_to_red_ = 0;
 822   m->green_to_blue_ = 0;
 823   m->red_to_blue_ = 0;
 824 }
 825
 826 static WEBP_INLINE uint32_t ColorTransformDelta(int8_t color_pred,
 827                                                 int8_t color) {
 828   return (uint32_t)((int)(color_pred) * color) >> 5;
 829 }
 830
 831 static WEBP_INLINE void ColorCodeToMultipliers(uint32_t color_code,
 832                                                VP8LMultipliers* const m) {
 833   m->green_to_red_  = (color_code >>  0) & 0xff;
 834   m->green_to_blue_ = (color_code >>  8) & 0xff;
 835   m->red_to_blue_   = (color_code >> 16) & 0xff;
 836 }
 837
 838 static WEBP_INLINE uint32_t MultipliersToColorCode(
 839     const VP8LMultipliers* const m) {
 840   return 0xff000000u |
 841          ((uint32_t)(m->red_to_blue_) << 16) |
 842          ((uint32_t)(m->green_to_blue_) << 8) |
 843          m->green_to_red_;
 844 }
 845
 846 void VP8LTransformColor_C(const VP8LMultipliers* const m, uint32_t* data,
 847                           int num_pixels) {
 848   int i;
 849   for (i = 0; i < num_pixels; ++i) {
 850     const uint32_t argb = data[i];
 851     const uint32_t green = argb >> 8;
 852     const uint32_t red = argb >> 16;
 853     uint32_t new_red = red;
 854     uint32_t new_blue = argb;
 855     new_red -= ColorTransformDelta(m->green_to_red_, green);
 856     new_red &= 0xff;
 857     new_blue -= ColorTransformDelta(m->green_to_blue_, green);
 858     new_blue -= ColorTransformDelta(m->red_to_blue_, red);
 859     new_blue &= 0xff;
 860     data[i] = (argb & 0xff00ff00u) | (new_red << 16) | (new_blue);
 861   }
 862 }
 863
 864 void VP8LTransformColorInverse_C(const VP8LMultipliers* const m, uint32_t* data,
 865                                  int num_pixels) {
 866   int i;
 867   for (i = 0; i < num_pixels; ++i) {
 868     const uint32_t argb = data[i];
 869     const uint32_t green = argb >> 8;
 870     const uint32_t red = argb >> 16;
 871     uint32_t new_red = red;
 872     uint32_t new_blue = argb;
 873     new_red += ColorTransformDelta(m->green_to_red_, green);
 874     new_red &= 0xff;
 875     new_blue += ColorTransformDelta(m->green_to_blue_, green);
 876     new_blue += ColorTransformDelta(m->red_to_blue_, new_red);
 877     new_blue &= 0xff;
 878     data[i] = (argb & 0xff00ff00u) | (new_red << 16) | (new_blue);
 879   }
 880 }
 881
 882 static WEBP_INLINE uint8_t TransformColorRed(uint8_t green_to_red,
 883                                              uint32_t argb) {
 884   const uint32_t green = argb >> 8;
 885   uint32_t new_red = argb >> 16;
 886   new_red -= ColorTransformDelta(green_to_red, green);
 887   return (new_red & 0xff);
 888 }
 889
 890 static WEBP_INLINE uint8_t TransformColorBlue(uint8_t green_to_blue,
 891                                               uint8_t red_to_blue,
 892                                               uint32_t argb) {
 893   const uint32_t green = argb >> 8;
 894   const uint32_t red = argb >> 16;
 895   uint8_t new_blue = argb;
 896   new_blue -= ColorTransformDelta(green_to_blue, green);
 897   new_blue -= ColorTransformDelta(red_to_blue, red);
 898   return (new_blue & 0xff);
 899 }
 900
 901 static float PredictionCostCrossColor(const int accumulated[256],
 902                                       const int counts[256]) {
 903   // Favor low entropy, locally and globally.
 904   // Favor small absolute values for PredictionCostSpatial
 905   static const double kExpValue = 2.4;
 906   return CombinedShannonEntropy(counts, accumulated) +
 907          PredictionCostSpatial(counts, 3, kExpValue);
 908 }
 909
 910 static float GetPredictionCostCrossColorRed(
 911     int tile_x_offset, int tile_y_offset, int all_x_max, int all_y_max,
 912     int xsize, VP8LMultipliers prev_x, VP8LMultipliers prev_y, int green_to_red,
 913     const int accumulated_red_histo[256], const uint32_t* const argb) {
 914   int all_y;
 915   int histo[256] = { 0 };
 916   float cur_diff;
 917   for (all_y = tile_y_offset; all_y < all_y_max; ++all_y) {
 918     int ix = all_y * xsize + tile_x_offset;
 919     int all_x;
 920     for (all_x = tile_x_offset; all_x < all_x_max; ++all_x, ++ix) {
 921       ++histo[TransformColorRed(green_to_red, argb[ix])];  // red.
 922     }
 923   }
 924   cur_diff = PredictionCostCrossColor(accumulated_red_histo, histo);
 925   if ((uint8_t)green_to_red == prev_x.green_to_red_) {
 926     cur_diff -= 3;  // favor keeping the areas locally similar
 927   }
 928   if ((uint8_t)green_to_red == prev_y.green_to_red_) {
 929     cur_diff -= 3;  // favor keeping the areas locally similar
 930   }
 931   if (green_to_red == 0) {
 932     cur_diff -= 3;
 933   }
 934   return cur_diff;
 935 }
 936
 937 static void GetBestGreenToRed(
 938     int tile_x_offset, int tile_y_offset, int all_x_max, int all_y_max,
 939     int xsize, VP8LMultipliers prev_x, VP8LMultipliers prev_y,
 940     const int accumulated_red_histo[256], const uint32_t* const argb,
 941     VP8LMultipliers* const best_tx) {
 942   int min_green_to_red = -64;
 943   int max_green_to_red = 64;
 944   int green_to_red = 0;
 945   int eval_min = 1;
 946   int eval_max = 1;
 947   float cur_diff_min = MAX_DIFF_COST;
 948   float cur_diff_max = MAX_DIFF_COST;
 949   // Do a binary search to find the optimal green_to_red color transform.
 950   while (max_green_to_red - min_green_to_red > 2) {
 951     if (eval_min) {
 952       cur_diff_min = GetPredictionCostCrossColorRed(
 953           tile_x_offset, tile_y_offset, all_x_max, all_y_max, xsize,
 954           prev_x, prev_y, min_green_to_red, accumulated_red_histo, argb);
 955       eval_min = 0;
 956     }
 957     if (eval_max) {
 958       cur_diff_max = GetPredictionCostCrossColorRed(
 959           tile_x_offset, tile_y_offset, all_x_max, all_y_max, xsize,
 960           prev_x, prev_y, max_green_to_red, accumulated_red_histo, argb);
 961       eval_max = 0;
 962     }
 963     if (cur_diff_min < cur_diff_max) {
 964       green_to_red = min_green_to_red;
 965       max_green_to_red = (max_green_to_red + min_green_to_red) / 2;
 966       eval_max = 1;
 967     } else {
 968       green_to_red = max_green_to_red;
 969       min_green_to_red = (max_green_to_red + min_green_to_red) / 2;
 970       eval_min = 1;
 971     }
 972   }
 973   best_tx->green_to_red_ = green_to_red;
 974 }
 975
 976 static float GetPredictionCostCrossColorBlue(
 977     int tile_x_offset, int tile_y_offset, int all_x_max, int all_y_max,
 978     int xsize, VP8LMultipliers prev_x, VP8LMultipliers prev_y,
 979     int green_to_blue, int red_to_blue, const int accumulated_blue_histo[256],
 980     const uint32_t* const argb) {
 981   int all_y;
 982   int histo[256] = { 0 };
 983   float cur_diff;
 984   for (all_y = tile_y_offset; all_y < all_y_max; ++all_y) {
 985     int all_x;
 986     int ix = all_y * xsize + tile_x_offset;
 987     for (all_x = tile_x_offset; all_x < all_x_max; ++all_x, ++ix) {
 988       ++histo[TransformColorBlue(green_to_blue, red_to_blue, argb[ix])];
 989     }
 990   }
 991   cur_diff = PredictionCostCrossColor(accumulated_blue_histo, histo);
 992   if ((uint8_t)green_to_blue == prev_x.green_to_blue_) {
 993     cur_diff -= 3;  // favor keeping the areas locally similar
 994   }
 995   if ((uint8_t)green_to_blue == prev_y.green_to_blue_) {
 996     cur_diff -= 3;  // favor keeping the areas locally similar
 997   }
 998   if ((uint8_t)red_to_blue == prev_x.red_to_blue_) {
 999     cur_diff -= 3;  // favor keeping the areas locally similar
1000   }
1001   if ((uint8_t)red_to_blue == prev_y.red_to_blue_) {
1002     cur_diff -= 3;  // favor keeping the areas locally similar
1003   }
1004   if (green_to_blue == 0) {
1005     cur_diff -= 3;
1006   }
1007   if (red_to_blue == 0) {
1008     cur_diff -= 3;
1009   }
1010   return cur_diff;
1011 }
1012
1013 static void GetBestGreenRedToBlue(
1014     int tile_x_offset, int tile_y_offset, int all_x_max, int all_y_max,
1015     int xsize, VP8LMultipliers prev_x, VP8LMultipliers prev_y, int quality,
1016     const int accumulated_blue_histo[256], const uint32_t* const argb,
1017     VP8LMultipliers* const best_tx) {
1018   float best_diff = MAX_DIFF_COST;
1019   float cur_diff;
1020   const int step = (quality < 25) ? 32 : (quality > 50) ? 8 : 16;
1021   const int min_green_to_blue = -32;
1022   const int max_green_to_blue = 32;
1023   const int min_red_to_blue = -32;
1024   const int max_red_to_blue = 32;
1025   const int num_iters =
1026       (1 + (max_green_to_blue - min_green_to_blue) / step) *
1027       (1 + (max_red_to_blue - min_red_to_blue) / step);
1028   // Number of tries to get optimal green_to_blue & red_to_blue color transforms
1029   // after finding a local minima.
1030   const int max_tries_after_min = 4 + (num_iters >> 2);
1031   int num_tries_after_min = 0;
1032   int green_to_blue;
1033   for (green_to_blue = min_green_to_blue;
1034        green_to_blue <= max_green_to_blue &&
1035        num_tries_after_min < max_tries_after_min;
1036        green_to_blue += step) {
1037     int red_to_blue;
1038     for (red_to_blue = min_red_to_blue;
1039          red_to_blue <= max_red_to_blue &&
1040          num_tries_after_min < max_tries_after_min;
1041          red_to_blue += step) {
1042       cur_diff = GetPredictionCostCrossColorBlue(
1043           tile_x_offset, tile_y_offset, all_x_max, all_y_max, xsize, prev_x,
1044           prev_y, green_to_blue, red_to_blue, accumulated_blue_histo, argb);
1045       if (cur_diff < best_diff) {
1046         best_diff = cur_diff;
1047         best_tx->green_to_blue_ = green_to_blue;
1048         best_tx->red_to_blue_ = red_to_blue;
1049         num_tries_after_min = 0;
1050       } else {
1051         ++num_tries_after_min;
1052       }
1053     }
1054   }
1055 }
1056
1057 static VP8LMultipliers GetBestColorTransformForTile(
1058     int tile_x, int tile_y, int bits,
1059     VP8LMultipliers prev_x,
1060     VP8LMultipliers prev_y,
1061     int quality, int xsize, int ysize,
1062     const int accumulated_red_histo[256],
1063     const int accumulated_blue_histo[256],
1064     const uint32_t* const argb) {
1065   const int max_tile_size = 1 << bits;
1066   const int tile_y_offset = tile_y * max_tile_size;
1067   const int tile_x_offset = tile_x * max_tile_size;
1068   const int all_x_max = GetMin(tile_x_offset + max_tile_size, xsize);
1069   const int all_y_max = GetMin(tile_y_offset + max_tile_size, ysize);
1070   VP8LMultipliers best_tx;
1071   MultipliersClear(&best_tx);
1072
1073   GetBestGreenToRed(tile_x_offset, tile_y_offset, all_x_max, all_y_max, xsize,
1074                     prev_x, prev_y, accumulated_red_histo, argb, &best_tx);
1075   GetBestGreenRedToBlue(tile_x_offset, tile_y_offset, all_x_max, all_y_max,
1076                         xsize, prev_x, prev_y, quality, accumulated_blue_histo,
1077                         argb, &best_tx);
1078   return best_tx;
1079 }
1080
1081 static void CopyTileWithColorTransform(int xsize, int ysize,
1082                                        int tile_x, int tile_y,
1083                                        int max_tile_size,
1084                                        VP8LMultipliers color_transform,
1085                                        uint32_t* argb) {
1086   const int xscan = GetMin(max_tile_size, xsize - tile_x);
1087   int yscan = GetMin(max_tile_size, ysize - tile_y);
1088   argb += tile_y * xsize + tile_x;
1089   while (yscan-- > 0) {
1090     VP8LTransformColor(&color_transform, argb, xscan);
1091     argb += xsize;
1092   }
1093 }
1094
1095 void VP8LColorSpaceTransform(int width, int height, int bits, int quality,
1096                              uint32_t* const argb, uint32_t* image) {
1097   const int max_tile_size = 1 << bits;
1098   const int tile_xsize = VP8LSubSampleSize(width, bits);
1099   const int tile_ysize = VP8LSubSampleSize(height, bits);
1100   int accumulated_red_histo[256] = { 0 };
1101   int accumulated_blue_histo[256] = { 0 };
1102   int tile_x, tile_y;
1103   VP8LMultipliers prev_x, prev_y;
1104   MultipliersClear(&prev_y);
1105   MultipliersClear(&prev_x);
1106   for (tile_y = 0; tile_y < tile_ysize; ++tile_y) {
1107     for (tile_x = 0; tile_x < tile_xsize; ++tile_x) {
1108       int y;
1109       const int tile_x_offset = tile_x * max_tile_size;
1110       const int tile_y_offset = tile_y * max_tile_size;
1111       const int all_x_max = GetMin(tile_x_offset + max_tile_size, width);
1112       const int all_y_max = GetMin(tile_y_offset + max_tile_size, height);
1113       const int offset = tile_y * tile_xsize + tile_x;
1114       if (tile_y != 0) {
1115         ColorCodeToMultipliers(image[offset - tile_xsize], &prev_y);
1116       }
1117       prev_x = GetBestColorTransformForTile(tile_x, tile_y, bits,
1118                                             prev_x, prev_y,
1119                                             quality, width, height,
1120                                             accumulated_red_histo,
1121                                             accumulated_blue_histo,
1122                                             argb);
1123       image[offset] = MultipliersToColorCode(&prev_x);
1124       CopyTileWithColorTransform(width, height, tile_x_offset, tile_y_offset,
1125                                  max_tile_size, prev_x, argb);
1126
1127       // Gather accumulated histogram data.
1128       for (y = tile_y_offset; y < all_y_max; ++y) {
1129         int ix = y * width + tile_x_offset;
1130         const int ix_end = ix + all_x_max - tile_x_offset;
1131         for (; ix < ix_end; ++ix) {
1132           const uint32_t pix = argb[ix];
1133           if (ix >= 2 &&
1134               pix == argb[ix - 2] &&
1135               pix == argb[ix - 1]) {
1136             continue;  // repeated pixels are handled by backward references
1137           }
1138           if (ix >= width + 2 &&
1139               argb[ix - 2] == argb[ix - width - 2] &&
1140               argb[ix - 1] == argb[ix - width - 1] &&
1141               pix == argb[ix - width]) {
1142             continue;  // repeated pixels are handled by backward references
1143           }
1144           ++accumulated_red_histo[(pix >> 16) & 0xff];
1145           ++accumulated_blue_histo[(pix >> 0) & 0xff];
1146         }
1147       }
1148     }
1149   }
1150 }
1151
1152 // Color space inverse transform.
1153 static void ColorSpaceInverseTransform(const VP8LTransform* const transform,
1154                                        int y_start, int y_end, uint32_t* data) {
1155   const int width = transform->xsize_;
1156   const int tile_width = 1 << transform->bits_;
1157   const int mask = tile_width - 1;
1158   const int safe_width = width & ~mask;
1159   const int remaining_width = width - safe_width;
1160   const int tiles_per_row = VP8LSubSampleSize(width, transform->bits_);
1161   int y = y_start;
1162   const uint32_t* pred_row =
1163       transform->data_ + (y >> transform->bits_) * tiles_per_row;
1164
1165   while (y < y_end) {
1166     const uint32_t* pred = pred_row;
1167     VP8LMultipliers m = { 0, 0, 0 };
1168     const uint32_t* const data_safe_end = data + safe_width;
1169     const uint32_t* const data_end = data + width;
1170     while (data < data_safe_end) {
1171       ColorCodeToMultipliers(*pred++, &m);
1172       VP8LTransformColorInverse(&m, data, tile_width);
1173       data += tile_width;
1174     }
1175     if (data < data_end) {  // Left-overs using C-version.
1176       ColorCodeToMultipliers(*pred++, &m);
1177       VP8LTransformColorInverse(&m, data, remaining_width);
1178       data += remaining_width;
1179     }
1180     ++y;
1181     if ((y & mask) == 0) pred_row += tiles_per_row;
1182   }
1183 }
1184
1185 // Separate out pixels packed together using pixel-bundling.
1186 // We define two methods for ARGB data (uint32_t) and alpha-only data (uint8_t).
1187 #define COLOR_INDEX_INVERSE(FUNC_NAME, TYPE, GET_INDEX, GET_VALUE)             \
1188 void FUNC_NAME(const VP8LTransform* const transform,                           \
1189                int y_start, int y_end, const TYPE* src, TYPE* dst) {           \
1190   int y;                                                                       \
1191   const int bits_per_pixel = 8 >> transform->bits_;                            \
1192   const int width = transform->xsize_;                                         \
1193   const uint32_t* const color_map = transform->data_;                          \
1194   if (bits_per_pixel < 8) {                                                    \
1195     const int pixels_per_byte = 1 << transform->bits_;                         \
1196     const int count_mask = pixels_per_byte - 1;                                \
1197     const uint32_t bit_mask = (1 << bits_per_pixel) - 1;                       \
1198     for (y = y_start; y < y_end; ++y) {                                        \
1199       uint32_t packed_pixels = 0;                                              \
1200       int x;                                                                   \
1201       for (x = 0; x < width; ++x) {                                            \
1202         /* We need to load fresh 'packed_pixels' once every                */  \
1203         /* 'pixels_per_byte' increments of x. Fortunately, pixels_per_byte */  \
1204         /* is a power of 2, so can just use a mask for that, instead of    */  \
1205         /* decrementing a counter.                                         */  \
1206         if ((x & count_mask) == 0) packed_pixels = GET_INDEX(*src++);          \
1207         *dst++ = GET_VALUE(color_map[packed_pixels & bit_mask]);               \
1208         packed_pixels >>= bits_per_pixel;                                      \
1209       }                                                                        \
1210     }                                                                          \
1211   } else {                                                                     \
1212     for (y = y_start; y < y_end; ++y) {                                        \
1213       int x;                                                                   \
1214       for (x = 0; x < width; ++x) {                                            \
1215         *dst++ = GET_VALUE(color_map[GET_INDEX(*src++)]);                      \
1216       }                                                                        \
1217     }                                                                          \
1218   }                                                                            \
1219 }
1220
1221 static WEBP_INLINE uint32_t GetARGBIndex(uint32_t idx) {
1222   return (idx >> 8) & 0xff;
1223 }
1224
1225 static WEBP_INLINE uint8_t GetAlphaIndex(uint8_t idx) {
1226   return idx;
1227 }
1228
1229 static WEBP_INLINE uint32_t GetARGBValue(uint32_t val) {
1230   return val;
1231 }
1232
1233 static WEBP_INLINE uint8_t GetAlphaValue(uint32_t val) {
1234   return (val >> 8) & 0xff;
1235 }
1236
1237 static COLOR_INDEX_INVERSE(ColorIndexInverseTransform, uint32_t, GetARGBIndex,
1238                            GetARGBValue)
1239 COLOR_INDEX_INVERSE(VP8LColorIndexInverseTransformAlpha, uint8_t, GetAlphaIndex,
1240                     GetAlphaValue)
1241
1242 #undef COLOR_INDEX_INVERSE
1243
1244 void VP8LInverseTransform(const VP8LTransform* const transform,
1245                           int row_start, int row_end,
1246                           const uint32_t* const in, uint32_t* const out) {
1247   const int width = transform->xsize_;
1248   assert(row_start < row_end);
1249   assert(row_end <= transform->ysize_);
1250   switch (transform->type_) {
1251     case SUBTRACT_GREEN:
1252       VP8LAddGreenToBlueAndRed(out, (row_end - row_start) * width);
1253       break;
1254     case PREDICTOR_TRANSFORM:
1255       PredictorInverseTransform(transform, row_start, row_end, out);
1256       if (row_end != transform->ysize_) {
1257         // The last predicted row in this iteration will be the top-pred row
1258         // for the first row in next iteration.
1259         memcpy(out - width, out + (row_end - row_start - 1) * width,
1260                width * sizeof(*out));
1261       }
1262       break;
1263     case CROSS_COLOR_TRANSFORM:
1264       ColorSpaceInverseTransform(transform, row_start, row_end, out);
1265       break;
1266     case COLOR_INDEXING_TRANSFORM:
1267       if (in == out && transform->bits_ > 0) {
1268         // Move packed pixels to the end of unpacked region, so that unpacking
1269         // can occur seamlessly.
1270         // Also, note that this is the only transform that applies on
1271         // the effective width of VP8LSubSampleSize(xsize_, bits_). All other
1272         // transforms work on effective width of xsize_.
1273         const int out_stride = (row_end - row_start) * width;
1274         const int in_stride = (row_end - row_start) *
1275             VP8LSubSampleSize(transform->xsize_, transform->bits_);
1276         uint32_t* const src = out + out_stride - in_stride;
1277         memmove(src, out, in_stride * sizeof(*src));
1278         ColorIndexInverseTransform(transform, row_start, row_end, src, out);
1279       } else {
1280         ColorIndexInverseTransform(transform, row_start, row_end, in, out);
1281       }
1282       break;
1283   }
1284 }
1285
1286 //------------------------------------------------------------------------------
1287 // Color space conversion.
1288
1289 static int is_big_endian(void) {
1290   static const union {
1291     uint16_t w;
1292     uint8_t b[2];
1293   } tmp = { 1 };
1294   return (tmp.b[0] != 1);
1295 }
1296
1297 void VP8LConvertBGRAToRGB_C(const uint32_t* src,
1298                             int num_pixels, uint8_t* dst) {
1299   const uint32_t* const src_end = src + num_pixels;
1300   while (src < src_end) {
1301     const uint32_t argb = *src++;
1302     *dst++ = (argb >> 16) & 0xff;
1303     *dst++ = (argb >>  8) & 0xff;
1304     *dst++ = (argb >>  0) & 0xff;
1305   }
1306 }
1307
1308 void VP8LConvertBGRAToRGBA_C(const uint32_t* src,
1309                              int num_pixels, uint8_t* dst) {
1310   const uint32_t* const src_end = src + num_pixels;
1311   while (src < src_end) {
1312     const uint32_t argb = *src++;
1313     *dst++ = (argb >> 16) & 0xff;
1314     *dst++ = (argb >>  8) & 0xff;
1315     *dst++ = (argb >>  0) & 0xff;
1316     *dst++ = (argb >> 24) & 0xff;
1317   }
1318 }
1319
1320 void VP8LConvertBGRAToRGBA4444_C(const uint32_t* src,
1321                                  int num_pixels, uint8_t* dst) {
1322   const uint32_t* const src_end = src + num_pixels;
1323   while (src < src_end) {
1324     const uint32_t argb = *src++;
1325     const uint8_t rg = ((argb >> 16) & 0xf0) | ((argb >> 12) & 0xf);
1326     const uint8_t ba = ((argb >>  0) & 0xf0) | ((argb >> 28) & 0xf);
1327 #ifdef WEBP_SWAP_16BIT_CSP
1328     *dst++ = ba;
1329     *dst++ = rg;
1330 #else
1331     *dst++ = rg;
1332     *dst++ = ba;
1333 #endif
1334   }
1335 }
1336
1337 void VP8LConvertBGRAToRGB565_C(const uint32_t* src,
1338                                int num_pixels, uint8_t* dst) {
1339   const uint32_t* const src_end = src + num_pixels;
1340   while (src < src_end) {
1341     const uint32_t argb = *src++;
1342     const uint8_t rg = ((argb >> 16) & 0xf8) | ((argb >> 13) & 0x7);
1343     const uint8_t gb = ((argb >>  5) & 0xe0) | ((argb >>  3) & 0x1f);
1344 #ifdef WEBP_SWAP_16BIT_CSP
1345     *dst++ = gb;
1346     *dst++ = rg;
1347 #else
1348     *dst++ = rg;
1349     *dst++ = gb;
1350 #endif
1351   }
1352 }
1353
1354 void VP8LConvertBGRAToBGR_C(const uint32_t* src,
1355                             int num_pixels, uint8_t* dst) {
1356   const uint32_t* const src_end = src + num_pixels;
1357   while (src < src_end) {
1358     const uint32_t argb = *src++;
1359     *dst++ = (argb >>  0) & 0xff;
1360     *dst++ = (argb >>  8) & 0xff;
1361     *dst++ = (argb >> 16) & 0xff;
1362   }
1363 }
1364
1365 static void CopyOrSwap(const uint32_t* src, int num_pixels, uint8_t* dst,
1366                        int swap_on_big_endian) {
1367   if (is_big_endian() == swap_on_big_endian) {
1368     const uint32_t* const src_end = src + num_pixels;
1369     while (src < src_end) {
1370       const uint32_t argb = *src++;
1371
1372 #if !defined(WORDS_BIGENDIAN)
1373 #if !defined(WEBP_REFERENCE_IMPLEMENTATION)
1374       *(uint32_t*)dst = BSwap32(argb);
1375 #else  // WEBP_REFERENCE_IMPLEMENTATION
1376       dst[0] = (argb >> 24) & 0xff;
1377       dst[1] = (argb >> 16) & 0xff;
1378       dst[2] = (argb >>  8) & 0xff;
1379       dst[3] = (argb >>  0) & 0xff;
1380 #endif
1381 #else  // WORDS_BIGENDIAN
1382       dst[0] = (argb >>  0) & 0xff;
1383       dst[1] = (argb >>  8) & 0xff;
1384       dst[2] = (argb >> 16) & 0xff;
1385       dst[3] = (argb >> 24) & 0xff;
1386 #endif
1387       dst += sizeof(argb);
1388     }
1389   } else {
1390     memcpy(dst, src, num_pixels * sizeof(*src));
1391   }
1392 }
1393
1394 void VP8LConvertFromBGRA(const uint32_t* const in_data, int num_pixels,
1395                          WEBP_CSP_MODE out_colorspace, uint8_t* const rgba) {
1396   switch (out_colorspace) {
1397     case MODE_RGB:
1398       VP8LConvertBGRAToRGB(in_data, num_pixels, rgba);
1399       break;
1400     case MODE_RGBA:
1401       VP8LConvertBGRAToRGBA(in_data, num_pixels, rgba);
1402       break;
1403     case MODE_rgbA:
1404       VP8LConvertBGRAToRGBA(in_data, num_pixels, rgba);
1405       WebPApplyAlphaMultiply(rgba, 0, num_pixels, 1, 0);
1406       break;
1407     case MODE_BGR:
1408       VP8LConvertBGRAToBGR(in_data, num_pixels, rgba);
1409       break;
1410     case MODE_BGRA:
1411       CopyOrSwap(in_data, num_pixels, rgba, 1);
1412       break;
1413     case MODE_bgrA:
1414       CopyOrSwap(in_data, num_pixels, rgba, 1);
1415       WebPApplyAlphaMultiply(rgba, 0, num_pixels, 1, 0);
1416       break;
1417     case MODE_ARGB:
1418       CopyOrSwap(in_data, num_pixels, rgba, 0);
1419       break;
1420     case MODE_Argb:
1421       CopyOrSwap(in_data, num_pixels, rgba, 0);
1422       WebPApplyAlphaMultiply(rgba, 1, num_pixels, 1, 0);
1423       break;
1424     case MODE_RGBA_4444:
1425       VP8LConvertBGRAToRGBA4444(in_data, num_pixels, rgba);
1426       break;
1427     case MODE_rgbA_4444:
1428       VP8LConvertBGRAToRGBA4444(in_data, num_pixels, rgba);
1429       WebPApplyAlphaMultiply4444(rgba, num_pixels, 1, 0);
1430       break;
1431     case MODE_RGB_565:
1432       VP8LConvertBGRAToRGB565(in_data, num_pixels, rgba);
1433       break;
1434     default:
1435       assert(0);          // Code flow should not reach here.
1436   }
1437 }
1438
1439 //------------------------------------------------------------------------------
1440 // Bundles multiple (1, 2, 4 or 8) pixels into a single pixel.
1441 void VP8LBundleColorMap(const uint8_t* const row, int width,
1442                         int xbits, uint32_t* const dst) {
1443   int x;
1444   if (xbits > 0) {
1445     const int bit_depth = 1 << (3 - xbits);
1446     const int mask = (1 << xbits) - 1;
1447     uint32_t code = 0xff000000;
1448     for (x = 0; x < width; ++x) {
1449       const int xsub = x & mask;
1450       if (xsub == 0) {
1451         code = 0xff000000;
1452       }
1453       code |= row[x] << (8 + bit_depth * xsub);
1454       dst[x >> xbits] = code;
1455     }
1456   } else {
1457     for (x = 0; x < width; ++x) dst[x] = 0xff000000 | (row[x] << 8);
1458   }
1459 }
1460
1461 //------------------------------------------------------------------------------
1462
1463 static double ExtraCost(const uint32_t* population, int length) {
1464   int i;
1465   double cost = 0.;
1466   for (i = 2; i < length - 2; ++i) cost += (i >> 1) * population[i + 2];
1467   return cost;
1468 }
1469
1470 static double ExtraCostCombined(const uint32_t* X, const uint32_t* Y,
1471                                 int length) {
1472   int i;
1473   double cost = 0.;
1474   for (i = 2; i < length - 2; ++i) {
1475     const int xy = X[i + 2] + Y[i + 2];
1476     cost += (i >> 1) * xy;
1477   }
1478   return cost;
1479 }
1480
1481 // Returns the various RLE counts
1482 static VP8LStreaks HuffmanCostCount(const uint32_t* population, int length) {
1483   int i;
1484   int streak = 0;
1485   VP8LStreaks stats;
1486   memset(&stats, 0, sizeof(stats));
1487   for (i = 0; i < length - 1; ++i) {
1488     ++streak;
1489     if (population[i] == population[i + 1]) {
1490       continue;
1491     }
1492     stats.counts[population[i] != 0] += (streak > 3);
1493     stats.streaks[population[i] != 0][(streak > 3)] += streak;
1494     streak = 0;
1495   }
1496   ++streak;
1497   stats.counts[population[i] != 0] += (streak > 3);
1498   stats.streaks[population[i] != 0][(streak > 3)] += streak;
1499   return stats;
1500 }
1501
1502 static VP8LStreaks HuffmanCostCombinedCount(const uint32_t* X,
1503                                             const uint32_t* Y, int length) {
1504   int i;
1505   int streak = 0;
1506   VP8LStreaks stats;
1507   memset(&stats, 0, sizeof(stats));
1508   for (i = 0; i < length - 1; ++i) {
1509     const int xy = X[i] + Y[i];
1510     const int xy_next = X[i + 1] + Y[i + 1];
1511     ++streak;
1512     if (xy == xy_next) {
1513       continue;
1514     }
1515     stats.counts[xy != 0] += (streak > 3);
1516     stats.streaks[xy != 0][(streak > 3)] += streak;
1517     streak = 0;
1518   }
1519   {
1520     const int xy = X[i] + Y[i];
1521     ++streak;
1522     stats.counts[xy != 0] += (streak > 3);
1523     stats.streaks[xy != 0][(streak > 3)] += streak;
1524   }
1525   return stats;
1526 }
1527
1528 //------------------------------------------------------------------------------
1529
1530 static void HistogramAdd(const VP8LHistogram* const a,
1531                          const VP8LHistogram* const b,
1532                          VP8LHistogram* const out) {
1533   int i;
1534   const int literal_size = VP8LHistogramNumCodes(a->palette_code_bits_);
1535   assert(a->palette_code_bits_ == b->palette_code_bits_);
1536   if (b != out) {
1537     for (i = 0; i < literal_size; ++i) {
1538       out->literal_[i] = a->literal_[i] + b->literal_[i];
1539     }
1540     for (i = 0; i < NUM_DISTANCE_CODES; ++i) {
1541       out->distance_[i] = a->distance_[i] + b->distance_[i];
1542     }
1543     for (i = 0; i < NUM_LITERAL_CODES; ++i) {
1544       out->red_[i] = a->red_[i] + b->red_[i];
1545       out->blue_[i] = a->blue_[i] + b->blue_[i];
1546       out->alpha_[i] = a->alpha_[i] + b->alpha_[i];
1547     }
1548   } else {
1549     for (i = 0; i < literal_size; ++i) {
1550       out->literal_[i] += a->literal_[i];
1551     }
1552     for (i = 0; i < NUM_DISTANCE_CODES; ++i) {
1553       out->distance_[i] += a->distance_[i];
1554     }
1555     for (i = 0; i < NUM_LITERAL_CODES; ++i) {
1556       out->red_[i] += a->red_[i];
1557       out->blue_[i] += a->blue_[i];
1558       out->alpha_[i] += a->alpha_[i];
1559     }
1560   }
1561 }
1562
1563 //------------------------------------------------------------------------------
1564
1565 VP8LProcessBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
1566 VP8LProcessBlueAndRedFunc VP8LAddGreenToBlueAndRed;
1567 VP8LPredictorFunc VP8LPredictors[16];
1568
1569 VP8LTransformColorFunc VP8LTransformColor;
1570 VP8LTransformColorFunc VP8LTransformColorInverse;
1571
1572 VP8LConvertFunc VP8LConvertBGRAToRGB;
1573 VP8LConvertFunc VP8LConvertBGRAToRGBA;
1574 VP8LConvertFunc VP8LConvertBGRAToRGBA4444;
1575 VP8LConvertFunc VP8LConvertBGRAToRGB565;
1576 VP8LConvertFunc VP8LConvertBGRAToBGR;
1577
1578 VP8LFastLog2SlowFunc VP8LFastLog2Slow;
1579 VP8LFastLog2SlowFunc VP8LFastSLog2Slow;
1580
1581 VP8LCostFunc VP8LExtraCost;
1582 VP8LCostCombinedFunc VP8LExtraCostCombined;
1583
1584 VP8LCostCountFunc VP8LHuffmanCostCount;
1585 VP8LCostCombinedCountFunc VP8LHuffmanCostCombinedCount;
1586
1587 VP8LHistogramAddFunc VP8LHistogramAdd;
1588
1589 extern void VP8LDspInitSSE2(void);
1590 extern void VP8LDspInitNEON(void);
1591 extern void VP8LDspInitMIPS32(void);
1592
1593 void VP8LDspInit(void) {
1594   memcpy(VP8LPredictors, kPredictorsC, sizeof(VP8LPredictors));
1595
1596   VP8LSubtractGreenFromBlueAndRed = VP8LSubtractGreenFromBlueAndRed_C;
1597   VP8LAddGreenToBlueAndRed = VP8LAddGreenToBlueAndRed_C;
1598
1599   VP8LTransformColor = VP8LTransformColor_C;
1600   VP8LTransformColorInverse = VP8LTransformColorInverse_C;
1601
1602   VP8LConvertBGRAToRGB = VP8LConvertBGRAToRGB_C;
1603   VP8LConvertBGRAToRGBA = VP8LConvertBGRAToRGBA_C;
1604   VP8LConvertBGRAToRGBA4444 = VP8LConvertBGRAToRGBA4444_C;
1605   VP8LConvertBGRAToRGB565 = VP8LConvertBGRAToRGB565_C;
1606   VP8LConvertBGRAToBGR = VP8LConvertBGRAToBGR_C;
1607
1608   VP8LFastLog2Slow = FastLog2Slow;
1609   VP8LFastSLog2Slow = FastSLog2Slow;
1610
1611   VP8LExtraCost = ExtraCost;
1612   VP8LExtraCostCombined = ExtraCostCombined;
1613
1614   VP8LHuffmanCostCount = HuffmanCostCount;
1615   VP8LHuffmanCostCombinedCount = HuffmanCostCombinedCount;
1616
1617   VP8LHistogramAdd = HistogramAdd;
1618
1619   // If defined, use CPUInfo() to overwrite some pointers with faster versions.
1620   if (VP8GetCPUInfo != NULL) {
1621 #if defined(WEBP_USE_SSE2)
1622     if (VP8GetCPUInfo(kSSE2)) {
1623       VP8LDspInitSSE2();
1624     }
1625 #endif
1626 #if defined(WEBP_USE_NEON)
1627     if (VP8GetCPUInfo(kNEON)) {
1628       VP8LDspInitNEON();
1629     }
1630 #endif
1631 #if defined(WEBP_USE_MIPS32)
1632     if (VP8GetCPUInfo(kMIPS32)) {
1633       VP8LDspInitMIPS32();
1634     }
1635 #endif
1636   }
1637 }
1638
1639 //------------------------------------------------------------------------------