src/third_party/libyuv/source/row_common.cc

   1 /*
   2  *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
   3  *
   4  *  Use of this source code is governed by a BSD-style license
   5  *  that can be found in the LICENSE file in the root of the source
   6  *  tree. An additional intellectual property rights grant can be found
   7  *  in the file PATENTS. All contributing project authors may
   8  *  be found in the AUTHORS file in the root of the source tree.
   9  */
  10
  11 #include "libyuv/row.h"
  12
  13 #include <string.h>  // For memcpy and memset.
  14
  15 #include "libyuv/basic_types.h"
  16
  17 #ifdef __cplusplus
  18 namespace libyuv {
  19 extern "C" {
  20 #endif
  21
  22 // llvm x86 is poor at ternary operator, so use branchless min/max.
  23
  24 #define USE_BRANCHLESS 1
  25 #if USE_BRANCHLESS
  26 static __inline int32 clamp0(int32 v) {
  27   return ((-(v) >> 31) & (v));
  28 }
  29
  30 static __inline int32 clamp255(int32 v) {
  31   return (((255 - (v)) >> 31) | (v)) & 255;
  32 }
  33
  34 static __inline uint32 Clamp(int32 val) {
  35   int v = clamp0(val);
  36   return static_cast<uint32>(clamp255(v));
  37 }
  38
  39 static __inline uint32 Abs(int32 v) {
  40   int m = v >> 31;
  41   return (v + m) ^ m;
  42 }
  43 #else  // USE_BRANCHLESS
  44 static __inline int32 clamp0(int32 v) {
  45   return (v < 0) ? 0 : v;
  46 }
  47
  48 static __inline int32 clamp255(int32 v) {
  49   return (v > 255) ? 255 : v;
  50 }
  51
  52 static __inline uint32 Clamp(int32 val) {
  53   int v = clamp0(val);
  54   return static_cast<uint32>(clamp255(v));
  55 }
  56
  57 static __inline uint32 Abs(int32 v) {
  58   return (v < 0) ? -v : v;
  59 }
  60 #endif  // USE_BRANCHLESS
  61
  62 #ifdef LIBYUV_LITTLE_ENDIAN
  63 #define WRITEWORD(p, v) *reinterpret_cast<uint32*>(p) = v
  64 #else
  65 static inline void WRITEWORD(uint8* p, uint32 v) {
  66   p[0] = (uint8)(v & 255);
  67   p[1] = (uint8)((v >> 8) & 255);
  68   p[2] = (uint8)((v >> 16) & 255);
  69   p[3] = (uint8)((v >> 24) & 255);
  70 }
  71 #endif
  72
  73 void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width) {
  74   for (int x = 0; x < width; ++x) {
  75     uint8 b = src_rgb24[0];
  76     uint8 g = src_rgb24[1];
  77     uint8 r = src_rgb24[2];
  78     dst_argb[0] = b;
  79     dst_argb[1] = g;
  80     dst_argb[2] = r;
  81     dst_argb[3] = 255u;
  82     dst_argb += 4;
  83     src_rgb24 += 3;
  84   }
  85 }
  86
  87 void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width) {
  88   for (int x = 0; x < width; ++x) {
  89     uint8 r = src_raw[0];
  90     uint8 g = src_raw[1];
  91     uint8 b = src_raw[2];
  92     dst_argb[0] = b;
  93     dst_argb[1] = g;
  94     dst_argb[2] = r;
  95     dst_argb[3] = 255u;
  96     dst_argb += 4;
  97     src_raw += 3;
  98   }
  99 }
 100
 101 void RGB565ToARGBRow_C(const uint8* src_rgb565, uint8* dst_argb, int width) {
 102   for (int x = 0; x < width; ++x) {
 103     uint8 b = src_rgb565[0] & 0x1f;
 104     uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
 105     uint8 r = src_rgb565[1] >> 3;
 106     dst_argb[0] = (b << 3) | (b >> 2);
 107     dst_argb[1] = (g << 2) | (g >> 4);
 108     dst_argb[2] = (r << 3) | (r >> 2);
 109     dst_argb[3] = 255u;
 110     dst_argb += 4;
 111     src_rgb565 += 2;
 112   }
 113 }
 114
 115 void ARGB1555ToARGBRow_C(const uint8* src_argb1555, uint8* dst_argb,
 116                          int width) {
 117   for (int x = 0; x < width; ++x) {
 118     uint8 b = src_argb1555[0] & 0x1f;
 119     uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
 120     uint8 r = (src_argb1555[1] & 0x7c) >> 2;
 121     uint8 a = src_argb1555[1] >> 7;
 122     dst_argb[0] = (b << 3) | (b >> 2);
 123     dst_argb[1] = (g << 3) | (g >> 2);
 124     dst_argb[2] = (r << 3) | (r >> 2);
 125     dst_argb[3] = -a;
 126     dst_argb += 4;
 127     src_argb1555 += 2;
 128   }
 129 }
 130
 131 void ARGB4444ToARGBRow_C(const uint8* src_argb4444, uint8* dst_argb,
 132                          int width) {
 133   for (int x = 0; x < width; ++x) {
 134     uint8 b = src_argb4444[0] & 0x0f;
 135     uint8 g = src_argb4444[0] >> 4;
 136     uint8 r = src_argb4444[1] & 0x0f;
 137     uint8 a = src_argb4444[1] >> 4;
 138     dst_argb[0] = (b << 4) | b;
 139     dst_argb[1] = (g << 4) | g;
 140     dst_argb[2] = (r << 4) | r;
 141     dst_argb[3] = (a << 4) | a;
 142     dst_argb += 4;
 143     src_argb4444 += 2;
 144   }
 145 }
 146
 147 void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
 148   for (int x = 0; x < width; ++x) {
 149     uint8 b = src_argb[0];
 150     uint8 g = src_argb[1];
 151     uint8 r = src_argb[2];
 152     dst_rgb[0] = b;
 153     dst_rgb[1] = g;
 154     dst_rgb[2] = r;
 155     dst_rgb += 3;
 156     src_argb += 4;
 157   }
 158 }
 159
 160 void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width) {
 161   for (int x = 0; x < width; ++x) {
 162     uint8 b = src_argb[0];
 163     uint8 g = src_argb[1];
 164     uint8 r = src_argb[2];
 165     dst_rgb[0] = r;
 166     dst_rgb[1] = g;
 167     dst_rgb[2] = b;
 168     dst_rgb += 3;
 169     src_argb += 4;
 170   }
 171 }
 172
 173 void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
 174   for (int x = 0; x < width - 1; x += 2) {
 175     uint8 b0 = src_argb[0] >> 3;
 176     uint8 g0 = src_argb[1] >> 2;
 177     uint8 r0 = src_argb[2] >> 3;
 178     uint8 b1 = src_argb[4] >> 3;
 179     uint8 g1 = src_argb[5] >> 2;
 180     uint8 r1 = src_argb[6] >> 3;
 181     WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) |
 182               (b1 << 16) | (g1 << 21) | (r1 << 27));
 183     dst_rgb += 4;
 184     src_argb += 8;
 185   }
 186   if (width & 1) {
 187     uint8 b0 = src_argb[0] >> 3;
 188     uint8 g0 = src_argb[1] >> 2;
 189     uint8 r0 = src_argb[2] >> 3;
 190     *reinterpret_cast<uint16*>(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
 191   }
 192 }
 193
 194 void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
 195   for (int x = 0; x < width - 1; x += 2) {
 196     uint8 b0 = src_argb[0] >> 3;
 197     uint8 g0 = src_argb[1] >> 3;
 198     uint8 r0 = src_argb[2] >> 3;
 199     uint8 a0 = src_argb[3] >> 7;
 200     uint8 b1 = src_argb[4] >> 3;
 201     uint8 g1 = src_argb[5] >> 3;
 202     uint8 r1 = src_argb[6] >> 3;
 203     uint8 a1 = src_argb[7] >> 7;
 204     *reinterpret_cast<uint32*>(dst_rgb) =
 205         b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) |
 206         (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31);
 207     dst_rgb += 4;
 208     src_argb += 8;
 209   }
 210   if (width & 1) {
 211     uint8 b0 = src_argb[0] >> 3;
 212     uint8 g0 = src_argb[1] >> 3;
 213     uint8 r0 = src_argb[2] >> 3;
 214     uint8 a0 = src_argb[3] >> 7;
 215     *reinterpret_cast<uint16*>(dst_rgb) =
 216         b0 | (g0 << 5) | (r0 << 10) | (a0 << 15);
 217   }
 218 }
 219
 220 void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
 221   for (int x = 0; x < width - 1; x += 2) {
 222     uint8 b0 = src_argb[0] >> 4;
 223     uint8 g0 = src_argb[1] >> 4;
 224     uint8 r0 = src_argb[2] >> 4;
 225     uint8 a0 = src_argb[3] >> 4;
 226     uint8 b1 = src_argb[4] >> 4;
 227     uint8 g1 = src_argb[5] >> 4;
 228     uint8 r1 = src_argb[6] >> 4;
 229     uint8 a1 = src_argb[7] >> 4;
 230     *reinterpret_cast<uint32*>(dst_rgb) =
 231         b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) |
 232         (b1 << 16) | (g1 << 20) | (r1 << 24) | (a1 << 28);
 233     dst_rgb += 4;
 234     src_argb += 8;
 235   }
 236   if (width & 1) {
 237     uint8 b0 = src_argb[0] >> 4;
 238     uint8 g0 = src_argb[1] >> 4;
 239     uint8 r0 = src_argb[2] >> 4;
 240     uint8 a0 = src_argb[3] >> 4;
 241     *reinterpret_cast<uint16*>(dst_rgb) =
 242         b0 | (g0 << 4) | (r0 << 8) | (a0 << 12);
 243   }
 244 }
 245
 246 static __inline int RGBToY(uint8 r, uint8 g, uint8 b) {
 247   return (66 * r + 129 * g +  25 * b + 0x1080) >> 8;
 248 }
 249
 250 static __inline int RGBToU(uint8 r, uint8 g, uint8 b) {
 251   return (112 * b - 74 * g - 38 * r + 0x8080) >> 8;
 252 }
 253 static __inline int RGBToV(uint8 r, uint8 g, uint8 b) {
 254   return (112 * r - 94 * g - 18 * b + 0x8080) >> 8;
 255 }
 256
 257 #define MAKEROWY(NAME, R, G, B, BPP) \
 258 void NAME ## ToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) {       \
 259   for (int x = 0; x < width; ++x) {                                            \
 260     dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]);               \
 261     src_argb0 += BPP;                                                          \
 262     dst_y += 1;                                                                \
 263   }                                                                            \
 264 }                                                                              \
 265 void NAME ## ToUVRow_C(const uint8* src_rgb0, int src_stride_rgb,              \
 266                        uint8* dst_u, uint8* dst_v, int width) {                \
 267   const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;                           \
 268   for (int x = 0; x < width - 1; x += 2) {                                     \
 269     uint8 ab = (src_rgb0[B] + src_rgb0[B + BPP] +                              \
 270                src_rgb1[B] + src_rgb1[B + BPP]) >> 2;                          \
 271     uint8 ag = (src_rgb0[G] + src_rgb0[G + BPP] +                              \
 272                src_rgb1[G] + src_rgb1[G + BPP]) >> 2;                          \
 273     uint8 ar = (src_rgb0[R] + src_rgb0[R + BPP] +                              \
 274                src_rgb1[R] + src_rgb1[R + BPP]) >> 2;                          \
 275     dst_u[0] = RGBToU(ar, ag, ab);                                             \
 276     dst_v[0] = RGBToV(ar, ag, ab);                                             \
 277     src_rgb0 += BPP * 2;                                                       \
 278     src_rgb1 += BPP * 2;                                                       \
 279     dst_u += 1;                                                                \
 280     dst_v += 1;                                                                \
 281   }                                                                            \
 282   if (width & 1) {                                                             \
 283     uint8 ab = (src_rgb0[B] + src_rgb1[B]) >> 1;                               \
 284     uint8 ag = (src_rgb0[G] + src_rgb1[G]) >> 1;                               \
 285     uint8 ar = (src_rgb0[R] + src_rgb1[R]) >> 1;                               \
 286     dst_u[0] = RGBToU(ar, ag, ab);                                             \
 287     dst_v[0] = RGBToV(ar, ag, ab);                                             \
 288   }                                                                            \
 289 }
 290
 291 MAKEROWY(ARGB, 2, 1, 0, 4)
 292 MAKEROWY(BGRA, 1, 2, 3, 4)
 293 MAKEROWY(ABGR, 0, 1, 2, 4)
 294 MAKEROWY(RGBA, 3, 2, 1, 4)
 295 MAKEROWY(RGB24, 2, 1, 0, 3)
 296 MAKEROWY(RAW, 0, 1, 2, 3)
 297 #undef MAKEROWY
 298
 299 // JPeg uses a variation on BT.601-1 full range
 300 // y =  0.29900 * r + 0.58700 * g + 0.11400 * b
 301 // u = -0.16874 * r - 0.33126 * g + 0.50000 * b  + center
 302 // v =  0.50000 * r - 0.41869 * g - 0.08131 * b  + center
 303 // BT.601 Mpeg range uses:
 304 // b 0.1016 * 255 = 25.908 = 25
 305 // g 0.5078 * 255 = 129.489 = 129
 306 // r 0.2578 * 255 = 65.739 = 66
 307 // JPeg 8 bit Y (not used):
 308 // b 0.11400 * 256 = 29.184 = 29
 309 // g 0.58700 * 256 = 150.272 = 150
 310 // r 0.29900 * 256 = 76.544 = 77
 311 // JPeg 7 bit Y:
 312 // b 0.11400 * 128 = 14.592 = 15
 313 // g 0.58700 * 128 = 75.136 = 75
 314 // r 0.29900 * 128 = 38.272 = 38
 315 // JPeg 8 bit U:
 316 // b  0.50000 * 255 = 127.5 = 127
 317 // g -0.33126 * 255 = -84.4713 = -84
 318 // r -0.16874 * 255 = -43.0287 = -43
 319 // JPeg 8 bit V:
 320 // b -0.08131 * 255 = -20.73405 = -20
 321 // g -0.41869 * 255 = -106.76595 = -107
 322 // r  0.50000 * 255 = 127.5 = 127
 323
 324 static __inline int RGBToYJ(uint8 r, uint8 g, uint8 b) {
 325   return (38 * r + 75 * g +  15 * b + 64) >> 7;
 326 }
 327
 328 static __inline int RGBToUJ(uint8 r, uint8 g, uint8 b) {
 329   return (127 * b - 84 * g - 43 * r + 0x8080) >> 8;
 330 }
 331 static __inline int RGBToVJ(uint8 r, uint8 g, uint8 b) {
 332   return (127 * r - 107 * g - 20 * b + 0x8080) >> 8;
 333 }
 334
 335 #define AVGB(a, b) (((a) + (b) + 1) >> 1)
 336
 337 #define MAKEROWYJ(NAME, R, G, B, BPP) \
 338 void NAME ## ToYJRow_C(const uint8* src_argb0, uint8* dst_y, int width) {      \
 339   for (int x = 0; x < width; ++x) {                                            \
 340     dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]);              \
 341     src_argb0 += BPP;                                                          \
 342     dst_y += 1;                                                                \
 343   }                                                                            \
 344 }                                                                              \
 345 void NAME ## ToUVJRow_C(const uint8* src_rgb0, int src_stride_rgb,             \
 346                         uint8* dst_u, uint8* dst_v, int width) {               \
 347   const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;                           \
 348   for (int x = 0; x < width - 1; x += 2) {                                     \
 349     uint8 ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]),                            \
 350                     AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP]));               \
 351     uint8 ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]),                            \
 352                     AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP]));               \
 353     uint8 ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]),                            \
 354                     AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP]));               \
 355     dst_u[0] = RGBToUJ(ar, ag, ab);                                            \
 356     dst_v[0] = RGBToVJ(ar, ag, ab);                                            \
 357     src_rgb0 += BPP * 2;                                                       \
 358     src_rgb1 += BPP * 2;                                                       \
 359     dst_u += 1;                                                                \
 360     dst_v += 1;                                                                \
 361   }                                                                            \
 362   if (width & 1) {                                                             \
 363     uint8 ab = AVGB(src_rgb0[B], src_rgb1[B]);                                 \
 364     uint8 ag = AVGB(src_rgb0[G], src_rgb1[G]);                                 \
 365     uint8 ar = AVGB(src_rgb0[R], src_rgb1[R]);                                 \
 366     dst_u[0] = RGBToUJ(ar, ag, ab);                                            \
 367     dst_v[0] = RGBToVJ(ar, ag, ab);                                            \
 368   }                                                                            \
 369 }
 370
 371 MAKEROWYJ(ARGB, 2, 1, 0, 4)
 372 #undef MAKEROWYJ
 373
 374 void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width) {
 375   for (int x = 0; x < width; ++x) {
 376     uint8 b = src_rgb565[0] & 0x1f;
 377     uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
 378     uint8 r = src_rgb565[1] >> 3;
 379     b = (b << 3) | (b >> 2);
 380     g = (g << 2) | (g >> 4);
 381     r = (r << 3) | (r >> 2);
 382     dst_y[0] = RGBToY(r, g, b);
 383     src_rgb565 += 2;
 384     dst_y += 1;
 385   }
 386 }
 387
 388 void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int width) {
 389   for (int x = 0; x < width; ++x) {
 390     uint8 b = src_argb1555[0] & 0x1f;
 391     uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
 392     uint8 r = (src_argb1555[1] & 0x7c) >> 2;
 393     b = (b << 3) | (b >> 2);
 394     g = (g << 3) | (g >> 2);
 395     r = (r << 3) | (r >> 2);
 396     dst_y[0] = RGBToY(r, g, b);
 397     src_argb1555 += 2;
 398     dst_y += 1;
 399   }
 400 }
 401
 402 void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int width) {
 403   for (int x = 0; x < width; ++x) {
 404     uint8 b = src_argb4444[0] & 0x0f;
 405     uint8 g = src_argb4444[0] >> 4;
 406     uint8 r = src_argb4444[1] & 0x0f;
 407     b = (b << 4) | b;
 408     g = (g << 4) | g;
 409     r = (r << 4) | r;
 410     dst_y[0] = RGBToY(r, g, b);
 411     src_argb4444 += 2;
 412     dst_y += 1;
 413   }
 414 }
 415
 416 void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565,
 417                      uint8* dst_u, uint8* dst_v, int width) {
 418   const uint8* next_rgb565 = src_rgb565 + src_stride_rgb565;
 419   for (int x = 0; x < width - 1; x += 2) {
 420     uint8 b0 = src_rgb565[0] & 0x1f;
 421     uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
 422     uint8 r0 = src_rgb565[1] >> 3;
 423     uint8 b1 = src_rgb565[2] & 0x1f;
 424     uint8 g1 = (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3);
 425     uint8 r1 = src_rgb565[3] >> 3;
 426     uint8 b2 = next_rgb565[0] & 0x1f;
 427     uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
 428     uint8 r2 = next_rgb565[1] >> 3;
 429     uint8 b3 = next_rgb565[2] & 0x1f;
 430     uint8 g3 = (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3);
 431     uint8 r3 = next_rgb565[3] >> 3;
 432     uint8 b = (b0 + b1 + b2 + b3);  // 565 * 4 = 787.
 433     uint8 g = (g0 + g1 + g2 + g3);
 434     uint8 r = (r0 + r1 + r2 + r3);
 435     b = (b << 1) | (b >> 6);  // 787 -> 888.
 436     r = (r << 1) | (r >> 6);
 437     dst_u[0] = RGBToU(r, g, b);
 438     dst_v[0] = RGBToV(r, g, b);
 439     src_rgb565 += 4;
 440     next_rgb565 += 4;
 441     dst_u += 1;
 442     dst_v += 1;
 443   }
 444   if (width & 1) {
 445     uint8 b0 = src_rgb565[0] & 0x1f;
 446     uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
 447     uint8 r0 = src_rgb565[1] >> 3;
 448     uint8 b2 = next_rgb565[0] & 0x1f;
 449     uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
 450     uint8 r2 = next_rgb565[1] >> 3;
 451     uint8 b = (b0 + b2);  // 565 * 2 = 676.
 452     uint8 g = (g0 + g2);
 453     uint8 r = (r0 + r2);
 454     b = (b << 2) | (b >> 4);  // 676 -> 888
 455     g = (g << 1) | (g >> 6);
 456     r = (r << 2) | (r >> 4);
 457     dst_u[0] = RGBToU(r, g, b);
 458     dst_v[0] = RGBToV(r, g, b);
 459   }
 460 }
 461
 462 void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555,
 463                        uint8* dst_u, uint8* dst_v, int width) {
 464   const uint8* next_argb1555 = src_argb1555 + src_stride_argb1555;
 465   for (int x = 0; x < width - 1; x += 2) {
 466     uint8 b0 = src_argb1555[0] & 0x1f;
 467     uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
 468     uint8 r0 = (src_argb1555[1] & 0x7c) >> 2;
 469     uint8 b1 = src_argb1555[2] & 0x1f;
 470     uint8 g1 = (src_argb1555[2] >> 5) | ((src_argb1555[3] & 0x03) << 3);
 471     uint8 r1 = (src_argb1555[3] & 0x7c) >> 2;
 472     uint8 b2 = next_argb1555[0] & 0x1f;
 473     uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
 474     uint8 r2 = (next_argb1555[1] & 0x7c) >> 2;
 475     uint8 b3 = next_argb1555[2] & 0x1f;
 476     uint8 g3 = (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3);
 477     uint8 r3 = (next_argb1555[3] & 0x7c) >> 2;
 478     uint8 b = (b0 + b1 + b2 + b3);  // 555 * 4 = 777.
 479     uint8 g = (g0 + g1 + g2 + g3);
 480     uint8 r = (r0 + r1 + r2 + r3);
 481     b = (b << 1) | (b >> 6);  // 777 -> 888.
 482     g = (g << 1) | (g >> 6);
 483     r = (r << 1) | (r >> 6);
 484     dst_u[0] = RGBToU(r, g, b);
 485     dst_v[0] = RGBToV(r, g, b);
 486     src_argb1555 += 4;
 487     next_argb1555 += 4;
 488     dst_u += 1;
 489     dst_v += 1;
 490   }
 491   if (width & 1) {
 492     uint8 b0 = src_argb1555[0] & 0x1f;
 493     uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
 494     uint8 r0 = (src_argb1555[1] & 0x7c) >> 2;
 495     uint8 b2 = next_argb1555[0] & 0x1f;
 496     uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
 497     uint8 r2 = next_argb1555[1] >> 3;
 498     uint8 b = (b0 + b2);  // 555 * 2 = 666.
 499     uint8 g = (g0 + g2);
 500     uint8 r = (r0 + r2);
 501     b = (b << 2) | (b >> 4);  // 666 -> 888.
 502     g = (g << 2) | (g >> 4);
 503     r = (r << 2) | (r >> 4);
 504     dst_u[0] = RGBToU(r, g, b);
 505     dst_v[0] = RGBToV(r, g, b);
 506   }
 507 }
 508
 509 void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444,
 510                        uint8* dst_u, uint8* dst_v, int width) {
 511   const uint8* next_argb4444 = src_argb4444 + src_stride_argb4444;
 512   for (int x = 0; x < width - 1; x += 2) {
 513     uint8 b0 = src_argb4444[0] & 0x0f;
 514     uint8 g0 = src_argb4444[0] >> 4;
 515     uint8 r0 = src_argb4444[1] & 0x0f;
 516     uint8 b1 = src_argb4444[2] & 0x0f;
 517     uint8 g1 = src_argb4444[2] >> 4;
 518     uint8 r1 = src_argb4444[3] & 0x0f;
 519     uint8 b2 = next_argb4444[0] & 0x0f;
 520     uint8 g2 = next_argb4444[0] >> 4;
 521     uint8 r2 = next_argb4444[1] & 0x0f;
 522     uint8 b3 = next_argb4444[2] & 0x0f;
 523     uint8 g3 = next_argb4444[2] >> 4;
 524     uint8 r3 = next_argb4444[3] & 0x0f;
 525     uint8 b = (b0 + b1 + b2 + b3);  // 444 * 4 = 666.
 526     uint8 g = (g0 + g1 + g2 + g3);
 527     uint8 r = (r0 + r1 + r2 + r3);
 528     b = (b << 2) | (b >> 4);  // 666 -> 888.
 529     g = (g << 2) | (g >> 4);
 530     r = (r << 2) | (r >> 4);
 531     dst_u[0] = RGBToU(r, g, b);
 532     dst_v[0] = RGBToV(r, g, b);
 533     src_argb4444 += 4;
 534     next_argb4444 += 4;
 535     dst_u += 1;
 536     dst_v += 1;
 537   }
 538   if (width & 1) {
 539     uint8 b0 = src_argb4444[0] & 0x0f;
 540     uint8 g0 = src_argb4444[0] >> 4;
 541     uint8 r0 = src_argb4444[1] & 0x0f;
 542     uint8 b2 = next_argb4444[0] & 0x0f;
 543     uint8 g2 = next_argb4444[0] >> 4;
 544     uint8 r2 = next_argb4444[1] & 0x0f;
 545     uint8 b = (b0 + b2);  // 444 * 2 = 555.
 546     uint8 g = (g0 + g2);
 547     uint8 r = (r0 + r2);
 548     b = (b << 3) | (b >> 2);  // 555 -> 888.
 549     g = (g << 3) | (g >> 2);
 550     r = (r << 3) | (r >> 2);
 551     dst_u[0] = RGBToU(r, g, b);
 552     dst_v[0] = RGBToV(r, g, b);
 553   }
 554 }
 555
 556 void ARGBToUV444Row_C(const uint8* src_argb,
 557                       uint8* dst_u, uint8* dst_v, int width) {
 558   for (int x = 0; x < width; ++x) {
 559     uint8 ab = src_argb[0];
 560     uint8 ag = src_argb[1];
 561     uint8 ar = src_argb[2];
 562     dst_u[0] = RGBToU(ar, ag, ab);
 563     dst_v[0] = RGBToV(ar, ag, ab);
 564     src_argb += 4;
 565     dst_u += 1;
 566     dst_v += 1;
 567   }
 568 }
 569
 570 void ARGBToUV422Row_C(const uint8* src_argb,
 571                       uint8* dst_u, uint8* dst_v, int width) {
 572   for (int x = 0; x < width - 1; x += 2) {
 573     uint8 ab = (src_argb[0] + src_argb[4]) >> 1;
 574     uint8 ag = (src_argb[1] + src_argb[5]) >> 1;
 575     uint8 ar = (src_argb[2] + src_argb[6]) >> 1;
 576     dst_u[0] = RGBToU(ar, ag, ab);
 577     dst_v[0] = RGBToV(ar, ag, ab);
 578     src_argb += 8;
 579     dst_u += 1;
 580     dst_v += 1;
 581   }
 582   if (width & 1) {
 583     uint8 ab = src_argb[0];
 584     uint8 ag = src_argb[1];
 585     uint8 ar = src_argb[2];
 586     dst_u[0] = RGBToU(ar, ag, ab);
 587     dst_v[0] = RGBToV(ar, ag, ab);
 588   }
 589 }
 590
 591 void ARGBToUV411Row_C(const uint8* src_argb,
 592                       uint8* dst_u, uint8* dst_v, int width) {
 593   for (int x = 0; x < width - 3; x += 4) {
 594     uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8] + src_argb[12]) >> 2;
 595     uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9] + src_argb[13]) >> 2;
 596     uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10] + src_argb[14]) >> 2;
 597     dst_u[0] = RGBToU(ar, ag, ab);
 598     dst_v[0] = RGBToV(ar, ag, ab);
 599     src_argb += 16;
 600     dst_u += 1;
 601     dst_v += 1;
 602   }
 603   if ((width & 3) == 3) {
 604     uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8]) / 3;
 605     uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9]) / 3;
 606     uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10]) / 3;
 607     dst_u[0] = RGBToU(ar, ag, ab);
 608     dst_v[0] = RGBToV(ar, ag, ab);
 609   } else if ((width & 3) == 2) {
 610     uint8 ab = (src_argb[0] + src_argb[4]) >> 1;
 611     uint8 ag = (src_argb[1] + src_argb[5]) >> 1;
 612     uint8 ar = (src_argb[2] + src_argb[6]) >> 1;
 613     dst_u[0] = RGBToU(ar, ag, ab);
 614     dst_v[0] = RGBToV(ar, ag, ab);
 615   } else if ((width & 3) == 1) {
 616     uint8 ab = src_argb[0];
 617     uint8 ag = src_argb[1];
 618     uint8 ar = src_argb[2];
 619     dst_u[0] = RGBToU(ar, ag, ab);
 620     dst_v[0] = RGBToV(ar, ag, ab);
 621   }
 622 }
 623
 624 void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
 625   for (int x = 0; x < width; ++x) {
 626     uint8 y = RGBToYJ(src_argb[2], src_argb[1], src_argb[0]);
 627     dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
 628     dst_argb[3] = src_argb[3];
 629     dst_argb += 4;
 630     src_argb += 4;
 631   }
 632 }
 633
 634 // Convert a row of image to Sepia tone.
 635 void ARGBSepiaRow_C(uint8* dst_argb, int width) {
 636   for (int x = 0; x < width; ++x) {
 637     int b = dst_argb[0];
 638     int g = dst_argb[1];
 639     int r = dst_argb[2];
 640     int sb = (b * 17 + g * 68 + r * 35) >> 7;
 641     int sg = (b * 22 + g * 88 + r * 45) >> 7;
 642     int sr = (b * 24 + g * 98 + r * 50) >> 7;
 643     // b does not over flow. a is preserved from original.
 644     dst_argb[0] = sb;
 645     dst_argb[1] = clamp255(sg);
 646     dst_argb[2] = clamp255(sr);
 647     dst_argb += 4;
 648   }
 649 }
 650
 651 // Apply color matrix to a row of image. Matrix is signed.
 652 void ARGBColorMatrixRow_C(uint8* dst_argb, const int8* matrix_argb, int width) {
 653   for (int x = 0; x < width; ++x) {
 654     int b = dst_argb[0];
 655     int g = dst_argb[1];
 656     int r = dst_argb[2];
 657     int a = dst_argb[3];
 658     int sb = (b * matrix_argb[0] + g * matrix_argb[1] +
 659               r * matrix_argb[2] + a * matrix_argb[3]) >> 7;
 660     int sg = (b * matrix_argb[4] + g * matrix_argb[5] +
 661               r * matrix_argb[6] + a * matrix_argb[7]) >> 7;
 662     int sr = (b * matrix_argb[8] + g * matrix_argb[9] +
 663               r * matrix_argb[10] + a * matrix_argb[11]) >> 7;
 664     dst_argb[0] = Clamp(sb);
 665     dst_argb[1] = Clamp(sg);
 666     dst_argb[2] = Clamp(sr);
 667     dst_argb += 4;
 668   }
 669 }
 670
 671 // Apply color table to a row of image.
 672 void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) {
 673   for (int x = 0; x < width; ++x) {
 674     int b = dst_argb[0];
 675     int g = dst_argb[1];
 676     int r = dst_argb[2];
 677     int a = dst_argb[3];
 678     dst_argb[0] = table_argb[b * 4 + 0];
 679     dst_argb[1] = table_argb[g * 4 + 1];
 680     dst_argb[2] = table_argb[r * 4 + 2];
 681     dst_argb[3] = table_argb[a * 4 + 3];
 682     dst_argb += 4;
 683   }
 684 }
 685
 686 void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size,
 687                        int interval_offset, int width) {
 688   for (int x = 0; x < width; ++x) {
 689     int b = dst_argb[0];
 690     int g = dst_argb[1];
 691     int r = dst_argb[2];
 692     dst_argb[0] = (b * scale >> 16) * interval_size + interval_offset;
 693     dst_argb[1] = (g * scale >> 16) * interval_size + interval_offset;
 694     dst_argb[2] = (r * scale >> 16) * interval_size + interval_offset;
 695     dst_argb += 4;
 696   }
 697 }
 698
 699 #define REPEAT8(v) (v) | ((v) << 8)
 700 #define SHADE(f, v) v * f >> 24
 701
 702 void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
 703                     uint32 value) {
 704   const uint32 b_scale = REPEAT8(value & 0xff);
 705   const uint32 g_scale = REPEAT8((value >> 8) & 0xff);
 706   const uint32 r_scale = REPEAT8((value >> 16) & 0xff);
 707   const uint32 a_scale = REPEAT8(value >> 24);
 708
 709   for (int i = 0; i < width; ++i) {
 710     const uint32 b = REPEAT8(src_argb[0]);
 711     const uint32 g = REPEAT8(src_argb[1]);
 712     const uint32 r = REPEAT8(src_argb[2]);
 713     const uint32 a = REPEAT8(src_argb[3]);
 714     dst_argb[0] = SHADE(b, b_scale);
 715     dst_argb[1] = SHADE(g, g_scale);
 716     dst_argb[2] = SHADE(r, r_scale);
 717     dst_argb[3] = SHADE(a, a_scale);
 718     src_argb += 4;
 719     dst_argb += 4;
 720   }
 721 }
 722 #undef REPEAT8
 723 #undef SHADE
 724
 725 #define REPEAT8(v) (v) | ((v) << 8)
 726 #define SHADE(f, v) v * f >> 16
 727
 728 void ARGBMultiplyRow_C(const uint8* src_argb0, const uint8* src_argb1,
 729                        uint8* dst_argb, int width) {
 730   for (int i = 0; i < width; ++i) {
 731     const uint32 b = REPEAT8(src_argb0[0]);
 732     const uint32 g = REPEAT8(src_argb0[1]);
 733     const uint32 r = REPEAT8(src_argb0[2]);
 734     const uint32 a = REPEAT8(src_argb0[3]);
 735     const uint32 b_scale = src_argb1[0];
 736     const uint32 g_scale = src_argb1[1];
 737     const uint32 r_scale = src_argb1[2];
 738     const uint32 a_scale = src_argb1[3];
 739     dst_argb[0] = SHADE(b, b_scale);
 740     dst_argb[1] = SHADE(g, g_scale);
 741     dst_argb[2] = SHADE(r, r_scale);
 742     dst_argb[3] = SHADE(a, a_scale);
 743     src_argb0 += 4;
 744     src_argb1 += 4;
 745     dst_argb += 4;
 746   }
 747 }
 748 #undef REPEAT8
 749 #undef SHADE
 750
 751 #define SHADE(f, v) clamp255(v + f)
 752
 753 void ARGBAddRow_C(const uint8* src_argb0, const uint8* src_argb1,
 754                   uint8* dst_argb, int width) {
 755   for (int i = 0; i < width; ++i) {
 756     const int b = src_argb0[0];
 757     const int g = src_argb0[1];
 758     const int r = src_argb0[2];
 759     const int a = src_argb0[3];
 760     const int b_add = src_argb1[0];
 761     const int g_add = src_argb1[1];
 762     const int r_add = src_argb1[2];
 763     const int a_add = src_argb1[3];
 764     dst_argb[0] = SHADE(b, b_add);
 765     dst_argb[1] = SHADE(g, g_add);
 766     dst_argb[2] = SHADE(r, r_add);
 767     dst_argb[3] = SHADE(a, a_add);
 768     src_argb0 += 4;
 769     src_argb1 += 4;
 770     dst_argb += 4;
 771   }
 772 }
 773 #undef SHADE
 774
 775 #define SHADE(f, v) clamp0(f - v)
 776
 777 void ARGBSubtractRow_C(const uint8* src_argb0, const uint8* src_argb1,
 778                        uint8* dst_argb, int width) {
 779   for (int i = 0; i < width; ++i) {
 780     const int b = src_argb0[0];
 781     const int g = src_argb0[1];
 782     const int r = src_argb0[2];
 783     const int a = src_argb0[3];
 784     const int b_sub = src_argb1[0];
 785     const int g_sub = src_argb1[1];
 786     const int r_sub = src_argb1[2];
 787     const int a_sub = src_argb1[3];
 788     dst_argb[0] = SHADE(b, b_sub);
 789     dst_argb[1] = SHADE(g, g_sub);
 790     dst_argb[2] = SHADE(r, r_sub);
 791     dst_argb[3] = SHADE(a, a_sub);
 792     src_argb0 += 4;
 793     src_argb1 += 4;
 794     dst_argb += 4;
 795   }
 796 }
 797 #undef SHADE
 798
 799 // Sobel functions which mimics SSSE3.
 800 void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2,
 801                  uint8* dst_sobelx, int width) {
 802   for (int i = 0; i < width; ++i) {
 803     int a = src_y0[i];
 804     int b = src_y1[i];
 805     int c = src_y2[i];
 806     int a_sub = src_y0[i + 2];
 807     int b_sub = src_y1[i + 2];
 808     int c_sub = src_y2[i + 2];
 809     int a_diff = a - a_sub;
 810     int b_diff = b - b_sub;
 811     int c_diff = c - c_sub;
 812     int sobel = Abs(a_diff + b_diff * 2 + c_diff);
 813     dst_sobelx[i] = static_cast<uint8>(clamp255(sobel));
 814   }
 815 }
 816
 817 void SobelYRow_C(const uint8* src_y0, const uint8* src_y1,
 818                  uint8* dst_sobely, int width) {
 819   for (int i = 0; i < width; ++i) {
 820     int a = src_y0[i + 0];
 821     int b = src_y0[i + 1];
 822     int c = src_y0[i + 2];
 823     int a_sub = src_y1[i + 0];
 824     int b_sub = src_y1[i + 1];
 825     int c_sub = src_y1[i + 2];
 826     int a_diff = a - a_sub;
 827     int b_diff = b - b_sub;
 828     int c_diff = c - c_sub;
 829     int sobel = Abs(a_diff + b_diff * 2 + c_diff);
 830     dst_sobely[i] = static_cast<uint8>(clamp255(sobel));
 831   }
 832 }
 833
 834 void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely,
 835                 uint8* dst_argb, int width) {
 836   for (int i = 0; i < width; ++i) {
 837     int r = src_sobelx[i];
 838     int b = src_sobely[i];
 839     int s = clamp255(r + b);
 840     dst_argb[0] = static_cast<uint8>(s);
 841     dst_argb[1] = static_cast<uint8>(s);
 842     dst_argb[2] = static_cast<uint8>(s);
 843     dst_argb[3] = static_cast<uint8>(255u);
 844     dst_argb += 4;
 845   }
 846 }
 847
 848 void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely,
 849                   uint8* dst_argb, int width) {
 850   for (int i = 0; i < width; ++i) {
 851     int r = src_sobelx[i];
 852     int b = src_sobely[i];
 853     int g = clamp255(r + b);
 854     dst_argb[0] = static_cast<uint8>(b);
 855     dst_argb[1] = static_cast<uint8>(g);
 856     dst_argb[2] = static_cast<uint8>(r);
 857     dst_argb[3] = static_cast<uint8>(255u);
 858     dst_argb += 4;
 859   }
 860 }
 861
 862 void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
 863   // Copy a Y to RGB.
 864   for (int x = 0; x < width; ++x) {
 865     uint8 y = src_y[0];
 866     dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
 867     dst_argb[3] = 255u;
 868     dst_argb += 4;
 869     ++src_y;
 870   }
 871 }
 872
 873 // C reference code that mimics the YUV assembly.
 874
 875 #define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
 876
 877 #define UB 127 /* min(63,static_cast<int8>(2.018 * 64)) */
 878 #define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */
 879 #define UR 0
 880
 881 #define VB 0
 882 #define VG -52 /* static_cast<int8>(-0.813 * 64 - 0.5) */
 883 #define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */
 884
 885 // Bias
 886 #define BB UB * 128 + VB * 128
 887 #define BG UG * 128 + VG * 128
 888 #define BR UR * 128 + VR * 128
 889
 890 static __inline void YuvPixel(uint8 y, uint8 u, uint8 v,
 891                               uint8* b, uint8* g, uint8* r) {
 892   int32 y1 = (static_cast<int32>(y) - 16) * YG;
 893   *b = Clamp(static_cast<int32>((u * UB + v * VB) - (BB) + y1) >> 6);
 894   *g = Clamp(static_cast<int32>((u * UG + v * VG) - (BG) + y1) >> 6);
 895   *r = Clamp(static_cast<int32>((u * UR + v * VR) - (BR) + y1) >> 6);
 896 }
 897
 898 #if !defined(LIBYUV_DISABLE_NEON) && \
 899     (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
 900 // C mimic assembly.
 901 // TODO(fbarchard): Remove subsampling from Neon.
 902 void I444ToARGBRow_C(const uint8* src_y,
 903                      const uint8* src_u,
 904                      const uint8* src_v,
 905                      uint8* rgb_buf,
 906                      int width) {
 907   for (int x = 0; x < width - 1; x += 2) {
 908     uint8 u = (src_u[0] + src_u[1] + 1) >> 1;
 909     uint8 v = (src_v[0] + src_v[1] + 1) >> 1;
 910     YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
 911     rgb_buf[3] = 255;
 912     YuvPixel(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
 913     rgb_buf[7] = 255;
 914     src_y += 2;
 915     src_u += 2;
 916     src_v += 2;
 917     rgb_buf += 8;  // Advance 2 pixels.
 918   }
 919   if (width & 1) {
 920     YuvPixel(src_y[0], src_u[0], src_v[0],
 921              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
 922   }
 923 }
 924 #else
 925 void I444ToARGBRow_C(const uint8* src_y,
 926                      const uint8* src_u,
 927                      const uint8* src_v,
 928                      uint8* rgb_buf,
 929                      int width) {
 930   for (int x = 0; x < width; ++x) {
 931     YuvPixel(src_y[0], src_u[0], src_v[0],
 932              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
 933     rgb_buf[3] = 255;
 934     src_y += 1;
 935     src_u += 1;
 936     src_v += 1;
 937     rgb_buf += 4;  // Advance 1 pixel.
 938   }
 939 }
 940 #endif
 941 // Also used for 420
 942 void I422ToARGBRow_C(const uint8* src_y,
 943                      const uint8* src_u,
 944                      const uint8* src_v,
 945                      uint8* rgb_buf,
 946                      int width) {
 947   for (int x = 0; x < width - 1; x += 2) {
 948     YuvPixel(src_y[0], src_u[0], src_v[0],
 949              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
 950     rgb_buf[3] = 255;
 951     YuvPixel(src_y[1], src_u[0], src_v[0],
 952              rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
 953     rgb_buf[7] = 255;
 954     src_y += 2;
 955     src_u += 1;
 956     src_v += 1;
 957     rgb_buf += 8;  // Advance 2 pixels.
 958   }
 959   if (width & 1) {
 960     YuvPixel(src_y[0], src_u[0], src_v[0],
 961              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
 962     rgb_buf[3] = 255;
 963   }
 964 }
 965
 966 void I422ToRGB24Row_C(const uint8* src_y,
 967                       const uint8* src_u,
 968                       const uint8* src_v,
 969                       uint8* rgb_buf,
 970                       int width) {
 971   for (int x = 0; x < width - 1; x += 2) {
 972     YuvPixel(src_y[0], src_u[0], src_v[0],
 973              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
 974     YuvPixel(src_y[1], src_u[0], src_v[0],
 975              rgb_buf + 3, rgb_buf + 4, rgb_buf + 5);
 976     src_y += 2;
 977     src_u += 1;
 978     src_v += 1;
 979     rgb_buf += 6;  // Advance 2 pixels.
 980   }
 981   if (width & 1) {
 982     YuvPixel(src_y[0], src_u[0], src_v[0],
 983              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
 984   }
 985 }
 986
 987 void I422ToRAWRow_C(const uint8* src_y,
 988                     const uint8* src_u,
 989                     const uint8* src_v,
 990                     uint8* rgb_buf,
 991                     int width) {
 992   for (int x = 0; x < width - 1; x += 2) {
 993     YuvPixel(src_y[0], src_u[0], src_v[0],
 994              rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
 995     YuvPixel(src_y[1], src_u[0], src_v[0],
 996              rgb_buf + 5, rgb_buf + 4, rgb_buf + 3);
 997     src_y += 2;
 998     src_u += 1;
 999     src_v += 1;
1000     rgb_buf += 6;  // Advance 2 pixels.
1001   }
1002   if (width & 1) {
1003     YuvPixel(src_y[0], src_u[0], src_v[0],
1004              rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
1005   }
1006 }
1007
1008 void I422ToARGB4444Row_C(const uint8* src_y,
1009                          const uint8* src_u,
1010                          const uint8* src_v,
1011                          uint8* dst_argb4444,
1012                          int width) {
1013   uint8 b0;
1014   uint8 g0;
1015   uint8 r0;
1016   uint8 b1;
1017   uint8 g1;
1018   uint8 r1;
1019   for (int x = 0; x < width - 1; x += 2) {
1020     YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
1021     YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1);
1022     b0 = b0 >> 4;
1023     g0 = g0 >> 4;
1024     r0 = r0 >> 4;
1025     b1 = b1 >> 4;
1026     g1 = g1 >> 4;
1027     r1 = r1 >> 4;
1028     *reinterpret_cast<uint32*>(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) |
1029         (b1 << 16) | (g1 << 20) | (r1 << 24) | 0xf000f000;
1030     src_y += 2;
1031     src_u += 1;
1032     src_v += 1;
1033     dst_argb4444 += 4;  // Advance 2 pixels.
1034   }
1035   if (width & 1) {
1036     YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
1037     b0 = b0 >> 4;
1038     g0 = g0 >> 4;
1039     r0 = r0 >> 4;
1040     *reinterpret_cast<uint16*>(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) |
1041         0xf000;
1042   }
1043 }
1044
1045 void I422ToARGB1555Row_C(const uint8* src_y,
1046                          const uint8* src_u,
1047                          const uint8* src_v,
1048                          uint8* dst_argb1555,
1049                          int width) {
1050   uint8 b0;
1051   uint8 g0;
1052   uint8 r0;
1053   uint8 b1;
1054   uint8 g1;
1055   uint8 r1;
1056   for (int x = 0; x < width - 1; x += 2) {
1057     YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
1058     YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1);
1059     b0 = b0 >> 3;
1060     g0 = g0 >> 3;
1061     r0 = r0 >> 3;
1062     b1 = b1 >> 3;
1063     g1 = g1 >> 3;
1064     r1 = r1 >> 3;
1065     *reinterpret_cast<uint32*>(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) |
1066         (b1 << 16) | (g1 << 21) | (r1 << 26) | 0x80008000;
1067     src_y += 2;
1068     src_u += 1;
1069     src_v += 1;
1070     dst_argb1555 += 4;  // Advance 2 pixels.
1071   }
1072   if (width & 1) {
1073     YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
1074     b0 = b0 >> 3;
1075     g0 = g0 >> 3;
1076     r0 = r0 >> 3;
1077     *reinterpret_cast<uint16*>(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) |
1078         0x8000;
1079   }
1080 }
1081
1082 void I422ToRGB565Row_C(const uint8* src_y,
1083                        const uint8* src_u,
1084                        const uint8* src_v,
1085                        uint8* dst_rgb565,
1086                        int width) {
1087   uint8 b0;
1088   uint8 g0;
1089   uint8 r0;
1090   uint8 b1;
1091   uint8 g1;
1092   uint8 r1;
1093   for (int x = 0; x < width - 1; x += 2) {
1094     YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
1095     YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1);
1096     b0 = b0 >> 3;
1097     g0 = g0 >> 2;
1098     r0 = r0 >> 3;
1099     b1 = b1 >> 3;
1100     g1 = g1 >> 2;
1101     r1 = r1 >> 3;
1102     *reinterpret_cast<uint32*>(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) |
1103         (b1 << 16) | (g1 << 21) | (r1 << 27);
1104     src_y += 2;
1105     src_u += 1;
1106     src_v += 1;
1107     dst_rgb565 += 4;  // Advance 2 pixels.
1108   }
1109   if (width & 1) {
1110     YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
1111     b0 = b0 >> 3;
1112     g0 = g0 >> 2;
1113     r0 = r0 >> 3;
1114     *reinterpret_cast<uint16*>(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
1115   }
1116 }
1117
1118 void I411ToARGBRow_C(const uint8* src_y,
1119                      const uint8* src_u,
1120                      const uint8* src_v,
1121                      uint8* rgb_buf,
1122                      int width) {
1123   for (int x = 0; x < width - 3; x += 4) {
1124     YuvPixel(src_y[0], src_u[0], src_v[0],
1125              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1126     rgb_buf[3] = 255;
1127     YuvPixel(src_y[1], src_u[0], src_v[0],
1128              rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
1129     rgb_buf[7] = 255;
1130     YuvPixel(src_y[2], src_u[0], src_v[0],
1131              rgb_buf + 8, rgb_buf + 9, rgb_buf + 10);
1132     rgb_buf[11] = 255;
1133     YuvPixel(src_y[3], src_u[0], src_v[0],
1134              rgb_buf + 12, rgb_buf + 13, rgb_buf + 14);
1135     rgb_buf[15] = 255;
1136     src_y += 4;
1137     src_u += 1;
1138     src_v += 1;
1139     rgb_buf += 16;  // Advance 4 pixels.
1140   }
1141   if (width & 2) {
1142     YuvPixel(src_y[0], src_u[0], src_v[0],
1143              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1144     rgb_buf[3] = 255;
1145     YuvPixel(src_y[1], src_u[0], src_v[0],
1146              rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
1147     rgb_buf[7] = 255;
1148     src_y += 2;
1149     rgb_buf += 8;  // Advance 2 pixels.
1150   }
1151   if (width & 1) {
1152     YuvPixel(src_y[0], src_u[0], src_v[0],
1153              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1154     rgb_buf[3] = 255;
1155   }
1156 }
1157
1158 void NV12ToARGBRow_C(const uint8* src_y,
1159                      const uint8* usrc_v,
1160                      uint8* rgb_buf,
1161                      int width) {
1162   for (int x = 0; x < width - 1; x += 2) {
1163     YuvPixel(src_y[0], usrc_v[0], usrc_v[1],
1164              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1165     rgb_buf[3] = 255;
1166     YuvPixel(src_y[1], usrc_v[0], usrc_v[1],
1167              rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
1168     rgb_buf[7] = 255;
1169     src_y += 2;
1170     usrc_v += 2;
1171     rgb_buf += 8;  // Advance 2 pixels.
1172   }
1173   if (width & 1) {
1174     YuvPixel(src_y[0], usrc_v[0], usrc_v[1],
1175              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1176     rgb_buf[3] = 255;
1177   }
1178 }
1179
1180 void NV21ToARGBRow_C(const uint8* src_y,
1181                      const uint8* src_vu,
1182                      uint8* rgb_buf,
1183                      int width) {
1184   for (int x = 0; x < width - 1; x += 2) {
1185     YuvPixel(src_y[0], src_vu[1], src_vu[0],
1186              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1187     rgb_buf[3] = 255;
1188
1189     YuvPixel(src_y[1], src_vu[1], src_vu[0],
1190              rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
1191     rgb_buf[7] = 255;
1192
1193     src_y += 2;
1194     src_vu += 2;
1195     rgb_buf += 8;  // Advance 2 pixels.
1196   }
1197   if (width & 1) {
1198     YuvPixel(src_y[0], src_vu[1], src_vu[0],
1199              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1200     rgb_buf[3] = 255;
1201   }
1202 }
1203
1204 void NV12ToRGB565Row_C(const uint8* src_y,
1205                        const uint8* usrc_v,
1206                        uint8* dst_rgb565,
1207                        int width) {
1208   uint8 b0;
1209   uint8 g0;
1210   uint8 r0;
1211   uint8 b1;
1212   uint8 g1;
1213   uint8 r1;
1214   for (int x = 0; x < width - 1; x += 2) {
1215     YuvPixel(src_y[0], usrc_v[0], usrc_v[1], &b0, &g0, &r0);
1216     YuvPixel(src_y[1], usrc_v[0], usrc_v[1], &b1, &g1, &r1);
1217     b0 = b0 >> 3;
1218     g0 = g0 >> 2;
1219     r0 = r0 >> 3;
1220     b1 = b1 >> 3;
1221     g1 = g1 >> 2;
1222     r1 = r1 >> 3;
1223     *reinterpret_cast<uint32*>(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) |
1224         (b1 << 16) | (g1 << 21) | (r1 << 27);
1225     src_y += 2;
1226     usrc_v += 2;
1227     dst_rgb565 += 4;  // Advance 2 pixels.
1228   }
1229   if (width & 1) {
1230     YuvPixel(src_y[0], usrc_v[0], usrc_v[1], &b0, &g0, &r0);
1231     b0 = b0 >> 3;
1232     g0 = g0 >> 2;
1233     r0 = r0 >> 3;
1234     *reinterpret_cast<uint16*>(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
1235   }
1236 }
1237
1238 void NV21ToRGB565Row_C(const uint8* src_y,
1239                        const uint8* vsrc_u,
1240                        uint8* dst_rgb565,
1241                        int width) {
1242   uint8 b0;
1243   uint8 g0;
1244   uint8 r0;
1245   uint8 b1;
1246   uint8 g1;
1247   uint8 r1;
1248   for (int x = 0; x < width - 1; x += 2) {
1249     YuvPixel(src_y[0], vsrc_u[1], vsrc_u[0], &b0, &g0, &r0);
1250     YuvPixel(src_y[1], vsrc_u[1], vsrc_u[0], &b1, &g1, &r1);
1251     b0 = b0 >> 3;
1252     g0 = g0 >> 2;
1253     r0 = r0 >> 3;
1254     b1 = b1 >> 3;
1255     g1 = g1 >> 2;
1256     r1 = r1 >> 3;
1257     *reinterpret_cast<uint32*>(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) |
1258         (b1 << 16) | (g1 << 21) | (r1 << 27);
1259     src_y += 2;
1260     vsrc_u += 2;
1261     dst_rgb565 += 4;  // Advance 2 pixels.
1262   }
1263   if (width & 1) {
1264     YuvPixel(src_y[0], vsrc_u[1], vsrc_u[0], &b0, &g0, &r0);
1265     b0 = b0 >> 3;
1266     g0 = g0 >> 2;
1267     r0 = r0 >> 3;
1268     *reinterpret_cast<uint16*>(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
1269   }
1270 }
1271
1272 void YUY2ToARGBRow_C(const uint8* src_yuy2,
1273                      uint8* rgb_buf,
1274                      int width) {
1275   for (int x = 0; x < width - 1; x += 2) {
1276     YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3],
1277              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1278     rgb_buf[3] = 255;
1279     YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3],
1280              rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
1281     rgb_buf[7] = 255;
1282     src_yuy2 += 4;
1283     rgb_buf += 8;  // Advance 2 pixels.
1284   }
1285   if (width & 1) {
1286     YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3],
1287              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1288     rgb_buf[3] = 255;
1289   }
1290 }
1291
1292 void UYVYToARGBRow_C(const uint8* src_uyvy,
1293                      uint8* rgb_buf,
1294                      int width) {
1295   for (int x = 0; x < width - 1; x += 2) {
1296     YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2],
1297              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1298     rgb_buf[3] = 255;
1299     YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2],
1300              rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
1301     rgb_buf[7] = 255;
1302     src_uyvy += 4;
1303     rgb_buf += 8;  // Advance 2 pixels.
1304   }
1305   if (width & 1) {
1306     YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2],
1307              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1308     rgb_buf[3] = 255;
1309   }
1310 }
1311
1312 void I422ToBGRARow_C(const uint8* src_y,
1313                      const uint8* src_u,
1314                      const uint8* src_v,
1315                      uint8* rgb_buf,
1316                      int width) {
1317   for (int x = 0; x < width - 1; x += 2) {
1318     YuvPixel(src_y[0], src_u[0], src_v[0],
1319              rgb_buf + 3, rgb_buf + 2, rgb_buf + 1);
1320     rgb_buf[0] = 255;
1321     YuvPixel(src_y[1], src_u[0], src_v[0],
1322              rgb_buf + 7, rgb_buf + 6, rgb_buf + 5);
1323     rgb_buf[4] = 255;
1324     src_y += 2;
1325     src_u += 1;
1326     src_v += 1;
1327     rgb_buf += 8;  // Advance 2 pixels.
1328   }
1329   if (width & 1) {
1330     YuvPixel(src_y[0], src_u[0], src_v[0],
1331              rgb_buf + 3, rgb_buf + 2, rgb_buf + 1);
1332     rgb_buf[0] = 255;
1333   }
1334 }
1335
1336 void I422ToABGRRow_C(const uint8* src_y,
1337                      const uint8* src_u,
1338                      const uint8* src_v,
1339                      uint8* rgb_buf,
1340                      int width) {
1341   for (int x = 0; x < width - 1; x += 2) {
1342     YuvPixel(src_y[0], src_u[0], src_v[0],
1343              rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
1344     rgb_buf[3] = 255;
1345     YuvPixel(src_y[1], src_u[0], src_v[0],
1346              rgb_buf + 6, rgb_buf + 5, rgb_buf + 4);
1347     rgb_buf[7] = 255;
1348     src_y += 2;
1349     src_u += 1;
1350     src_v += 1;
1351     rgb_buf += 8;  // Advance 2 pixels.
1352   }
1353   if (width & 1) {
1354     YuvPixel(src_y[0], src_u[0], src_v[0],
1355              rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
1356     rgb_buf[3] = 255;
1357   }
1358 }
1359
1360 void I422ToRGBARow_C(const uint8* src_y,
1361                      const uint8* src_u,
1362                      const uint8* src_v,
1363                      uint8* rgb_buf,
1364                      int width) {
1365   for (int x = 0; x < width - 1; x += 2) {
1366     YuvPixel(src_y[0], src_u[0], src_v[0],
1367              rgb_buf + 1, rgb_buf + 2, rgb_buf + 3);
1368     rgb_buf[0] = 255;
1369     YuvPixel(src_y[1], src_u[0], src_v[0],
1370              rgb_buf + 5, rgb_buf + 6, rgb_buf + 7);
1371     rgb_buf[4] = 255;
1372     src_y += 2;
1373     src_u += 1;
1374     src_v += 1;
1375     rgb_buf += 8;  // Advance 2 pixels.
1376   }
1377   if (width & 1) {
1378     YuvPixel(src_y[0], src_u[0], src_v[0],
1379              rgb_buf + 1, rgb_buf + 2, rgb_buf + 3);
1380     rgb_buf[0] = 255;
1381   }
1382 }
1383
1384 void YToARGBRow_C(const uint8* src_y, uint8* rgb_buf, int width) {
1385   for (int x = 0; x < width - 1; x += 2) {
1386     YuvPixel(src_y[0], 128, 128,
1387              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1388     rgb_buf[3] = 255;
1389     YuvPixel(src_y[1], 128, 128,
1390              rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
1391     rgb_buf[7] = 255;
1392     src_y += 2;
1393     rgb_buf += 8;  // Advance 2 pixels.
1394   }
1395   if (width & 1) {
1396     YuvPixel(src_y[0], 128, 128,
1397              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1398     rgb_buf[3] = 255;
1399   }
1400 }
1401
1402 void MirrorRow_C(const uint8* src, uint8* dst, int width) {
1403   src += width - 1;
1404   for (int x = 0; x < width - 1; x += 2) {
1405     dst[x] = src[0];
1406     dst[x + 1] = src[-1];
1407     src -= 2;
1408   }
1409   if (width & 1) {
1410     dst[width - 1] = src[0];
1411   }
1412 }
1413
1414 void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
1415   src_uv += (width - 1) << 1;
1416   for (int x = 0; x < width - 1; x += 2) {
1417     dst_u[x] = src_uv[0];
1418     dst_u[x + 1] = src_uv[-2];
1419     dst_v[x] = src_uv[1];
1420     dst_v[x + 1] = src_uv[-2 + 1];
1421     src_uv -= 4;
1422   }
1423   if (width & 1) {
1424     dst_u[width - 1] = src_uv[0];
1425     dst_v[width - 1] = src_uv[1];
1426   }
1427 }
1428
1429 void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width) {
1430   const uint32* src32 = reinterpret_cast<const uint32*>(src);
1431   uint32* dst32 = reinterpret_cast<uint32*>(dst);
1432   src32 += width - 1;
1433   for (int x = 0; x < width - 1; x += 2) {
1434     dst32[x] = src32[0];
1435     dst32[x + 1] = src32[-1];
1436     src32 -= 2;
1437   }
1438   if (width & 1) {
1439     dst32[width - 1] = src32[0];
1440   }
1441 }
1442
1443 void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
1444   for (int x = 0; x < width - 1; x += 2) {
1445     dst_u[x] = src_uv[0];
1446     dst_u[x + 1] = src_uv[2];
1447     dst_v[x] = src_uv[1];
1448     dst_v[x + 1] = src_uv[3];
1449     src_uv += 4;
1450   }
1451   if (width & 1) {
1452     dst_u[width - 1] = src_uv[0];
1453     dst_v[width - 1] = src_uv[1];
1454   }
1455 }
1456
1457 void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
1458                   int width) {
1459   for (int x = 0; x < width - 1; x += 2) {
1460     dst_uv[0] = src_u[x];
1461     dst_uv[1] = src_v[x];
1462     dst_uv[2] = src_u[x + 1];
1463     dst_uv[3] = src_v[x + 1];
1464     dst_uv += 4;
1465   }
1466   if (width & 1) {
1467     dst_uv[0] = src_u[width - 1];
1468     dst_uv[1] = src_v[width - 1];
1469   }
1470 }
1471
1472 void CopyRow_C(const uint8* src, uint8* dst, int count) {
1473   memcpy(dst, src, count);
1474 }
1475
1476 void SetRow_C(uint8* dst, uint32 v8, int count) {
1477 #ifdef _MSC_VER
1478   // VC will generate rep stosb.
1479   for (int x = 0; x < count; ++x) {
1480     dst[x] = v8;
1481   }
1482 #else
1483   memset(dst, v8, count);
1484 #endif
1485 }
1486
1487 void ARGBSetRows_C(uint8* dst, uint32 v32, int width,
1488                  int dst_stride, int height) {
1489   for (int y = 0; y < height; ++y) {
1490     uint32* d = reinterpret_cast<uint32*>(dst);
1491     for (int x = 0; x < width; ++x) {
1492       d[x] = v32;
1493     }
1494     dst += dst_stride;
1495   }
1496 }
1497
1498 // Filter 2 rows of YUY2 UV's (422) into U and V (420).
1499 void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2,
1500                    uint8* dst_u, uint8* dst_v, int width) {
1501   // Output a row of UV values, filtering 2 rows of YUY2.
1502   for (int x = 0; x < width; x += 2) {
1503     dst_u[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1;
1504     dst_v[0] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1;
1505     src_yuy2 += 4;
1506     dst_u += 1;
1507     dst_v += 1;
1508   }
1509 }
1510
1511 // Copy row of YUY2 UV's (422) into U and V (422).
1512 void YUY2ToUV422Row_C(const uint8* src_yuy2,
1513                       uint8* dst_u, uint8* dst_v, int width) {
1514   // Output a row of UV values.
1515   for (int x = 0; x < width; x += 2) {
1516     dst_u[0] = src_yuy2[1];
1517     dst_v[0] = src_yuy2[3];
1518     src_yuy2 += 4;
1519     dst_u += 1;
1520     dst_v += 1;
1521   }
1522 }
1523
1524 // Copy row of YUY2 Y's (422) into Y (420/422).
1525 void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) {
1526   // Output a row of Y values.
1527   for (int x = 0; x < width - 1; x += 2) {
1528     dst_y[x] = src_yuy2[0];
1529     dst_y[x + 1] = src_yuy2[2];
1530     src_yuy2 += 4;
1531   }
1532   if (width & 1) {
1533     dst_y[width - 1] = src_yuy2[0];
1534   }
1535 }
1536
1537 // Filter 2 rows of UYVY UV's (422) into U and V (420).
1538 void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy,
1539                    uint8* dst_u, uint8* dst_v, int width) {
1540   // Output a row of UV values.
1541   for (int x = 0; x < width; x += 2) {
1542     dst_u[0] = (src_uyvy[0] + src_uyvy[src_stride_uyvy + 0] + 1) >> 1;
1543     dst_v[0] = (src_uyvy[2] + src_uyvy[src_stride_uyvy + 2] + 1) >> 1;
1544     src_uyvy += 4;
1545     dst_u += 1;
1546     dst_v += 1;
1547   }
1548 }
1549
1550 // Copy row of UYVY UV's (422) into U and V (422).
1551 void UYVYToUV422Row_C(const uint8* src_uyvy,
1552                       uint8* dst_u, uint8* dst_v, int width) {
1553   // Output a row of UV values.
1554   for (int x = 0; x < width; x += 2) {
1555     dst_u[0] = src_uyvy[0];
1556     dst_v[0] = src_uyvy[2];
1557     src_uyvy += 4;
1558     dst_u += 1;
1559     dst_v += 1;
1560   }
1561 }
1562
1563 // Copy row of UYVY Y's (422) into Y (420/422).
1564 void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width) {
1565   // Output a row of Y values.
1566   for (int x = 0; x < width - 1; x += 2) {
1567     dst_y[x] = src_uyvy[1];
1568     dst_y[x + 1] = src_uyvy[3];
1569     src_uyvy += 4;
1570   }
1571   if (width & 1) {
1572     dst_y[width - 1] = src_uyvy[1];
1573   }
1574 }
1575
1576 #define BLEND(f, b, a) (((256 - a) * b) >> 8) + f
1577
1578 // Blend src_argb0 over src_argb1 and store to dst_argb.
1579 // dst_argb may be src_argb0 or src_argb1.
1580 // This code mimics the SSSE3 version for better testability.
1581 void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1,
1582                     uint8* dst_argb, int width) {
1583   for (int x = 0; x < width - 1; x += 2) {
1584     uint32 fb = src_argb0[0];
1585     uint32 fg = src_argb0[1];
1586     uint32 fr = src_argb0[2];
1587     uint32 a = src_argb0[3];
1588     uint32 bb = src_argb1[0];
1589     uint32 bg = src_argb1[1];
1590     uint32 br = src_argb1[2];
1591     dst_argb[0] = BLEND(fb, bb, a);
1592     dst_argb[1] = BLEND(fg, bg, a);
1593     dst_argb[2] = BLEND(fr, br, a);
1594     dst_argb[3] = 255u;
1595
1596     fb = src_argb0[4 + 0];
1597     fg = src_argb0[4 + 1];
1598     fr = src_argb0[4 + 2];
1599     a = src_argb0[4 + 3];
1600     bb = src_argb1[4 + 0];
1601     bg = src_argb1[4 + 1];
1602     br = src_argb1[4 + 2];
1603     dst_argb[4 + 0] = BLEND(fb, bb, a);
1604     dst_argb[4 + 1] = BLEND(fg, bg, a);
1605     dst_argb[4 + 2] = BLEND(fr, br, a);
1606     dst_argb[4 + 3] = 255u;
1607     src_argb0 += 8;
1608     src_argb1 += 8;
1609     dst_argb += 8;
1610   }
1611
1612   if (width & 1) {
1613     uint32 fb = src_argb0[0];
1614     uint32 fg = src_argb0[1];
1615     uint32 fr = src_argb0[2];
1616     uint32 a = src_argb0[3];
1617     uint32 bb = src_argb1[0];
1618     uint32 bg = src_argb1[1];
1619     uint32 br = src_argb1[2];
1620     dst_argb[0] = BLEND(fb, bb, a);
1621     dst_argb[1] = BLEND(fg, bg, a);
1622     dst_argb[2] = BLEND(fr, br, a);
1623     dst_argb[3] = 255u;
1624   }
1625 }
1626 #undef BLEND
1627 #define ATTENUATE(f, a) (a | (a << 8)) * (f | (f << 8)) >> 24
1628
1629 // Multiply source RGB by alpha and store to destination.
1630 // This code mimics the SSSE3 version for better testability.
1631 void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
1632   for (int i = 0; i < width - 1; i += 2) {
1633     uint32 b = src_argb[0];
1634     uint32 g = src_argb[1];
1635     uint32 r = src_argb[2];
1636     uint32 a = src_argb[3];
1637     dst_argb[0] = ATTENUATE(b, a);
1638     dst_argb[1] = ATTENUATE(g, a);
1639     dst_argb[2] = ATTENUATE(r, a);
1640     dst_argb[3] = a;
1641     b = src_argb[4];
1642     g = src_argb[5];
1643     r = src_argb[6];
1644     a = src_argb[7];
1645     dst_argb[4] = ATTENUATE(b, a);
1646     dst_argb[5] = ATTENUATE(g, a);
1647     dst_argb[6] = ATTENUATE(r, a);
1648     dst_argb[7] = a;
1649     src_argb += 8;
1650     dst_argb += 8;
1651   }
1652
1653   if (width & 1) {
1654     const uint32 b = src_argb[0];
1655     const uint32 g = src_argb[1];
1656     const uint32 r = src_argb[2];
1657     const uint32 a = src_argb[3];
1658     dst_argb[0] = ATTENUATE(b, a);
1659     dst_argb[1] = ATTENUATE(g, a);
1660     dst_argb[2] = ATTENUATE(r, a);
1661     dst_argb[3] = a;
1662   }
1663 }
1664 #undef ATTENUATE
1665
1666 // Divide source RGB by alpha and store to destination.
1667 // b = (b * 255 + (a / 2)) / a;
1668 // g = (g * 255 + (a / 2)) / a;
1669 // r = (r * 255 + (a / 2)) / a;
1670 // Reciprocal method is off by 1 on some values. ie 125
1671 // 8.8 fixed point inverse table with 1.0 in upper short and 1 / a in lower.
1672 #define T(a) 0x01000000 + (0x10000 / a)
1673 uint32 fixed_invtbl8[256] = {
1674   0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07),
1675   T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f),
1676   T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17),
1677   T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f),
1678   T(0x20), T(0x21), T(0x22), T(0x23), T(0x24), T(0x25), T(0x26), T(0x27),
1679   T(0x28), T(0x29), T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f),
1680   T(0x30), T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37),
1681   T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), T(0x3f),
1682   T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), T(0x46), T(0x47),
1683   T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), T(0x4d), T(0x4e), T(0x4f),
1684   T(0x50), T(0x51), T(0x52), T(0x53), T(0x54), T(0x55), T(0x56), T(0x57),
1685   T(0x58), T(0x59), T(0x5a), T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f),
1686   T(0x60), T(0x61), T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67),
1687   T(0x68), T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f),
1688   T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), T(0x77),
1689   T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), T(0x7e), T(0x7f),
1690   T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), T(0x85), T(0x86), T(0x87),
1691   T(0x88), T(0x89), T(0x8a), T(0x8b), T(0x8c), T(0x8d), T(0x8e), T(0x8f),
1692   T(0x90), T(0x91), T(0x92), T(0x93), T(0x94), T(0x95), T(0x96), T(0x97),
1693   T(0x98), T(0x99), T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f),
1694   T(0xa0), T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7),
1695   T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), T(0xaf),
1696   T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), T(0xb6), T(0xb7),
1697   T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), T(0xbd), T(0xbe), T(0xbf),
1698   T(0xc0), T(0xc1), T(0xc2), T(0xc3), T(0xc4), T(0xc5), T(0xc6), T(0xc7),
1699   T(0xc8), T(0xc9), T(0xca), T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf),
1700   T(0xd0), T(0xd1), T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7),
1701   T(0xd8), T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf),
1702   T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), T(0xe7),
1703   T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), T(0xee), T(0xef),
1704   T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), T(0xf5), T(0xf6), T(0xf7),
1705   T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), 0x01000100 };
1706 #undef T
1707
1708 void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
1709   for (int i = 0; i < width; ++i) {
1710     uint32 b = src_argb[0];
1711     uint32 g = src_argb[1];
1712     uint32 r = src_argb[2];
1713     const uint32 a = src_argb[3];
1714     const uint32 ia = fixed_invtbl8[a] & 0xffff;  // 8.8 fixed point
1715     b = (b * ia) >> 8;
1716     g = (g * ia) >> 8;
1717     r = (r * ia) >> 8;
1718     // Clamping should not be necessary but is free in assembly.
1719     dst_argb[0] = clamp255(b);
1720     dst_argb[1] = clamp255(g);
1721     dst_argb[2] = clamp255(r);
1722     dst_argb[3] = a;
1723     src_argb += 4;
1724     dst_argb += 4;
1725   }
1726 }
1727
1728 void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
1729                                const int32* previous_cumsum, int width) {
1730   int32 row_sum[4] = {0, 0, 0, 0};
1731   for (int x = 0; x < width; ++x) {
1732     row_sum[0] += row[x * 4 + 0];
1733     row_sum[1] += row[x * 4 + 1];
1734     row_sum[2] += row[x * 4 + 2];
1735     row_sum[3] += row[x * 4 + 3];
1736     cumsum[x * 4 + 0] = row_sum[0]  + previous_cumsum[x * 4 + 0];
1737     cumsum[x * 4 + 1] = row_sum[1]  + previous_cumsum[x * 4 + 1];
1738     cumsum[x * 4 + 2] = row_sum[2]  + previous_cumsum[x * 4 + 2];
1739     cumsum[x * 4 + 3] = row_sum[3]  + previous_cumsum[x * 4 + 3];
1740   }
1741 }
1742
1743 void CumulativeSumToAverageRow_C(const int32* tl, const int32* bl,
1744                                 int w, int area, uint8* dst, int count) {
1745   float ooa = 1.0f / area;
1746   for (int i = 0; i < count; ++i) {
1747     dst[0] = static_cast<uint8>((bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * ooa);
1748     dst[1] = static_cast<uint8>((bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * ooa);
1749     dst[2] = static_cast<uint8>((bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * ooa);
1750     dst[3] = static_cast<uint8>((bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * ooa);
1751     dst += 4;
1752     tl += 4;
1753     bl += 4;
1754   }
1755 }
1756
1757 // Copy pixels from rotated source to destination row with a slope.
1758 LIBYUV_API
1759 void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
1760                      uint8* dst_argb, const float* uv_dudv, int width) {
1761   // Render a row of pixels from source into a buffer.
1762   float uv[2];
1763   uv[0] = uv_dudv[0];
1764   uv[1] = uv_dudv[1];
1765   for (int i = 0; i < width; ++i) {
1766     int x = static_cast<int>(uv[0]);
1767     int y = static_cast<int>(uv[1]);
1768     *reinterpret_cast<uint32*>(dst_argb) =
1769         *reinterpret_cast<const uint32*>(src_argb + y * src_argb_stride +
1770                                          x * 4);
1771     dst_argb += 4;
1772     uv[0] += uv_dudv[2];
1773     uv[1] += uv_dudv[3];
1774   }
1775 }
1776
1777 // C version 2x2 -> 2x1.
1778 void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
1779                       ptrdiff_t src_stride,
1780                       int width, int source_y_fraction) {
1781   int y1_fraction = source_y_fraction;
1782   int y0_fraction = 256 - y1_fraction;
1783   const uint8* src_ptr1 = src_ptr + src_stride;
1784
1785   for (int x = 0; x < width - 1; x += 2) {
1786     dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
1787     dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
1788     src_ptr += 2;
1789     src_ptr1 += 2;
1790     dst_ptr += 2;
1791   }
1792   if (width & 1) {
1793     dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
1794   }
1795 }
1796
1797 // Blend 2 rows into 1 for conversions such as I422ToI420.
1798 void HalfRow_C(const uint8* src_uv, int src_uv_stride,
1799                uint8* dst_uv, int pix) {
1800   for (int x = 0; x < pix; ++x) {
1801     dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
1802   }
1803 }
1804
1805 // Select 2 channels from ARGB on alternating pixels.  e.g.  BGBGBGBG
1806 void ARGBToBayerRow_C(const uint8* src_argb,
1807                       uint8* dst_bayer, uint32 selector, int pix) {
1808   int index0 = selector & 0xff;
1809   int index1 = (selector >> 8) & 0xff;
1810   // Copy a row of Bayer.
1811   for (int x = 0; x < pix - 1; x += 2) {
1812     dst_bayer[0] = src_argb[index0];
1813     dst_bayer[1] = src_argb[index1];
1814     src_argb += 8;
1815     dst_bayer += 2;
1816   }
1817   if (pix & 1) {
1818     dst_bayer[0] = src_argb[index0];
1819   }
1820 }
1821
1822 // Use first 4 shuffler values to reorder ARGB channels.
1823 void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,
1824                       const uint8* shuffler, int pix) {
1825   int index0 = shuffler[0];
1826   int index1 = shuffler[1];
1827   int index2 = shuffler[2];
1828   int index3 = shuffler[3];
1829   // Shuffle a row of ARGB.
1830   for (int x = 0; x < pix; ++x) {
1831     // To support in-place conversion.
1832     uint8 b = src_argb[index0];
1833     uint8 g = src_argb[index1];
1834     uint8 r = src_argb[index2];
1835     uint8 a = src_argb[index3];
1836     dst_argb[0] = b;
1837     dst_argb[1] = g;
1838     dst_argb[2] = r;
1839     dst_argb[3] = a;
1840     src_argb += 4;
1841     dst_argb += 4;
1842   }
1843 }
1844
1845 void I422ToYUY2Row_C(const uint8* src_y,
1846                      const uint8* src_u,
1847                      const uint8* src_v,
1848                      uint8* dst_frame, int width) {
1849     for (int x = 0; x < width - 1; x += 2) {
1850       dst_frame[0] = src_y[0];
1851       dst_frame[1] = src_u[0];
1852       dst_frame[2] = src_y[1];
1853       dst_frame[3] = src_v[0];
1854       dst_frame += 4;
1855       src_y += 2;
1856       src_u += 1;
1857       src_v += 1;
1858     }
1859     if (width & 1) {
1860       dst_frame[0] = src_y[0];
1861       dst_frame[1] = src_u[0];
1862       dst_frame[2] = src_y[0];  // duplicate last y
1863       dst_frame[3] = src_v[0];
1864     }
1865 }
1866
1867 void I422ToUYVYRow_C(const uint8* src_y,
1868                      const uint8* src_u,
1869                      const uint8* src_v,
1870                      uint8* dst_frame, int width) {
1871     for (int x = 0; x < width - 1; x += 2) {
1872       dst_frame[0] = src_u[0];
1873       dst_frame[1] = src_y[0];
1874       dst_frame[2] = src_v[0];
1875       dst_frame[3] = src_y[1];
1876       dst_frame += 4;
1877       src_y += 2;
1878       src_u += 1;
1879       src_v += 1;
1880     }
1881     if (width & 1) {
1882       dst_frame[0] = src_u[0];
1883       dst_frame[1] = src_y[0];
1884       dst_frame[2] = src_v[0];
1885       dst_frame[3] = src_y[0];  // duplicate last y
1886     }
1887 }
1888
1889 #if !defined(LIBYUV_DISABLE_X86)
1890 // row_win.cc has asm version, but GCC uses 2 step wrapper.  5% slower.
1891 // TODO(fbarchard): Handle width > kMaxStride here instead of calling code.
1892 #if defined(__x86_64__) || defined(__i386__)
1893 void I422ToRGB565Row_SSSE3(const uint8* src_y,
1894                            const uint8* src_u,
1895                            const uint8* src_v,
1896                            uint8* rgb_buf,
1897                            int width) {
1898   SIMD_ALIGNED(uint8 row[kMaxStride]);
1899   I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width);
1900   ARGBToRGB565Row_SSE2(row, rgb_buf, width);
1901 }
1902 #endif  // defined(__x86_64__) || defined(__i386__)
1903
1904 #if defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)
1905 void I422ToARGB1555Row_SSSE3(const uint8* src_y,
1906                              const uint8* src_u,
1907                              const uint8* src_v,
1908                              uint8* rgb_buf,
1909                              int width) {
1910   SIMD_ALIGNED(uint8 row[kMaxStride]);
1911   I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width);
1912   ARGBToARGB1555Row_SSE2(row, rgb_buf, width);
1913 }
1914
1915 void I422ToARGB4444Row_SSSE3(const uint8* src_y,
1916                              const uint8* src_u,
1917                              const uint8* src_v,
1918                              uint8* rgb_buf,
1919                              int width) {
1920   SIMD_ALIGNED(uint8 row[kMaxStride]);
1921   I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width);
1922   ARGBToARGB4444Row_SSE2(row, rgb_buf, width);
1923 }
1924
1925 void NV12ToRGB565Row_SSSE3(const uint8* src_y,
1926                            const uint8* src_uv,
1927                            uint8* dst_rgb565,
1928                            int width) {
1929   SIMD_ALIGNED(uint8 row[kMaxStride]);
1930   NV12ToARGBRow_SSSE3(src_y, src_uv, row, width);
1931   ARGBToRGB565Row_SSE2(row, dst_rgb565, width);
1932 }
1933
1934 void NV21ToRGB565Row_SSSE3(const uint8* src_y,
1935                            const uint8* src_vu,
1936                            uint8* dst_rgb565,
1937                            int width) {
1938   SIMD_ALIGNED(uint8 row[kMaxStride]);
1939   NV21ToARGBRow_SSSE3(src_y, src_vu, row, width);
1940   ARGBToRGB565Row_SSE2(row, dst_rgb565, width);
1941 }
1942
1943 void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
1944                          uint8* dst_argb,
1945                          int width) {
1946   SIMD_ALIGNED(uint8 row_y[kMaxStride]);
1947   SIMD_ALIGNED(uint8 row_u[kMaxStride / 2]);
1948   SIMD_ALIGNED(uint8 row_v[kMaxStride / 2]);
1949   YUY2ToUV422Row_SSE2(src_yuy2, row_u, row_v, width);
1950   YUY2ToYRow_SSE2(src_yuy2, row_y, width);
1951   I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, width);
1952 }
1953
1954 void YUY2ToARGBRow_Unaligned_SSSE3(const uint8* src_yuy2,
1955                                    uint8* dst_argb,
1956                                    int width) {
1957   SIMD_ALIGNED(uint8 row_y[kMaxStride]);
1958   SIMD_ALIGNED(uint8 row_u[kMaxStride / 2]);
1959   SIMD_ALIGNED(uint8 row_v[kMaxStride / 2]);
1960   YUY2ToUV422Row_Unaligned_SSE2(src_yuy2, row_u, row_v, width);
1961   YUY2ToYRow_Unaligned_SSE2(src_yuy2, row_y, width);
1962   I422ToARGBRow_Unaligned_SSSE3(row_y, row_u, row_v, dst_argb, width);
1963 }
1964
1965 void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
1966                          uint8* dst_argb,
1967                          int width) {
1968   SIMD_ALIGNED(uint8 row_y[kMaxStride]);
1969   SIMD_ALIGNED(uint8 row_u[kMaxStride / 2]);
1970   SIMD_ALIGNED(uint8 row_v[kMaxStride / 2]);
1971   UYVYToUV422Row_SSE2(src_uyvy, row_u, row_v, width);
1972   UYVYToYRow_SSE2(src_uyvy, row_y, width);
1973   I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, width);
1974 }
1975
1976 void UYVYToARGBRow_Unaligned_SSSE3(const uint8* src_uyvy,
1977                                    uint8* dst_argb,
1978                                    int width) {
1979   SIMD_ALIGNED(uint8 row_y[kMaxStride]);
1980   SIMD_ALIGNED(uint8 row_u[kMaxStride / 2]);
1981   SIMD_ALIGNED(uint8 row_v[kMaxStride / 2]);
1982   UYVYToUV422Row_Unaligned_SSE2(src_uyvy, row_u, row_v, width);
1983   UYVYToYRow_Unaligned_SSE2(src_uyvy, row_y, width);
1984   I422ToARGBRow_Unaligned_SSSE3(row_y, row_u, row_v, dst_argb, width);
1985 }
1986
1987 #endif  // defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)
1988 #endif  // !defined(LIBYUV_DISABLE_X86)
1989 #undef clamp0
1990 #undef clamp255
1991
1992 #ifdef __cplusplus
1993 }  // extern "C"
1994 }  // namespace libyuv
1995 #endif