src/third_party/libyuv/source/row_common.cc

   1 /*
   2  *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
   3  *
   4  *  Use of this source code is governed by a BSD-style license
   5  *  that can be found in the LICENSE file in the root of the source
   6  *  tree. An additional intellectual property rights grant can be found
   7  *  in the file PATENTS. All contributing project authors may
   8  *  be found in the AUTHORS file in the root of the source tree.
   9  */
  10
  11 #include "libyuv/row.h"
  12
  13 #include <string.h>  // For memcpy and memset.
  14
  15 #include "libyuv/basic_types.h"
  16
  17 #ifdef __cplusplus
  18 namespace libyuv {
  19 extern "C" {
  20 #endif
  21
  22 // llvm x86 is poor at ternary operator, so use branchless min/max.
  23
  24 #define USE_BRANCHLESS 1
  25 #if USE_BRANCHLESS
  26 static __inline int32 clamp0(int32 v) {
  27   return ((-(v) >> 31) & (v));
  28 }
  29
  30 static __inline int32 clamp255(int32 v) {
  31   return (((255 - (v)) >> 31) | (v)) & 255;
  32 }
  33
  34 static __inline uint32 Clamp(int32 val) {
  35   int v = clamp0(val);
  36   return (uint32)(clamp255(v));
  37 }
  38
  39 static __inline uint32 Abs(int32 v) {
  40   int m = v >> 31;
  41   return (v + m) ^ m;
  42 }
  43 #else  // USE_BRANCHLESS
  44 static __inline int32 clamp0(int32 v) {
  45   return (v < 0) ? 0 : v;
  46 }
  47
  48 static __inline int32 clamp255(int32 v) {
  49   return (v > 255) ? 255 : v;
  50 }
  51
  52 static __inline uint32 Clamp(int32 val) {
  53   int v = clamp0(val);
  54   return (uint32)(clamp255(v));
  55 }
  56
  57 static __inline uint32 Abs(int32 v) {
  58   return (v < 0) ? -v : v;
  59 }
  60 #endif  // USE_BRANCHLESS
  61
  62 #ifdef LIBYUV_LITTLE_ENDIAN
  63 #define WRITEWORD(p, v) *(uint32*)(p) = v
  64 #else
  65 static inline void WRITEWORD(uint8* p, uint32 v) {
  66   p[0] = (uint8)(v & 255);
  67   p[1] = (uint8)((v >> 8) & 255);
  68   p[2] = (uint8)((v >> 16) & 255);
  69   p[3] = (uint8)((v >> 24) & 255);
  70 }
  71 #endif
  72
  73 void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width) {
  74   int x;
  75   for (x = 0; x < width; ++x) {
  76     uint8 b = src_rgb24[0];
  77     uint8 g = src_rgb24[1];
  78     uint8 r = src_rgb24[2];
  79     dst_argb[0] = b;
  80     dst_argb[1] = g;
  81     dst_argb[2] = r;
  82     dst_argb[3] = 255u;
  83     dst_argb += 4;
  84     src_rgb24 += 3;
  85   }
  86 }
  87
  88 void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width) {
  89   int x;
  90   for (x = 0; x < width; ++x) {
  91     uint8 r = src_raw[0];
  92     uint8 g = src_raw[1];
  93     uint8 b = src_raw[2];
  94     dst_argb[0] = b;
  95     dst_argb[1] = g;
  96     dst_argb[2] = r;
  97     dst_argb[3] = 255u;
  98     dst_argb += 4;
  99     src_raw += 3;
 100   }
 101 }
 102
 103 void RGB565ToARGBRow_C(const uint8* src_rgb565, uint8* dst_argb, int width) {
 104   int x;
 105   for (x = 0; x < width; ++x) {
 106     uint8 b = src_rgb565[0] & 0x1f;
 107     uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
 108     uint8 r = src_rgb565[1] >> 3;
 109     dst_argb[0] = (b << 3) | (b >> 2);
 110     dst_argb[1] = (g << 2) | (g >> 4);
 111     dst_argb[2] = (r << 3) | (r >> 2);
 112     dst_argb[3] = 255u;
 113     dst_argb += 4;
 114     src_rgb565 += 2;
 115   }
 116 }
 117
 118 void ARGB1555ToARGBRow_C(const uint8* src_argb1555, uint8* dst_argb,
 119                          int width) {
 120   int x;
 121   for (x = 0; x < width; ++x) {
 122     uint8 b = src_argb1555[0] & 0x1f;
 123     uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
 124     uint8 r = (src_argb1555[1] & 0x7c) >> 2;
 125     uint8 a = src_argb1555[1] >> 7;
 126     dst_argb[0] = (b << 3) | (b >> 2);
 127     dst_argb[1] = (g << 3) | (g >> 2);
 128     dst_argb[2] = (r << 3) | (r >> 2);
 129     dst_argb[3] = -a;
 130     dst_argb += 4;
 131     src_argb1555 += 2;
 132   }
 133 }
 134
 135 void ARGB4444ToARGBRow_C(const uint8* src_argb4444, uint8* dst_argb,
 136                          int width) {
 137   int x;
 138   for (x = 0; x < width; ++x) {
 139     uint8 b = src_argb4444[0] & 0x0f;
 140     uint8 g = src_argb4444[0] >> 4;
 141     uint8 r = src_argb4444[1] & 0x0f;
 142     uint8 a = src_argb4444[1] >> 4;
 143     dst_argb[0] = (b << 4) | b;
 144     dst_argb[1] = (g << 4) | g;
 145     dst_argb[2] = (r << 4) | r;
 146     dst_argb[3] = (a << 4) | a;
 147     dst_argb += 4;
 148     src_argb4444 += 2;
 149   }
 150 }
 151
 152 void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
 153   int x;
 154   for (x = 0; x < width; ++x) {
 155     uint8 b = src_argb[0];
 156     uint8 g = src_argb[1];
 157     uint8 r = src_argb[2];
 158     dst_rgb[0] = b;
 159     dst_rgb[1] = g;
 160     dst_rgb[2] = r;
 161     dst_rgb += 3;
 162     src_argb += 4;
 163   }
 164 }
 165
 166 void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width) {
 167   int x;
 168   for (x = 0; x < width; ++x) {
 169     uint8 b = src_argb[0];
 170     uint8 g = src_argb[1];
 171     uint8 r = src_argb[2];
 172     dst_rgb[0] = r;
 173     dst_rgb[1] = g;
 174     dst_rgb[2] = b;
 175     dst_rgb += 3;
 176     src_argb += 4;
 177   }
 178 }
 179
 180 void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
 181   int x;
 182   for (x = 0; x < width - 1; x += 2) {
 183     uint8 b0 = src_argb[0] >> 3;
 184     uint8 g0 = src_argb[1] >> 2;
 185     uint8 r0 = src_argb[2] >> 3;
 186     uint8 b1 = src_argb[4] >> 3;
 187     uint8 g1 = src_argb[5] >> 2;
 188     uint8 r1 = src_argb[6] >> 3;
 189     WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) |
 190               (b1 << 16) | (g1 << 21) | (r1 << 27));
 191     dst_rgb += 4;
 192     src_argb += 8;
 193   }
 194   if (width & 1) {
 195     uint8 b0 = src_argb[0] >> 3;
 196     uint8 g0 = src_argb[1] >> 2;
 197     uint8 r0 = src_argb[2] >> 3;
 198     *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
 199   }
 200 }
 201
 202 void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
 203   int x;
 204   for (x = 0; x < width - 1; x += 2) {
 205     uint8 b0 = src_argb[0] >> 3;
 206     uint8 g0 = src_argb[1] >> 3;
 207     uint8 r0 = src_argb[2] >> 3;
 208     uint8 a0 = src_argb[3] >> 7;
 209     uint8 b1 = src_argb[4] >> 3;
 210     uint8 g1 = src_argb[5] >> 3;
 211     uint8 r1 = src_argb[6] >> 3;
 212     uint8 a1 = src_argb[7] >> 7;
 213     *(uint32*)(dst_rgb) =
 214         b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) |
 215         (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31);
 216     dst_rgb += 4;
 217     src_argb += 8;
 218   }
 219   if (width & 1) {
 220     uint8 b0 = src_argb[0] >> 3;
 221     uint8 g0 = src_argb[1] >> 3;
 222     uint8 r0 = src_argb[2] >> 3;
 223     uint8 a0 = src_argb[3] >> 7;
 224     *(uint16*)(dst_rgb) =
 225         b0 | (g0 << 5) | (r0 << 10) | (a0 << 15);
 226   }
 227 }
 228
 229 void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
 230   int x;
 231   for (x = 0; x < width - 1; x += 2) {
 232     uint8 b0 = src_argb[0] >> 4;
 233     uint8 g0 = src_argb[1] >> 4;
 234     uint8 r0 = src_argb[2] >> 4;
 235     uint8 a0 = src_argb[3] >> 4;
 236     uint8 b1 = src_argb[4] >> 4;
 237     uint8 g1 = src_argb[5] >> 4;
 238     uint8 r1 = src_argb[6] >> 4;
 239     uint8 a1 = src_argb[7] >> 4;
 240     *(uint32*)(dst_rgb) =
 241         b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) |
 242         (b1 << 16) | (g1 << 20) | (r1 << 24) | (a1 << 28);
 243     dst_rgb += 4;
 244     src_argb += 8;
 245   }
 246   if (width & 1) {
 247     uint8 b0 = src_argb[0] >> 4;
 248     uint8 g0 = src_argb[1] >> 4;
 249     uint8 r0 = src_argb[2] >> 4;
 250     uint8 a0 = src_argb[3] >> 4;
 251     *(uint16*)(dst_rgb) =
 252         b0 | (g0 << 4) | (r0 << 8) | (a0 << 12);
 253   }
 254 }
 255
 256 static __inline int RGBToY(uint8 r, uint8 g, uint8 b) {
 257   return (66 * r + 129 * g +  25 * b + 0x1080) >> 8;
 258 }
 259
 260 static __inline int RGBToU(uint8 r, uint8 g, uint8 b) {
 261   return (112 * b - 74 * g - 38 * r + 0x8080) >> 8;
 262 }
 263 static __inline int RGBToV(uint8 r, uint8 g, uint8 b) {
 264   return (112 * r - 94 * g - 18 * b + 0x8080) >> 8;
 265 }
 266
 267 #define MAKEROWY(NAME, R, G, B, BPP) \
 268 void NAME ## ToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) {       \
 269   int x;                                                                       \
 270   for (x = 0; x < width; ++x) {                                                \
 271     dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]);               \
 272     src_argb0 += BPP;                                                          \
 273     dst_y += 1;                                                                \
 274   }                                                                            \
 275 }                                                                              \
 276 void NAME ## ToUVRow_C(const uint8* src_rgb0, int src_stride_rgb,              \
 277                        uint8* dst_u, uint8* dst_v, int width) {                \
 278   const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;                           \
 279   int x;                                                                       \
 280   for (x = 0; x < width - 1; x += 2) {                                         \
 281     uint8 ab = (src_rgb0[B] + src_rgb0[B + BPP] +                              \
 282                src_rgb1[B] + src_rgb1[B + BPP]) >> 2;                          \
 283     uint8 ag = (src_rgb0[G] + src_rgb0[G + BPP] +                              \
 284                src_rgb1[G] + src_rgb1[G + BPP]) >> 2;                          \
 285     uint8 ar = (src_rgb0[R] + src_rgb0[R + BPP] +                              \
 286                src_rgb1[R] + src_rgb1[R + BPP]) >> 2;                          \
 287     dst_u[0] = RGBToU(ar, ag, ab);                                             \
 288     dst_v[0] = RGBToV(ar, ag, ab);                                             \
 289     src_rgb0 += BPP * 2;                                                       \
 290     src_rgb1 += BPP * 2;                                                       \
 291     dst_u += 1;                                                                \
 292     dst_v += 1;                                                                \
 293   }                                                                            \
 294   if (width & 1) {                                                             \
 295     uint8 ab = (src_rgb0[B] + src_rgb1[B]) >> 1;                               \
 296     uint8 ag = (src_rgb0[G] + src_rgb1[G]) >> 1;                               \
 297     uint8 ar = (src_rgb0[R] + src_rgb1[R]) >> 1;                               \
 298     dst_u[0] = RGBToU(ar, ag, ab);                                             \
 299     dst_v[0] = RGBToV(ar, ag, ab);                                             \
 300   }                                                                            \
 301 }
 302
 303 MAKEROWY(ARGB, 2, 1, 0, 4)
 304 MAKEROWY(BGRA, 1, 2, 3, 4)
 305 MAKEROWY(ABGR, 0, 1, 2, 4)
 306 MAKEROWY(RGBA, 3, 2, 1, 4)
 307 MAKEROWY(RGB24, 2, 1, 0, 3)
 308 MAKEROWY(RAW, 0, 1, 2, 3)
 309 #undef MAKEROWY
 310
 311 // JPeg uses a variation on BT.601-1 full range
 312 // y =  0.29900 * r + 0.58700 * g + 0.11400 * b
 313 // u = -0.16874 * r - 0.33126 * g + 0.50000 * b  + center
 314 // v =  0.50000 * r - 0.41869 * g - 0.08131 * b  + center
 315 // BT.601 Mpeg range uses:
 316 // b 0.1016 * 255 = 25.908 = 25
 317 // g 0.5078 * 255 = 129.489 = 129
 318 // r 0.2578 * 255 = 65.739 = 66
 319 // JPeg 8 bit Y (not used):
 320 // b 0.11400 * 256 = 29.184 = 29
 321 // g 0.58700 * 256 = 150.272 = 150
 322 // r 0.29900 * 256 = 76.544 = 77
 323 // JPeg 7 bit Y:
 324 // b 0.11400 * 128 = 14.592 = 15
 325 // g 0.58700 * 128 = 75.136 = 75
 326 // r 0.29900 * 128 = 38.272 = 38
 327 // JPeg 8 bit U:
 328 // b  0.50000 * 255 = 127.5 = 127
 329 // g -0.33126 * 255 = -84.4713 = -84
 330 // r -0.16874 * 255 = -43.0287 = -43
 331 // JPeg 8 bit V:
 332 // b -0.08131 * 255 = -20.73405 = -20
 333 // g -0.41869 * 255 = -106.76595 = -107
 334 // r  0.50000 * 255 = 127.5 = 127
 335
 336 static __inline int RGBToYJ(uint8 r, uint8 g, uint8 b) {
 337   return (38 * r + 75 * g +  15 * b + 64) >> 7;
 338 }
 339
 340 static __inline int RGBToUJ(uint8 r, uint8 g, uint8 b) {
 341   return (127 * b - 84 * g - 43 * r + 0x8080) >> 8;
 342 }
 343 static __inline int RGBToVJ(uint8 r, uint8 g, uint8 b) {
 344   return (127 * r - 107 * g - 20 * b + 0x8080) >> 8;
 345 }
 346
 347 #define AVGB(a, b) (((a) + (b) + 1) >> 1)
 348
 349 #define MAKEROWYJ(NAME, R, G, B, BPP) \
 350 void NAME ## ToYJRow_C(const uint8* src_argb0, uint8* dst_y, int width) {      \
 351   int x;                                                                       \
 352   for (x = 0; x < width; ++x) {                                                \
 353     dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]);              \
 354     src_argb0 += BPP;                                                          \
 355     dst_y += 1;                                                                \
 356   }                                                                            \
 357 }                                                                              \
 358 void NAME ## ToUVJRow_C(const uint8* src_rgb0, int src_stride_rgb,             \
 359                         uint8* dst_u, uint8* dst_v, int width) {               \
 360   const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;                           \
 361   int x;                                                                       \
 362   for (x = 0; x < width - 1; x += 2) {                                         \
 363     uint8 ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]),                            \
 364                     AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP]));               \
 365     uint8 ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]),                            \
 366                     AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP]));               \
 367     uint8 ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]),                            \
 368                     AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP]));               \
 369     dst_u[0] = RGBToUJ(ar, ag, ab);                                            \
 370     dst_v[0] = RGBToVJ(ar, ag, ab);                                            \
 371     src_rgb0 += BPP * 2;                                                       \
 372     src_rgb1 += BPP * 2;                                                       \
 373     dst_u += 1;                                                                \
 374     dst_v += 1;                                                                \
 375   }                                                                            \
 376   if (width & 1) {                                                             \
 377     uint8 ab = AVGB(src_rgb0[B], src_rgb1[B]);                                 \
 378     uint8 ag = AVGB(src_rgb0[G], src_rgb1[G]);                                 \
 379     uint8 ar = AVGB(src_rgb0[R], src_rgb1[R]);                                 \
 380     dst_u[0] = RGBToUJ(ar, ag, ab);                                            \
 381     dst_v[0] = RGBToVJ(ar, ag, ab);                                            \
 382   }                                                                            \
 383 }
 384
 385 MAKEROWYJ(ARGB, 2, 1, 0, 4)
 386 #undef MAKEROWYJ
 387
 388 void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width) {
 389   int x;
 390   for (x = 0; x < width; ++x) {
 391     uint8 b = src_rgb565[0] & 0x1f;
 392     uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
 393     uint8 r = src_rgb565[1] >> 3;
 394     b = (b << 3) | (b >> 2);
 395     g = (g << 2) | (g >> 4);
 396     r = (r << 3) | (r >> 2);
 397     dst_y[0] = RGBToY(r, g, b);
 398     src_rgb565 += 2;
 399     dst_y += 1;
 400   }
 401 }
 402
 403 void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int width) {
 404   int x;
 405   for (x = 0; x < width; ++x) {
 406     uint8 b = src_argb1555[0] & 0x1f;
 407     uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
 408     uint8 r = (src_argb1555[1] & 0x7c) >> 2;
 409     b = (b << 3) | (b >> 2);
 410     g = (g << 3) | (g >> 2);
 411     r = (r << 3) | (r >> 2);
 412     dst_y[0] = RGBToY(r, g, b);
 413     src_argb1555 += 2;
 414     dst_y += 1;
 415   }
 416 }
 417
 418 void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int width) {
 419   int x;
 420   for (x = 0; x < width; ++x) {
 421     uint8 b = src_argb4444[0] & 0x0f;
 422     uint8 g = src_argb4444[0] >> 4;
 423     uint8 r = src_argb4444[1] & 0x0f;
 424     b = (b << 4) | b;
 425     g = (g << 4) | g;
 426     r = (r << 4) | r;
 427     dst_y[0] = RGBToY(r, g, b);
 428     src_argb4444 += 2;
 429     dst_y += 1;
 430   }
 431 }
 432
 433 void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565,
 434                      uint8* dst_u, uint8* dst_v, int width) {
 435   const uint8* next_rgb565 = src_rgb565 + src_stride_rgb565;
 436   int x;
 437   for (x = 0; x < width - 1; x += 2) {
 438     uint8 b0 = src_rgb565[0] & 0x1f;
 439     uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
 440     uint8 r0 = src_rgb565[1] >> 3;
 441     uint8 b1 = src_rgb565[2] & 0x1f;
 442     uint8 g1 = (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3);
 443     uint8 r1 = src_rgb565[3] >> 3;
 444     uint8 b2 = next_rgb565[0] & 0x1f;
 445     uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
 446     uint8 r2 = next_rgb565[1] >> 3;
 447     uint8 b3 = next_rgb565[2] & 0x1f;
 448     uint8 g3 = (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3);
 449     uint8 r3 = next_rgb565[3] >> 3;
 450     uint8 b = (b0 + b1 + b2 + b3);  // 565 * 4 = 787.
 451     uint8 g = (g0 + g1 + g2 + g3);
 452     uint8 r = (r0 + r1 + r2 + r3);
 453     b = (b << 1) | (b >> 6);  // 787 -> 888.
 454     r = (r << 1) | (r >> 6);
 455     dst_u[0] = RGBToU(r, g, b);
 456     dst_v[0] = RGBToV(r, g, b);
 457     src_rgb565 += 4;
 458     next_rgb565 += 4;
 459     dst_u += 1;
 460     dst_v += 1;
 461   }
 462   if (width & 1) {
 463     uint8 b0 = src_rgb565[0] & 0x1f;
 464     uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
 465     uint8 r0 = src_rgb565[1] >> 3;
 466     uint8 b2 = next_rgb565[0] & 0x1f;
 467     uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
 468     uint8 r2 = next_rgb565[1] >> 3;
 469     uint8 b = (b0 + b2);  // 565 * 2 = 676.
 470     uint8 g = (g0 + g2);
 471     uint8 r = (r0 + r2);
 472     b = (b << 2) | (b >> 4);  // 676 -> 888
 473     g = (g << 1) | (g >> 6);
 474     r = (r << 2) | (r >> 4);
 475     dst_u[0] = RGBToU(r, g, b);
 476     dst_v[0] = RGBToV(r, g, b);
 477   }
 478 }
 479
 480 void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555,
 481                        uint8* dst_u, uint8* dst_v, int width) {
 482   const uint8* next_argb1555 = src_argb1555 + src_stride_argb1555;
 483   int x;
 484   for (x = 0; x < width - 1; x += 2) {
 485     uint8 b0 = src_argb1555[0] & 0x1f;
 486     uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
 487     uint8 r0 = (src_argb1555[1] & 0x7c) >> 2;
 488     uint8 b1 = src_argb1555[2] & 0x1f;
 489     uint8 g1 = (src_argb1555[2] >> 5) | ((src_argb1555[3] & 0x03) << 3);
 490     uint8 r1 = (src_argb1555[3] & 0x7c) >> 2;
 491     uint8 b2 = next_argb1555[0] & 0x1f;
 492     uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
 493     uint8 r2 = (next_argb1555[1] & 0x7c) >> 2;
 494     uint8 b3 = next_argb1555[2] & 0x1f;
 495     uint8 g3 = (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3);
 496     uint8 r3 = (next_argb1555[3] & 0x7c) >> 2;
 497     uint8 b = (b0 + b1 + b2 + b3);  // 555 * 4 = 777.
 498     uint8 g = (g0 + g1 + g2 + g3);
 499     uint8 r = (r0 + r1 + r2 + r3);
 500     b = (b << 1) | (b >> 6);  // 777 -> 888.
 501     g = (g << 1) | (g >> 6);
 502     r = (r << 1) | (r >> 6);
 503     dst_u[0] = RGBToU(r, g, b);
 504     dst_v[0] = RGBToV(r, g, b);
 505     src_argb1555 += 4;
 506     next_argb1555 += 4;
 507     dst_u += 1;
 508     dst_v += 1;
 509   }
 510   if (width & 1) {
 511     uint8 b0 = src_argb1555[0] & 0x1f;
 512     uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
 513     uint8 r0 = (src_argb1555[1] & 0x7c) >> 2;
 514     uint8 b2 = next_argb1555[0] & 0x1f;
 515     uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
 516     uint8 r2 = next_argb1555[1] >> 3;
 517     uint8 b = (b0 + b2);  // 555 * 2 = 666.
 518     uint8 g = (g0 + g2);
 519     uint8 r = (r0 + r2);
 520     b = (b << 2) | (b >> 4);  // 666 -> 888.
 521     g = (g << 2) | (g >> 4);
 522     r = (r << 2) | (r >> 4);
 523     dst_u[0] = RGBToU(r, g, b);
 524     dst_v[0] = RGBToV(r, g, b);
 525   }
 526 }
 527
 528 void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444,
 529                        uint8* dst_u, uint8* dst_v, int width) {
 530   const uint8* next_argb4444 = src_argb4444 + src_stride_argb4444;
 531   int x;
 532   for (x = 0; x < width - 1; x += 2) {
 533     uint8 b0 = src_argb4444[0] & 0x0f;
 534     uint8 g0 = src_argb4444[0] >> 4;
 535     uint8 r0 = src_argb4444[1] & 0x0f;
 536     uint8 b1 = src_argb4444[2] & 0x0f;
 537     uint8 g1 = src_argb4444[2] >> 4;
 538     uint8 r1 = src_argb4444[3] & 0x0f;
 539     uint8 b2 = next_argb4444[0] & 0x0f;
 540     uint8 g2 = next_argb4444[0] >> 4;
 541     uint8 r2 = next_argb4444[1] & 0x0f;
 542     uint8 b3 = next_argb4444[2] & 0x0f;
 543     uint8 g3 = next_argb4444[2] >> 4;
 544     uint8 r3 = next_argb4444[3] & 0x0f;
 545     uint8 b = (b0 + b1 + b2 + b3);  // 444 * 4 = 666.
 546     uint8 g = (g0 + g1 + g2 + g3);
 547     uint8 r = (r0 + r1 + r2 + r3);
 548     b = (b << 2) | (b >> 4);  // 666 -> 888.
 549     g = (g << 2) | (g >> 4);
 550     r = (r << 2) | (r >> 4);
 551     dst_u[0] = RGBToU(r, g, b);
 552     dst_v[0] = RGBToV(r, g, b);
 553     src_argb4444 += 4;
 554     next_argb4444 += 4;
 555     dst_u += 1;
 556     dst_v += 1;
 557   }
 558   if (width & 1) {
 559     uint8 b0 = src_argb4444[0] & 0x0f;
 560     uint8 g0 = src_argb4444[0] >> 4;
 561     uint8 r0 = src_argb4444[1] & 0x0f;
 562     uint8 b2 = next_argb4444[0] & 0x0f;
 563     uint8 g2 = next_argb4444[0] >> 4;
 564     uint8 r2 = next_argb4444[1] & 0x0f;
 565     uint8 b = (b0 + b2);  // 444 * 2 = 555.
 566     uint8 g = (g0 + g2);
 567     uint8 r = (r0 + r2);
 568     b = (b << 3) | (b >> 2);  // 555 -> 888.
 569     g = (g << 3) | (g >> 2);
 570     r = (r << 3) | (r >> 2);
 571     dst_u[0] = RGBToU(r, g, b);
 572     dst_v[0] = RGBToV(r, g, b);
 573   }
 574 }
 575
 576 void ARGBToUV444Row_C(const uint8* src_argb,
 577                       uint8* dst_u, uint8* dst_v, int width) {
 578   int x;
 579   for (x = 0; x < width; ++x) {
 580     uint8 ab = src_argb[0];
 581     uint8 ag = src_argb[1];
 582     uint8 ar = src_argb[2];
 583     dst_u[0] = RGBToU(ar, ag, ab);
 584     dst_v[0] = RGBToV(ar, ag, ab);
 585     src_argb += 4;
 586     dst_u += 1;
 587     dst_v += 1;
 588   }
 589 }
 590
 591 void ARGBToUV422Row_C(const uint8* src_argb,
 592                       uint8* dst_u, uint8* dst_v, int width) {
 593   int x;
 594   for (x = 0; x < width - 1; x += 2) {
 595     uint8 ab = (src_argb[0] + src_argb[4]) >> 1;
 596     uint8 ag = (src_argb[1] + src_argb[5]) >> 1;
 597     uint8 ar = (src_argb[2] + src_argb[6]) >> 1;
 598     dst_u[0] = RGBToU(ar, ag, ab);
 599     dst_v[0] = RGBToV(ar, ag, ab);
 600     src_argb += 8;
 601     dst_u += 1;
 602     dst_v += 1;
 603   }
 604   if (width & 1) {
 605     uint8 ab = src_argb[0];
 606     uint8 ag = src_argb[1];
 607     uint8 ar = src_argb[2];
 608     dst_u[0] = RGBToU(ar, ag, ab);
 609     dst_v[0] = RGBToV(ar, ag, ab);
 610   }
 611 }
 612
 613 void ARGBToUV411Row_C(const uint8* src_argb,
 614                       uint8* dst_u, uint8* dst_v, int width) {
 615   int x;
 616   for (x = 0; x < width - 3; x += 4) {
 617     uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8] + src_argb[12]) >> 2;
 618     uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9] + src_argb[13]) >> 2;
 619     uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10] + src_argb[14]) >> 2;
 620     dst_u[0] = RGBToU(ar, ag, ab);
 621     dst_v[0] = RGBToV(ar, ag, ab);
 622     src_argb += 16;
 623     dst_u += 1;
 624     dst_v += 1;
 625   }
 626   if ((width & 3) == 3) {
 627     uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8]) / 3;
 628     uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9]) / 3;
 629     uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10]) / 3;
 630     dst_u[0] = RGBToU(ar, ag, ab);
 631     dst_v[0] = RGBToV(ar, ag, ab);
 632   } else if ((width & 3) == 2) {
 633     uint8 ab = (src_argb[0] + src_argb[4]) >> 1;
 634     uint8 ag = (src_argb[1] + src_argb[5]) >> 1;
 635     uint8 ar = (src_argb[2] + src_argb[6]) >> 1;
 636     dst_u[0] = RGBToU(ar, ag, ab);
 637     dst_v[0] = RGBToV(ar, ag, ab);
 638   } else if ((width & 3) == 1) {
 639     uint8 ab = src_argb[0];
 640     uint8 ag = src_argb[1];
 641     uint8 ar = src_argb[2];
 642     dst_u[0] = RGBToU(ar, ag, ab);
 643     dst_v[0] = RGBToV(ar, ag, ab);
 644   }
 645 }
 646
 647 void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
 648   int x;
 649   for (x = 0; x < width; ++x) {
 650     uint8 y = RGBToYJ(src_argb[2], src_argb[1], src_argb[0]);
 651     dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
 652     dst_argb[3] = src_argb[3];
 653     dst_argb += 4;
 654     src_argb += 4;
 655   }
 656 }
 657
 658 // Convert a row of image to Sepia tone.
 659 void ARGBSepiaRow_C(uint8* dst_argb, int width) {
 660   int x;
 661   for (x = 0; x < width; ++x) {
 662     int b = dst_argb[0];
 663     int g = dst_argb[1];
 664     int r = dst_argb[2];
 665     int sb = (b * 17 + g * 68 + r * 35) >> 7;
 666     int sg = (b * 22 + g * 88 + r * 45) >> 7;
 667     int sr = (b * 24 + g * 98 + r * 50) >> 7;
 668     // b does not over flow. a is preserved from original.
 669     dst_argb[0] = sb;
 670     dst_argb[1] = clamp255(sg);
 671     dst_argb[2] = clamp255(sr);
 672     dst_argb += 4;
 673   }
 674 }
 675
 676 // Apply color matrix to a row of image. Matrix is signed.
 677 // TODO(fbarchard): Consider adding rounding (+32).
 678 void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb,
 679                           const int8* matrix_argb, int width) {
 680   int x;
 681   for (x = 0; x < width; ++x) {
 682     int b = src_argb[0];
 683     int g = src_argb[1];
 684     int r = src_argb[2];
 685     int a = src_argb[3];
 686     int sb = (b * matrix_argb[0] + g * matrix_argb[1] +
 687               r * matrix_argb[2] + a * matrix_argb[3]) >> 6;
 688     int sg = (b * matrix_argb[4] + g * matrix_argb[5] +
 689               r * matrix_argb[6] + a * matrix_argb[7]) >> 6;
 690     int sr = (b * matrix_argb[8] + g * matrix_argb[9] +
 691               r * matrix_argb[10] + a * matrix_argb[11]) >> 6;
 692     int sa = (b * matrix_argb[12] + g * matrix_argb[13] +
 693               r * matrix_argb[14] + a * matrix_argb[15]) >> 6;
 694     dst_argb[0] = Clamp(sb);
 695     dst_argb[1] = Clamp(sg);
 696     dst_argb[2] = Clamp(sr);
 697     dst_argb[3] = Clamp(sa);
 698     src_argb += 4;
 699     dst_argb += 4;
 700   }
 701 }
 702
 703 // Apply color table to a row of image.
 704 void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) {
 705   int x;
 706   for (x = 0; x < width; ++x) {
 707     int b = dst_argb[0];
 708     int g = dst_argb[1];
 709     int r = dst_argb[2];
 710     int a = dst_argb[3];
 711     dst_argb[0] = table_argb[b * 4 + 0];
 712     dst_argb[1] = table_argb[g * 4 + 1];
 713     dst_argb[2] = table_argb[r * 4 + 2];
 714     dst_argb[3] = table_argb[a * 4 + 3];
 715     dst_argb += 4;
 716   }
 717 }
 718
 719 // Apply color table to a row of image.
 720 void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) {
 721   int x;
 722   for (x = 0; x < width; ++x) {
 723     int b = dst_argb[0];
 724     int g = dst_argb[1];
 725     int r = dst_argb[2];
 726     dst_argb[0] = table_argb[b * 4 + 0];
 727     dst_argb[1] = table_argb[g * 4 + 1];
 728     dst_argb[2] = table_argb[r * 4 + 2];
 729     dst_argb += 4;
 730   }
 731 }
 732
 733 void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size,
 734                        int interval_offset, int width) {
 735   int x;
 736   for (x = 0; x < width; ++x) {
 737     int b = dst_argb[0];
 738     int g = dst_argb[1];
 739     int r = dst_argb[2];
 740     dst_argb[0] = (b * scale >> 16) * interval_size + interval_offset;
 741     dst_argb[1] = (g * scale >> 16) * interval_size + interval_offset;
 742     dst_argb[2] = (r * scale >> 16) * interval_size + interval_offset;
 743     dst_argb += 4;
 744   }
 745 }
 746
 747 #define REPEAT8(v) (v) | ((v) << 8)
 748 #define SHADE(f, v) v * f >> 24
 749
 750 void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
 751                     uint32 value) {
 752   const uint32 b_scale = REPEAT8(value & 0xff);
 753   const uint32 g_scale = REPEAT8((value >> 8) & 0xff);
 754   const uint32 r_scale = REPEAT8((value >> 16) & 0xff);
 755   const uint32 a_scale = REPEAT8(value >> 24);
 756
 757   int i;
 758   for (i = 0; i < width; ++i) {
 759     const uint32 b = REPEAT8(src_argb[0]);
 760     const uint32 g = REPEAT8(src_argb[1]);
 761     const uint32 r = REPEAT8(src_argb[2]);
 762     const uint32 a = REPEAT8(src_argb[3]);
 763     dst_argb[0] = SHADE(b, b_scale);
 764     dst_argb[1] = SHADE(g, g_scale);
 765     dst_argb[2] = SHADE(r, r_scale);
 766     dst_argb[3] = SHADE(a, a_scale);
 767     src_argb += 4;
 768     dst_argb += 4;
 769   }
 770 }
 771 #undef REPEAT8
 772 #undef SHADE
 773
 774 #define REPEAT8(v) (v) | ((v) << 8)
 775 #define SHADE(f, v) v * f >> 16
 776
 777 void ARGBMultiplyRow_C(const uint8* src_argb0, const uint8* src_argb1,
 778                        uint8* dst_argb, int width) {
 779   int i;
 780   for (i = 0; i < width; ++i) {
 781     const uint32 b = REPEAT8(src_argb0[0]);
 782     const uint32 g = REPEAT8(src_argb0[1]);
 783     const uint32 r = REPEAT8(src_argb0[2]);
 784     const uint32 a = REPEAT8(src_argb0[3]);
 785     const uint32 b_scale = src_argb1[0];
 786     const uint32 g_scale = src_argb1[1];
 787     const uint32 r_scale = src_argb1[2];
 788     const uint32 a_scale = src_argb1[3];
 789     dst_argb[0] = SHADE(b, b_scale);
 790     dst_argb[1] = SHADE(g, g_scale);
 791     dst_argb[2] = SHADE(r, r_scale);
 792     dst_argb[3] = SHADE(a, a_scale);
 793     src_argb0 += 4;
 794     src_argb1 += 4;
 795     dst_argb += 4;
 796   }
 797 }
 798 #undef REPEAT8
 799 #undef SHADE
 800
 801 #define SHADE(f, v) clamp255(v + f)
 802
 803 void ARGBAddRow_C(const uint8* src_argb0, const uint8* src_argb1,
 804                   uint8* dst_argb, int width) {
 805   int i;
 806   for (i = 0; i < width; ++i) {
 807     const int b = src_argb0[0];
 808     const int g = src_argb0[1];
 809     const int r = src_argb0[2];
 810     const int a = src_argb0[3];
 811     const int b_add = src_argb1[0];
 812     const int g_add = src_argb1[1];
 813     const int r_add = src_argb1[2];
 814     const int a_add = src_argb1[3];
 815     dst_argb[0] = SHADE(b, b_add);
 816     dst_argb[1] = SHADE(g, g_add);
 817     dst_argb[2] = SHADE(r, r_add);
 818     dst_argb[3] = SHADE(a, a_add);
 819     src_argb0 += 4;
 820     src_argb1 += 4;
 821     dst_argb += 4;
 822   }
 823 }
 824 #undef SHADE
 825
 826 #define SHADE(f, v) clamp0(f - v)
 827
 828 void ARGBSubtractRow_C(const uint8* src_argb0, const uint8* src_argb1,
 829                        uint8* dst_argb, int width) {
 830   int i;
 831   for (i = 0; i < width; ++i) {
 832     const int b = src_argb0[0];
 833     const int g = src_argb0[1];
 834     const int r = src_argb0[2];
 835     const int a = src_argb0[3];
 836     const int b_sub = src_argb1[0];
 837     const int g_sub = src_argb1[1];
 838     const int r_sub = src_argb1[2];
 839     const int a_sub = src_argb1[3];
 840     dst_argb[0] = SHADE(b, b_sub);
 841     dst_argb[1] = SHADE(g, g_sub);
 842     dst_argb[2] = SHADE(r, r_sub);
 843     dst_argb[3] = SHADE(a, a_sub);
 844     src_argb0 += 4;
 845     src_argb1 += 4;
 846     dst_argb += 4;
 847   }
 848 }
 849 #undef SHADE
 850
 851 // Sobel functions which mimics SSSE3.
 852 void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2,
 853                  uint8* dst_sobelx, int width) {
 854   int i;
 855   for (i = 0; i < width; ++i) {
 856     int a = src_y0[i];
 857     int b = src_y1[i];
 858     int c = src_y2[i];
 859     int a_sub = src_y0[i + 2];
 860     int b_sub = src_y1[i + 2];
 861     int c_sub = src_y2[i + 2];
 862     int a_diff = a - a_sub;
 863     int b_diff = b - b_sub;
 864     int c_diff = c - c_sub;
 865     int sobel = Abs(a_diff + b_diff * 2 + c_diff);
 866     dst_sobelx[i] = (uint8)(clamp255(sobel));
 867   }
 868 }
 869
 870 void SobelYRow_C(const uint8* src_y0, const uint8* src_y1,
 871                  uint8* dst_sobely, int width) {
 872   int i;
 873   for (i = 0; i < width; ++i) {
 874     int a = src_y0[i + 0];
 875     int b = src_y0[i + 1];
 876     int c = src_y0[i + 2];
 877     int a_sub = src_y1[i + 0];
 878     int b_sub = src_y1[i + 1];
 879     int c_sub = src_y1[i + 2];
 880     int a_diff = a - a_sub;
 881     int b_diff = b - b_sub;
 882     int c_diff = c - c_sub;
 883     int sobel = Abs(a_diff + b_diff * 2 + c_diff);
 884     dst_sobely[i] = (uint8)(clamp255(sobel));
 885   }
 886 }
 887
 888 void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely,
 889                 uint8* dst_argb, int width) {
 890   int i;
 891   for (i = 0; i < width; ++i) {
 892     int r = src_sobelx[i];
 893     int b = src_sobely[i];
 894     int s = clamp255(r + b);
 895     dst_argb[0] = (uint8)(s);
 896     dst_argb[1] = (uint8)(s);
 897     dst_argb[2] = (uint8)(s);
 898     dst_argb[3] = (uint8)(255u);
 899     dst_argb += 4;
 900   }
 901 }
 902
 903 void SobelToPlaneRow_C(const uint8* src_sobelx, const uint8* src_sobely,
 904                        uint8* dst_y, int width) {
 905   int i;
 906   for (i = 0; i < width; ++i) {
 907     int r = src_sobelx[i];
 908     int b = src_sobely[i];
 909     int s = clamp255(r + b);
 910     dst_y[i] = (uint8)(s);
 911   }
 912 }
 913
 914 void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely,
 915                   uint8* dst_argb, int width) {
 916   int i;
 917   for (i = 0; i < width; ++i) {
 918     int r = src_sobelx[i];
 919     int b = src_sobely[i];
 920     int g = clamp255(r + b);
 921     dst_argb[0] = (uint8)(b);
 922     dst_argb[1] = (uint8)(g);
 923     dst_argb[2] = (uint8)(r);
 924     dst_argb[3] = (uint8)(255u);
 925     dst_argb += 4;
 926   }
 927 }
 928
 929 void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
 930   // Copy a Y to RGB.
 931   int x;
 932   for (x = 0; x < width; ++x) {
 933     uint8 y = src_y[0];
 934     dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
 935     dst_argb[3] = 255u;
 936     dst_argb += 4;
 937     ++src_y;
 938   }
 939 }
 940
 941 // C reference code that mimics the YUV assembly.
 942
 943 #define YG 74 /* (int8)(1.164 * 64 + 0.5) */
 944
 945 #define UB 127 /* min(63,(int8)(2.018 * 64)) */
 946 #define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
 947 #define UR 0
 948
 949 #define VB 0
 950 #define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
 951 #define VR 102 /* (int8)(1.596 * 64 + 0.5) */
 952
 953 // Bias
 954 #define BB UB * 128 + VB * 128
 955 #define BG UG * 128 + VG * 128
 956 #define BR UR * 128 + VR * 128
 957
 958 static __inline void YuvPixel(uint8 y, uint8 u, uint8 v,
 959                               uint8* b, uint8* g, uint8* r) {
 960   int32 y1 = ((int32)(y) - 16) * YG;
 961   *b = Clamp((int32)((u * UB + v * VB) - (BB) + y1) >> 6);
 962   *g = Clamp((int32)((u * UG + v * VG) - (BG) + y1) >> 6);
 963   *r = Clamp((int32)((u * UR + v * VR) - (BR) + y1) >> 6);
 964 }
 965
 966 #if !defined(LIBYUV_DISABLE_NEON) && \
 967     (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
 968 // C mimic assembly.
 969 // TODO(fbarchard): Remove subsampling from Neon.
 970 void I444ToARGBRow_C(const uint8* src_y,
 971                      const uint8* src_u,
 972                      const uint8* src_v,
 973                      uint8* rgb_buf,
 974                      int width) {
 975   int x;
 976   for (x = 0; x < width - 1; x += 2) {
 977     uint8 u = (src_u[0] + src_u[1] + 1) >> 1;
 978     uint8 v = (src_v[0] + src_v[1] + 1) >> 1;
 979     YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
 980     rgb_buf[3] = 255;
 981     YuvPixel(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
 982     rgb_buf[7] = 255;
 983     src_y += 2;
 984     src_u += 2;
 985     src_v += 2;
 986     rgb_buf += 8;  // Advance 2 pixels.
 987   }
 988   if (width & 1) {
 989     YuvPixel(src_y[0], src_u[0], src_v[0],
 990              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
 991   }
 992 }
 993 #else
 994 void I444ToARGBRow_C(const uint8* src_y,
 995                      const uint8* src_u,
 996                      const uint8* src_v,
 997                      uint8* rgb_buf,
 998                      int width) {
 999   int x;
1000   for (x = 0; x < width; ++x) {
1001     YuvPixel(src_y[0], src_u[0], src_v[0],
1002              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1003     rgb_buf[3] = 255;
1004     src_y += 1;
1005     src_u += 1;
1006     src_v += 1;
1007     rgb_buf += 4;  // Advance 1 pixel.
1008   }
1009 }
1010 #endif
1011 // Also used for 420
1012 void I422ToARGBRow_C(const uint8* src_y,
1013                      const uint8* src_u,
1014                      const uint8* src_v,
1015                      uint8* rgb_buf,
1016                      int width) {
1017   int x;
1018   for (x = 0; x < width - 1; x += 2) {
1019     YuvPixel(src_y[0], src_u[0], src_v[0],
1020              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1021     rgb_buf[3] = 255;
1022     YuvPixel(src_y[1], src_u[0], src_v[0],
1023              rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
1024     rgb_buf[7] = 255;
1025     src_y += 2;
1026     src_u += 1;
1027     src_v += 1;
1028     rgb_buf += 8;  // Advance 2 pixels.
1029   }
1030   if (width & 1) {
1031     YuvPixel(src_y[0], src_u[0], src_v[0],
1032              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1033     rgb_buf[3] = 255;
1034   }
1035 }
1036
1037 void I422ToRGB24Row_C(const uint8* src_y,
1038                       const uint8* src_u,
1039                       const uint8* src_v,
1040                       uint8* rgb_buf,
1041                       int width) {
1042   int x;
1043   for (x = 0; x < width - 1; x += 2) {
1044     YuvPixel(src_y[0], src_u[0], src_v[0],
1045              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1046     YuvPixel(src_y[1], src_u[0], src_v[0],
1047              rgb_buf + 3, rgb_buf + 4, rgb_buf + 5);
1048     src_y += 2;
1049     src_u += 1;
1050     src_v += 1;
1051     rgb_buf += 6;  // Advance 2 pixels.
1052   }
1053   if (width & 1) {
1054     YuvPixel(src_y[0], src_u[0], src_v[0],
1055              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1056   }
1057 }
1058
1059 void I422ToRAWRow_C(const uint8* src_y,
1060                     const uint8* src_u,
1061                     const uint8* src_v,
1062                     uint8* rgb_buf,
1063                     int width) {
1064   int x;
1065   for (x = 0; x < width - 1; x += 2) {
1066     YuvPixel(src_y[0], src_u[0], src_v[0],
1067              rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
1068     YuvPixel(src_y[1], src_u[0], src_v[0],
1069              rgb_buf + 5, rgb_buf + 4, rgb_buf + 3);
1070     src_y += 2;
1071     src_u += 1;
1072     src_v += 1;
1073     rgb_buf += 6;  // Advance 2 pixels.
1074   }
1075   if (width & 1) {
1076     YuvPixel(src_y[0], src_u[0], src_v[0],
1077              rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
1078   }
1079 }
1080
1081 void I422ToARGB4444Row_C(const uint8* src_y,
1082                          const uint8* src_u,
1083                          const uint8* src_v,
1084                          uint8* dst_argb4444,
1085                          int width) {
1086   uint8 b0;
1087   uint8 g0;
1088   uint8 r0;
1089   uint8 b1;
1090   uint8 g1;
1091   uint8 r1;
1092   int x;
1093   for (x = 0; x < width - 1; x += 2) {
1094     YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
1095     YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1);
1096     b0 = b0 >> 4;
1097     g0 = g0 >> 4;
1098     r0 = r0 >> 4;
1099     b1 = b1 >> 4;
1100     g1 = g1 >> 4;
1101     r1 = r1 >> 4;
1102     *(uint32*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) |
1103         (b1 << 16) | (g1 << 20) | (r1 << 24) | 0xf000f000;
1104     src_y += 2;
1105     src_u += 1;
1106     src_v += 1;
1107     dst_argb4444 += 4;  // Advance 2 pixels.
1108   }
1109   if (width & 1) {
1110     YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
1111     b0 = b0 >> 4;
1112     g0 = g0 >> 4;
1113     r0 = r0 >> 4;
1114     *(uint16*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) |
1115         0xf000;
1116   }
1117 }
1118
1119 void I422ToARGB1555Row_C(const uint8* src_y,
1120                          const uint8* src_u,
1121                          const uint8* src_v,
1122                          uint8* dst_argb1555,
1123                          int width) {
1124   uint8 b0;
1125   uint8 g0;
1126   uint8 r0;
1127   uint8 b1;
1128   uint8 g1;
1129   uint8 r1;
1130   int x;
1131   for (x = 0; x < width - 1; x += 2) {
1132     YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
1133     YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1);
1134     b0 = b0 >> 3;
1135     g0 = g0 >> 3;
1136     r0 = r0 >> 3;
1137     b1 = b1 >> 3;
1138     g1 = g1 >> 3;
1139     r1 = r1 >> 3;
1140     *(uint32*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) |
1141         (b1 << 16) | (g1 << 21) | (r1 << 26) | 0x80008000;
1142     src_y += 2;
1143     src_u += 1;
1144     src_v += 1;
1145     dst_argb1555 += 4;  // Advance 2 pixels.
1146   }
1147   if (width & 1) {
1148     YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
1149     b0 = b0 >> 3;
1150     g0 = g0 >> 3;
1151     r0 = r0 >> 3;
1152     *(uint16*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) |
1153         0x8000;
1154   }
1155 }
1156
1157 void I422ToRGB565Row_C(const uint8* src_y,
1158                        const uint8* src_u,
1159                        const uint8* src_v,
1160                        uint8* dst_rgb565,
1161                        int width) {
1162   uint8 b0;
1163   uint8 g0;
1164   uint8 r0;
1165   uint8 b1;
1166   uint8 g1;
1167   uint8 r1;
1168   int x;
1169   for (x = 0; x < width - 1; x += 2) {
1170     YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
1171     YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1);
1172     b0 = b0 >> 3;
1173     g0 = g0 >> 2;
1174     r0 = r0 >> 3;
1175     b1 = b1 >> 3;
1176     g1 = g1 >> 2;
1177     r1 = r1 >> 3;
1178     *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) |
1179         (b1 << 16) | (g1 << 21) | (r1 << 27);
1180     src_y += 2;
1181     src_u += 1;
1182     src_v += 1;
1183     dst_rgb565 += 4;  // Advance 2 pixels.
1184   }
1185   if (width & 1) {
1186     YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
1187     b0 = b0 >> 3;
1188     g0 = g0 >> 2;
1189     r0 = r0 >> 3;
1190     *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
1191   }
1192 }
1193
1194 void I411ToARGBRow_C(const uint8* src_y,
1195                      const uint8* src_u,
1196                      const uint8* src_v,
1197                      uint8* rgb_buf,
1198                      int width) {
1199   int x;
1200   for (x = 0; x < width - 3; x += 4) {
1201     YuvPixel(src_y[0], src_u[0], src_v[0],
1202              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1203     rgb_buf[3] = 255;
1204     YuvPixel(src_y[1], src_u[0], src_v[0],
1205              rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
1206     rgb_buf[7] = 255;
1207     YuvPixel(src_y[2], src_u[0], src_v[0],
1208              rgb_buf + 8, rgb_buf + 9, rgb_buf + 10);
1209     rgb_buf[11] = 255;
1210     YuvPixel(src_y[3], src_u[0], src_v[0],
1211              rgb_buf + 12, rgb_buf + 13, rgb_buf + 14);
1212     rgb_buf[15] = 255;
1213     src_y += 4;
1214     src_u += 1;
1215     src_v += 1;
1216     rgb_buf += 16;  // Advance 4 pixels.
1217   }
1218   if (width & 2) {
1219     YuvPixel(src_y[0], src_u[0], src_v[0],
1220              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1221     rgb_buf[3] = 255;
1222     YuvPixel(src_y[1], src_u[0], src_v[0],
1223              rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
1224     rgb_buf[7] = 255;
1225     src_y += 2;
1226     rgb_buf += 8;  // Advance 2 pixels.
1227   }
1228   if (width & 1) {
1229     YuvPixel(src_y[0], src_u[0], src_v[0],
1230              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1231     rgb_buf[3] = 255;
1232   }
1233 }
1234
1235 void NV12ToARGBRow_C(const uint8* src_y,
1236                      const uint8* usrc_v,
1237                      uint8* rgb_buf,
1238                      int width) {
1239   int x;
1240   for (x = 0; x < width - 1; x += 2) {
1241     YuvPixel(src_y[0], usrc_v[0], usrc_v[1],
1242              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1243     rgb_buf[3] = 255;
1244     YuvPixel(src_y[1], usrc_v[0], usrc_v[1],
1245              rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
1246     rgb_buf[7] = 255;
1247     src_y += 2;
1248     usrc_v += 2;
1249     rgb_buf += 8;  // Advance 2 pixels.
1250   }
1251   if (width & 1) {
1252     YuvPixel(src_y[0], usrc_v[0], usrc_v[1],
1253              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1254     rgb_buf[3] = 255;
1255   }
1256 }
1257
1258 void NV21ToARGBRow_C(const uint8* src_y,
1259                      const uint8* src_vu,
1260                      uint8* rgb_buf,
1261                      int width) {
1262   int x;
1263   for (x = 0; x < width - 1; x += 2) {
1264     YuvPixel(src_y[0], src_vu[1], src_vu[0],
1265              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1266     rgb_buf[3] = 255;
1267
1268     YuvPixel(src_y[1], src_vu[1], src_vu[0],
1269              rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
1270     rgb_buf[7] = 255;
1271
1272     src_y += 2;
1273     src_vu += 2;
1274     rgb_buf += 8;  // Advance 2 pixels.
1275   }
1276   if (width & 1) {
1277     YuvPixel(src_y[0], src_vu[1], src_vu[0],
1278              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1279     rgb_buf[3] = 255;
1280   }
1281 }
1282
1283 void NV12ToRGB565Row_C(const uint8* src_y,
1284                        const uint8* usrc_v,
1285                        uint8* dst_rgb565,
1286                        int width) {
1287   uint8 b0;
1288   uint8 g0;
1289   uint8 r0;
1290   uint8 b1;
1291   uint8 g1;
1292   uint8 r1;
1293   int x;
1294   for (x = 0; x < width - 1; x += 2) {
1295     YuvPixel(src_y[0], usrc_v[0], usrc_v[1], &b0, &g0, &r0);
1296     YuvPixel(src_y[1], usrc_v[0], usrc_v[1], &b1, &g1, &r1);
1297     b0 = b0 >> 3;
1298     g0 = g0 >> 2;
1299     r0 = r0 >> 3;
1300     b1 = b1 >> 3;
1301     g1 = g1 >> 2;
1302     r1 = r1 >> 3;
1303     *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) |
1304         (b1 << 16) | (g1 << 21) | (r1 << 27);
1305     src_y += 2;
1306     usrc_v += 2;
1307     dst_rgb565 += 4;  // Advance 2 pixels.
1308   }
1309   if (width & 1) {
1310     YuvPixel(src_y[0], usrc_v[0], usrc_v[1], &b0, &g0, &r0);
1311     b0 = b0 >> 3;
1312     g0 = g0 >> 2;
1313     r0 = r0 >> 3;
1314     *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
1315   }
1316 }
1317
1318 void NV21ToRGB565Row_C(const uint8* src_y,
1319                        const uint8* vsrc_u,
1320                        uint8* dst_rgb565,
1321                        int width) {
1322   uint8 b0;
1323   uint8 g0;
1324   uint8 r0;
1325   uint8 b1;
1326   uint8 g1;
1327   uint8 r1;
1328   int x;
1329   for (x = 0; x < width - 1; x += 2) {
1330     YuvPixel(src_y[0], vsrc_u[1], vsrc_u[0], &b0, &g0, &r0);
1331     YuvPixel(src_y[1], vsrc_u[1], vsrc_u[0], &b1, &g1, &r1);
1332     b0 = b0 >> 3;
1333     g0 = g0 >> 2;
1334     r0 = r0 >> 3;
1335     b1 = b1 >> 3;
1336     g1 = g1 >> 2;
1337     r1 = r1 >> 3;
1338     *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) |
1339         (b1 << 16) | (g1 << 21) | (r1 << 27);
1340     src_y += 2;
1341     vsrc_u += 2;
1342     dst_rgb565 += 4;  // Advance 2 pixels.
1343   }
1344   if (width & 1) {
1345     YuvPixel(src_y[0], vsrc_u[1], vsrc_u[0], &b0, &g0, &r0);
1346     b0 = b0 >> 3;
1347     g0 = g0 >> 2;
1348     r0 = r0 >> 3;
1349     *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
1350   }
1351 }
1352
1353 void YUY2ToARGBRow_C(const uint8* src_yuy2,
1354                      uint8* rgb_buf,
1355                      int width) {
1356   int x;
1357   for (x = 0; x < width - 1; x += 2) {
1358     YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3],
1359              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1360     rgb_buf[3] = 255;
1361     YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3],
1362              rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
1363     rgb_buf[7] = 255;
1364     src_yuy2 += 4;
1365     rgb_buf += 8;  // Advance 2 pixels.
1366   }
1367   if (width & 1) {
1368     YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3],
1369              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1370     rgb_buf[3] = 255;
1371   }
1372 }
1373
1374 void UYVYToARGBRow_C(const uint8* src_uyvy,
1375                      uint8* rgb_buf,
1376                      int width) {
1377   int x;
1378   for (x = 0; x < width - 1; x += 2) {
1379     YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2],
1380              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1381     rgb_buf[3] = 255;
1382     YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2],
1383              rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
1384     rgb_buf[7] = 255;
1385     src_uyvy += 4;
1386     rgb_buf += 8;  // Advance 2 pixels.
1387   }
1388   if (width & 1) {
1389     YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2],
1390              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1391     rgb_buf[3] = 255;
1392   }
1393 }
1394
1395 void I422ToBGRARow_C(const uint8* src_y,
1396                      const uint8* src_u,
1397                      const uint8* src_v,
1398                      uint8* rgb_buf,
1399                      int width) {
1400   int x;
1401   for (x = 0; x < width - 1; x += 2) {
1402     YuvPixel(src_y[0], src_u[0], src_v[0],
1403              rgb_buf + 3, rgb_buf + 2, rgb_buf + 1);
1404     rgb_buf[0] = 255;
1405     YuvPixel(src_y[1], src_u[0], src_v[0],
1406              rgb_buf + 7, rgb_buf + 6, rgb_buf + 5);
1407     rgb_buf[4] = 255;
1408     src_y += 2;
1409     src_u += 1;
1410     src_v += 1;
1411     rgb_buf += 8;  // Advance 2 pixels.
1412   }
1413   if (width & 1) {
1414     YuvPixel(src_y[0], src_u[0], src_v[0],
1415              rgb_buf + 3, rgb_buf + 2, rgb_buf + 1);
1416     rgb_buf[0] = 255;
1417   }
1418 }
1419
1420 void I422ToABGRRow_C(const uint8* src_y,
1421                      const uint8* src_u,
1422                      const uint8* src_v,
1423                      uint8* rgb_buf,
1424                      int width) {
1425   int x;
1426   for (x = 0; x < width - 1; x += 2) {
1427     YuvPixel(src_y[0], src_u[0], src_v[0],
1428              rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
1429     rgb_buf[3] = 255;
1430     YuvPixel(src_y[1], src_u[0], src_v[0],
1431              rgb_buf + 6, rgb_buf + 5, rgb_buf + 4);
1432     rgb_buf[7] = 255;
1433     src_y += 2;
1434     src_u += 1;
1435     src_v += 1;
1436     rgb_buf += 8;  // Advance 2 pixels.
1437   }
1438   if (width & 1) {
1439     YuvPixel(src_y[0], src_u[0], src_v[0],
1440              rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
1441     rgb_buf[3] = 255;
1442   }
1443 }
1444
1445 void I422ToRGBARow_C(const uint8* src_y,
1446                      const uint8* src_u,
1447                      const uint8* src_v,
1448                      uint8* rgb_buf,
1449                      int width) {
1450   int x;
1451   for (x = 0; x < width - 1; x += 2) {
1452     YuvPixel(src_y[0], src_u[0], src_v[0],
1453              rgb_buf + 1, rgb_buf + 2, rgb_buf + 3);
1454     rgb_buf[0] = 255;
1455     YuvPixel(src_y[1], src_u[0], src_v[0],
1456              rgb_buf + 5, rgb_buf + 6, rgb_buf + 7);
1457     rgb_buf[4] = 255;
1458     src_y += 2;
1459     src_u += 1;
1460     src_v += 1;
1461     rgb_buf += 8;  // Advance 2 pixels.
1462   }
1463   if (width & 1) {
1464     YuvPixel(src_y[0], src_u[0], src_v[0],
1465              rgb_buf + 1, rgb_buf + 2, rgb_buf + 3);
1466     rgb_buf[0] = 255;
1467   }
1468 }
1469
1470 void YToARGBRow_C(const uint8* src_y, uint8* rgb_buf, int width) {
1471   int x;
1472   for (x = 0; x < width - 1; x += 2) {
1473     YuvPixel(src_y[0], 128, 128,
1474              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1475     rgb_buf[3] = 255;
1476     YuvPixel(src_y[1], 128, 128,
1477              rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
1478     rgb_buf[7] = 255;
1479     src_y += 2;
1480     rgb_buf += 8;  // Advance 2 pixels.
1481   }
1482   if (width & 1) {
1483     YuvPixel(src_y[0], 128, 128,
1484              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1485     rgb_buf[3] = 255;
1486   }
1487 }
1488
1489 void MirrorRow_C(const uint8* src, uint8* dst, int width) {
1490   int x;
1491   src += width - 1;
1492   for (x = 0; x < width - 1; x += 2) {
1493     dst[x] = src[0];
1494     dst[x + 1] = src[-1];
1495     src -= 2;
1496   }
1497   if (width & 1) {
1498     dst[width - 1] = src[0];
1499   }
1500 }
1501
1502 void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
1503   int x;
1504   src_uv += (width - 1) << 1;
1505   for (x = 0; x < width - 1; x += 2) {
1506     dst_u[x] = src_uv[0];
1507     dst_u[x + 1] = src_uv[-2];
1508     dst_v[x] = src_uv[1];
1509     dst_v[x + 1] = src_uv[-2 + 1];
1510     src_uv -= 4;
1511   }
1512   if (width & 1) {
1513     dst_u[width - 1] = src_uv[0];
1514     dst_v[width - 1] = src_uv[1];
1515   }
1516 }
1517
1518 void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width) {
1519   int x;
1520   const uint32* src32 = (const uint32*)(src);
1521   uint32* dst32 = (uint32*)(dst);
1522   src32 += width - 1;
1523   for (x = 0; x < width - 1; x += 2) {
1524     dst32[x] = src32[0];
1525     dst32[x + 1] = src32[-1];
1526     src32 -= 2;
1527   }
1528   if (width & 1) {
1529     dst32[width - 1] = src32[0];
1530   }
1531 }
1532
1533 void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
1534   int x;
1535   for (x = 0; x < width - 1; x += 2) {
1536     dst_u[x] = src_uv[0];
1537     dst_u[x + 1] = src_uv[2];
1538     dst_v[x] = src_uv[1];
1539     dst_v[x + 1] = src_uv[3];
1540     src_uv += 4;
1541   }
1542   if (width & 1) {
1543     dst_u[width - 1] = src_uv[0];
1544     dst_v[width - 1] = src_uv[1];
1545   }
1546 }
1547
1548 void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
1549                   int width) {
1550   int x;
1551   for (x = 0; x < width - 1; x += 2) {
1552     dst_uv[0] = src_u[x];
1553     dst_uv[1] = src_v[x];
1554     dst_uv[2] = src_u[x + 1];
1555     dst_uv[3] = src_v[x + 1];
1556     dst_uv += 4;
1557   }
1558   if (width & 1) {
1559     dst_uv[0] = src_u[width - 1];
1560     dst_uv[1] = src_v[width - 1];
1561   }
1562 }
1563
1564 void CopyRow_C(const uint8* src, uint8* dst, int count) {
1565   memcpy(dst, src, count);
1566 }
1567
1568 void SetRow_C(uint8* dst, uint32 v8, int count) {
1569 #ifdef _MSC_VER
1570   // VC will generate rep stosb.
1571   int x;
1572   for (x = 0; x < count; ++x) {
1573     dst[x] = v8;
1574   }
1575 #else
1576   memset(dst, v8, count);
1577 #endif
1578 }
1579
1580 void ARGBSetRows_C(uint8* dst, uint32 v32, int width,
1581                  int dst_stride, int height) {
1582   int y;
1583   for (y = 0; y < height; ++y) {
1584     uint32* d = (uint32*)(dst);
1585     int x;
1586     for (x = 0; x < width; ++x) {
1587       d[x] = v32;
1588     }
1589     dst += dst_stride;
1590   }
1591 }
1592
1593 // Filter 2 rows of YUY2 UV's (422) into U and V (420).
1594 void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2,
1595                    uint8* dst_u, uint8* dst_v, int width) {
1596   // Output a row of UV values, filtering 2 rows of YUY2.
1597   int x;
1598   for (x = 0; x < width; x += 2) {
1599     dst_u[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1;
1600     dst_v[0] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1;
1601     src_yuy2 += 4;
1602     dst_u += 1;
1603     dst_v += 1;
1604   }
1605 }
1606
1607 // Copy row of YUY2 UV's (422) into U and V (422).
1608 void YUY2ToUV422Row_C(const uint8* src_yuy2,
1609                       uint8* dst_u, uint8* dst_v, int width) {
1610   // Output a row of UV values.
1611   int x;
1612   for (x = 0; x < width; x += 2) {
1613     dst_u[0] = src_yuy2[1];
1614     dst_v[0] = src_yuy2[3];
1615     src_yuy2 += 4;
1616     dst_u += 1;
1617     dst_v += 1;
1618   }
1619 }
1620
1621 // Copy row of YUY2 Y's (422) into Y (420/422).
1622 void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) {
1623   // Output a row of Y values.
1624   int x;
1625   for (x = 0; x < width - 1; x += 2) {
1626     dst_y[x] = src_yuy2[0];
1627     dst_y[x + 1] = src_yuy2[2];
1628     src_yuy2 += 4;
1629   }
1630   if (width & 1) {
1631     dst_y[width - 1] = src_yuy2[0];
1632   }
1633 }
1634
1635 // Filter 2 rows of UYVY UV's (422) into U and V (420).
1636 void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy,
1637                    uint8* dst_u, uint8* dst_v, int width) {
1638   // Output a row of UV values.
1639   int x;
1640   for (x = 0; x < width; x += 2) {
1641     dst_u[0] = (src_uyvy[0] + src_uyvy[src_stride_uyvy + 0] + 1) >> 1;
1642     dst_v[0] = (src_uyvy[2] + src_uyvy[src_stride_uyvy + 2] + 1) >> 1;
1643     src_uyvy += 4;
1644     dst_u += 1;
1645     dst_v += 1;
1646   }
1647 }
1648
1649 // Copy row of UYVY UV's (422) into U and V (422).
1650 void UYVYToUV422Row_C(const uint8* src_uyvy,
1651                       uint8* dst_u, uint8* dst_v, int width) {
1652   // Output a row of UV values.
1653   int x;
1654   for (x = 0; x < width; x += 2) {
1655     dst_u[0] = src_uyvy[0];
1656     dst_v[0] = src_uyvy[2];
1657     src_uyvy += 4;
1658     dst_u += 1;
1659     dst_v += 1;
1660   }
1661 }
1662
1663 // Copy row of UYVY Y's (422) into Y (420/422).
1664 void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width) {
1665   // Output a row of Y values.
1666   int x;
1667   for (x = 0; x < width - 1; x += 2) {
1668     dst_y[x] = src_uyvy[1];
1669     dst_y[x + 1] = src_uyvy[3];
1670     src_uyvy += 4;
1671   }
1672   if (width & 1) {
1673     dst_y[width - 1] = src_uyvy[1];
1674   }
1675 }
1676
1677 #define BLEND(f, b, a) (((256 - a) * b) >> 8) + f
1678
1679 // Blend src_argb0 over src_argb1 and store to dst_argb.
1680 // dst_argb may be src_argb0 or src_argb1.
1681 // This code mimics the SSSE3 version for better testability.
1682 void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1,
1683                     uint8* dst_argb, int width) {
1684   int x;
1685   for (x = 0; x < width - 1; x += 2) {
1686     uint32 fb = src_argb0[0];
1687     uint32 fg = src_argb0[1];
1688     uint32 fr = src_argb0[2];
1689     uint32 a = src_argb0[3];
1690     uint32 bb = src_argb1[0];
1691     uint32 bg = src_argb1[1];
1692     uint32 br = src_argb1[2];
1693     dst_argb[0] = BLEND(fb, bb, a);
1694     dst_argb[1] = BLEND(fg, bg, a);
1695     dst_argb[2] = BLEND(fr, br, a);
1696     dst_argb[3] = 255u;
1697
1698     fb = src_argb0[4 + 0];
1699     fg = src_argb0[4 + 1];
1700     fr = src_argb0[4 + 2];
1701     a = src_argb0[4 + 3];
1702     bb = src_argb1[4 + 0];
1703     bg = src_argb1[4 + 1];
1704     br = src_argb1[4 + 2];
1705     dst_argb[4 + 0] = BLEND(fb, bb, a);
1706     dst_argb[4 + 1] = BLEND(fg, bg, a);
1707     dst_argb[4 + 2] = BLEND(fr, br, a);
1708     dst_argb[4 + 3] = 255u;
1709     src_argb0 += 8;
1710     src_argb1 += 8;
1711     dst_argb += 8;
1712   }
1713
1714   if (width & 1) {
1715     uint32 fb = src_argb0[0];
1716     uint32 fg = src_argb0[1];
1717     uint32 fr = src_argb0[2];
1718     uint32 a = src_argb0[3];
1719     uint32 bb = src_argb1[0];
1720     uint32 bg = src_argb1[1];
1721     uint32 br = src_argb1[2];
1722     dst_argb[0] = BLEND(fb, bb, a);
1723     dst_argb[1] = BLEND(fg, bg, a);
1724     dst_argb[2] = BLEND(fr, br, a);
1725     dst_argb[3] = 255u;
1726   }
1727 }
1728 #undef BLEND
1729 #define ATTENUATE(f, a) (a | (a << 8)) * (f | (f << 8)) >> 24
1730
1731 // Multiply source RGB by alpha and store to destination.
1732 // This code mimics the SSSE3 version for better testability.
1733 void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
1734   int i;
1735   for (i = 0; i < width - 1; i += 2) {
1736     uint32 b = src_argb[0];
1737     uint32 g = src_argb[1];
1738     uint32 r = src_argb[2];
1739     uint32 a = src_argb[3];
1740     dst_argb[0] = ATTENUATE(b, a);
1741     dst_argb[1] = ATTENUATE(g, a);
1742     dst_argb[2] = ATTENUATE(r, a);
1743     dst_argb[3] = a;
1744     b = src_argb[4];
1745     g = src_argb[5];
1746     r = src_argb[6];
1747     a = src_argb[7];
1748     dst_argb[4] = ATTENUATE(b, a);
1749     dst_argb[5] = ATTENUATE(g, a);
1750     dst_argb[6] = ATTENUATE(r, a);
1751     dst_argb[7] = a;
1752     src_argb += 8;
1753     dst_argb += 8;
1754   }
1755
1756   if (width & 1) {
1757     const uint32 b = src_argb[0];
1758     const uint32 g = src_argb[1];
1759     const uint32 r = src_argb[2];
1760     const uint32 a = src_argb[3];
1761     dst_argb[0] = ATTENUATE(b, a);
1762     dst_argb[1] = ATTENUATE(g, a);
1763     dst_argb[2] = ATTENUATE(r, a);
1764     dst_argb[3] = a;
1765   }
1766 }
1767 #undef ATTENUATE
1768
1769 // Divide source RGB by alpha and store to destination.
1770 // b = (b * 255 + (a / 2)) / a;
1771 // g = (g * 255 + (a / 2)) / a;
1772 // r = (r * 255 + (a / 2)) / a;
1773 // Reciprocal method is off by 1 on some values. ie 125
1774 // 8.8 fixed point inverse table with 1.0 in upper short and 1 / a in lower.
1775 #define T(a) 0x01000000 + (0x10000 / a)
1776 const uint32 fixed_invtbl8[256] = {
1777   0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07),
1778   T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f),
1779   T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17),
1780   T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f),
1781   T(0x20), T(0x21), T(0x22), T(0x23), T(0x24), T(0x25), T(0x26), T(0x27),
1782   T(0x28), T(0x29), T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f),
1783   T(0x30), T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37),
1784   T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), T(0x3f),
1785   T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), T(0x46), T(0x47),
1786   T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), T(0x4d), T(0x4e), T(0x4f),
1787   T(0x50), T(0x51), T(0x52), T(0x53), T(0x54), T(0x55), T(0x56), T(0x57),
1788   T(0x58), T(0x59), T(0x5a), T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f),
1789   T(0x60), T(0x61), T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67),
1790   T(0x68), T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f),
1791   T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), T(0x77),
1792   T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), T(0x7e), T(0x7f),
1793   T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), T(0x85), T(0x86), T(0x87),
1794   T(0x88), T(0x89), T(0x8a), T(0x8b), T(0x8c), T(0x8d), T(0x8e), T(0x8f),
1795   T(0x90), T(0x91), T(0x92), T(0x93), T(0x94), T(0x95), T(0x96), T(0x97),
1796   T(0x98), T(0x99), T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f),
1797   T(0xa0), T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7),
1798   T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), T(0xaf),
1799   T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), T(0xb6), T(0xb7),
1800   T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), T(0xbd), T(0xbe), T(0xbf),
1801   T(0xc0), T(0xc1), T(0xc2), T(0xc3), T(0xc4), T(0xc5), T(0xc6), T(0xc7),
1802   T(0xc8), T(0xc9), T(0xca), T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf),
1803   T(0xd0), T(0xd1), T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7),
1804   T(0xd8), T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf),
1805   T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), T(0xe7),
1806   T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), T(0xee), T(0xef),
1807   T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), T(0xf5), T(0xf6), T(0xf7),
1808   T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), 0x01000100 };
1809 #undef T
1810
1811 void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
1812   int i;
1813   for (i = 0; i < width; ++i) {
1814     uint32 b = src_argb[0];
1815     uint32 g = src_argb[1];
1816     uint32 r = src_argb[2];
1817     const uint32 a = src_argb[3];
1818     const uint32 ia = fixed_invtbl8[a] & 0xffff;  // 8.8 fixed point
1819     b = (b * ia) >> 8;
1820     g = (g * ia) >> 8;
1821     r = (r * ia) >> 8;
1822     // Clamping should not be necessary but is free in assembly.
1823     dst_argb[0] = clamp255(b);
1824     dst_argb[1] = clamp255(g);
1825     dst_argb[2] = clamp255(r);
1826     dst_argb[3] = a;
1827     src_argb += 4;
1828     dst_argb += 4;
1829   }
1830 }
1831
1832 void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
1833                                const int32* previous_cumsum, int width) {
1834   int32 row_sum[4] = {0, 0, 0, 0};
1835   int x;
1836   for (x = 0; x < width; ++x) {
1837     row_sum[0] += row[x * 4 + 0];
1838     row_sum[1] += row[x * 4 + 1];
1839     row_sum[2] += row[x * 4 + 2];
1840     row_sum[3] += row[x * 4 + 3];
1841     cumsum[x * 4 + 0] = row_sum[0]  + previous_cumsum[x * 4 + 0];
1842     cumsum[x * 4 + 1] = row_sum[1]  + previous_cumsum[x * 4 + 1];
1843     cumsum[x * 4 + 2] = row_sum[2]  + previous_cumsum[x * 4 + 2];
1844     cumsum[x * 4 + 3] = row_sum[3]  + previous_cumsum[x * 4 + 3];
1845   }
1846 }
1847
1848 void CumulativeSumToAverageRow_C(const int32* tl, const int32* bl,
1849                                 int w, int area, uint8* dst, int count) {
1850   float ooa = 1.0f / area;
1851   int i;
1852   for (i = 0; i < count; ++i) {
1853     dst[0] = (uint8)((bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * ooa);
1854     dst[1] = (uint8)((bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * ooa);
1855     dst[2] = (uint8)((bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * ooa);
1856     dst[3] = (uint8)((bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * ooa);
1857     dst += 4;
1858     tl += 4;
1859     bl += 4;
1860   }
1861 }
1862
1863 // Copy pixels from rotated source to destination row with a slope.
1864 LIBYUV_API
1865 void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
1866                      uint8* dst_argb, const float* uv_dudv, int width) {
1867   int i;
1868   // Render a row of pixels from source into a buffer.
1869   float uv[2];
1870   uv[0] = uv_dudv[0];
1871   uv[1] = uv_dudv[1];
1872   for (i = 0; i < width; ++i) {
1873     int x = (int)(uv[0]);
1874     int y = (int)(uv[1]);
1875     *(uint32*)(dst_argb) =
1876         *(const uint32*)(src_argb + y * src_argb_stride +
1877                                          x * 4);
1878     dst_argb += 4;
1879     uv[0] += uv_dudv[2];
1880     uv[1] += uv_dudv[3];
1881   }
1882 }
1883
1884 // Blend 2 rows into 1 for conversions such as I422ToI420.
1885 void HalfRow_C(const uint8* src_uv, int src_uv_stride,
1886                uint8* dst_uv, int pix) {
1887   int x;
1888   for (x = 0; x < pix; ++x) {
1889     dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
1890   }
1891 }
1892
1893 // C version 2x2 -> 2x1.
1894 void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
1895                       ptrdiff_t src_stride,
1896                       int width, int source_y_fraction) {
1897   int y1_fraction = source_y_fraction;
1898   int y0_fraction = 256 - y1_fraction;
1899   const uint8* src_ptr1 = src_ptr + src_stride;
1900   int x;
1901   if (source_y_fraction == 0) {
1902     memcpy(dst_ptr, src_ptr, width);
1903     return;
1904   }
1905   if (source_y_fraction == 128) {
1906     HalfRow_C(src_ptr, (int)(src_stride), dst_ptr, width);
1907     return;
1908   }
1909   for (x = 0; x < width - 1; x += 2) {
1910     dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
1911     dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
1912     src_ptr += 2;
1913     src_ptr1 += 2;
1914     dst_ptr += 2;
1915   }
1916   if (width & 1) {
1917     dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
1918   }
1919 }
1920
1921 // Select 2 channels from ARGB on alternating pixels.  e.g.  BGBGBGBG
1922 void ARGBToBayerRow_C(const uint8* src_argb,
1923                       uint8* dst_bayer, uint32 selector, int pix) {
1924   int index0 = selector & 0xff;
1925   int index1 = (selector >> 8) & 0xff;
1926   // Copy a row of Bayer.
1927   int x;
1928   for (x = 0; x < pix - 1; x += 2) {
1929     dst_bayer[0] = src_argb[index0];
1930     dst_bayer[1] = src_argb[index1];
1931     src_argb += 8;
1932     dst_bayer += 2;
1933   }
1934   if (pix & 1) {
1935     dst_bayer[0] = src_argb[index0];
1936   }
1937 }
1938
1939 // Select G channel from ARGB.  e.g.  GGGGGGGG
1940 void ARGBToBayerGGRow_C(const uint8* src_argb,
1941                         uint8* dst_bayer, uint32 selector, int pix) {
1942   // Copy a row of G.
1943   int x;
1944   for (x = 0; x < pix - 1; x += 2) {
1945     dst_bayer[0] = src_argb[1];
1946     dst_bayer[1] = src_argb[5];
1947     src_argb += 8;
1948     dst_bayer += 2;
1949   }
1950   if (pix & 1) {
1951     dst_bayer[0] = src_argb[1];
1952   }
1953 }
1954
1955 // Use first 4 shuffler values to reorder ARGB channels.
1956 void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,
1957                       const uint8* shuffler, int pix) {
1958   int index0 = shuffler[0];
1959   int index1 = shuffler[1];
1960   int index2 = shuffler[2];
1961   int index3 = shuffler[3];
1962   // Shuffle a row of ARGB.
1963   int x;
1964   for (x = 0; x < pix; ++x) {
1965     // To support in-place conversion.
1966     uint8 b = src_argb[index0];
1967     uint8 g = src_argb[index1];
1968     uint8 r = src_argb[index2];
1969     uint8 a = src_argb[index3];
1970     dst_argb[0] = b;
1971     dst_argb[1] = g;
1972     dst_argb[2] = r;
1973     dst_argb[3] = a;
1974     src_argb += 4;
1975     dst_argb += 4;
1976   }
1977 }
1978
1979 void I422ToYUY2Row_C(const uint8* src_y,
1980                      const uint8* src_u,
1981                      const uint8* src_v,
1982                      uint8* dst_frame, int width) {
1983   int x;
1984   for (x = 0; x < width - 1; x += 2) {
1985     dst_frame[0] = src_y[0];
1986     dst_frame[1] = src_u[0];
1987     dst_frame[2] = src_y[1];
1988     dst_frame[3] = src_v[0];
1989     dst_frame += 4;
1990     src_y += 2;
1991     src_u += 1;
1992     src_v += 1;
1993   }
1994   if (width & 1) {
1995     dst_frame[0] = src_y[0];
1996     dst_frame[1] = src_u[0];
1997     dst_frame[2] = src_y[0];  // duplicate last y
1998     dst_frame[3] = src_v[0];
1999   }
2000 }
2001
2002 void I422ToUYVYRow_C(const uint8* src_y,
2003                      const uint8* src_u,
2004                      const uint8* src_v,
2005                      uint8* dst_frame, int width) {
2006   int x;
2007   for (x = 0; x < width - 1; x += 2) {
2008     dst_frame[0] = src_u[0];
2009     dst_frame[1] = src_y[0];
2010     dst_frame[2] = src_v[0];
2011     dst_frame[3] = src_y[1];
2012     dst_frame += 4;
2013     src_y += 2;
2014     src_u += 1;
2015     src_v += 1;
2016   }
2017   if (width & 1) {
2018     dst_frame[0] = src_u[0];
2019     dst_frame[1] = src_y[0];
2020     dst_frame[2] = src_v[0];
2021     dst_frame[3] = src_y[0];  // duplicate last y
2022   }
2023 }
2024
2025 #if !defined(LIBYUV_DISABLE_X86) && defined(HAS_I422TOARGBROW_SSSE3)
2026 // row_win.cc has asm version, but GCC uses 2 step wrapper.
2027 #if !defined(_MSC_VER) && (defined(__x86_64__) || defined(__i386__))
2028 void I422ToRGB565Row_SSSE3(const uint8* src_y,
2029                            const uint8* src_u,
2030                            const uint8* src_v,
2031                            uint8* rgb_buf,
2032                            int width) {
2033   // Allocate a row of ARGB.
2034   align_buffer_64(row, width * 4);
2035   I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width);
2036   ARGBToRGB565Row_SSE2(row, rgb_buf, width);
2037   free_aligned_buffer_64(row);
2038 }
2039 #endif  // !defined(_MSC_VER) && (defined(__x86_64__) || defined(__i386__))
2040
2041 #if defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)
2042 void I422ToARGB1555Row_SSSE3(const uint8* src_y,
2043                              const uint8* src_u,
2044                              const uint8* src_v,
2045                              uint8* rgb_buf,
2046                              int width) {
2047   // Allocate a row of ARGB.
2048   align_buffer_64(row, width * 4);
2049   I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width);
2050   ARGBToARGB1555Row_SSE2(row, rgb_buf, width);
2051   free_aligned_buffer_64(row);
2052 }
2053
2054 void I422ToARGB4444Row_SSSE3(const uint8* src_y,
2055                              const uint8* src_u,
2056                              const uint8* src_v,
2057                              uint8* rgb_buf,
2058                              int width) {
2059   // Allocate a row of ARGB.
2060   align_buffer_64(row, width * 4);
2061   I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width);
2062   ARGBToARGB4444Row_SSE2(row, rgb_buf, width);
2063   free_aligned_buffer_64(row);
2064 }
2065
2066 void NV12ToRGB565Row_SSSE3(const uint8* src_y,
2067                            const uint8* src_uv,
2068                            uint8* dst_rgb565,
2069                            int width) {
2070   // Allocate a row of ARGB.
2071   align_buffer_64(row, width * 4);
2072   NV12ToARGBRow_SSSE3(src_y, src_uv, row, width);
2073   ARGBToRGB565Row_SSE2(row, dst_rgb565, width);
2074   free_aligned_buffer_64(row);
2075 }
2076
2077 void NV21ToRGB565Row_SSSE3(const uint8* src_y,
2078                            const uint8* src_vu,
2079                            uint8* dst_rgb565,
2080                            int width) {
2081   // Allocate a row of ARGB.
2082   align_buffer_64(row, width * 4);
2083   NV21ToARGBRow_SSSE3(src_y, src_vu, row, width);
2084   ARGBToRGB565Row_SSE2(row, dst_rgb565, width);
2085   free_aligned_buffer_64(row);
2086 }
2087
2088 void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
2089                          uint8* dst_argb,
2090                          int width) {
2091   // Allocate a rows of yuv.
2092   align_buffer_64(row_y, ((width + 63) & ~63) * 2);
2093   uint8* row_u = row_y + ((width + 63) & ~63);
2094   uint8* row_v = row_u + ((width + 63) & ~63) / 2;
2095   YUY2ToUV422Row_SSE2(src_yuy2, row_u, row_v, width);
2096   YUY2ToYRow_SSE2(src_yuy2, row_y, width);
2097   I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, width);
2098   free_aligned_buffer_64(row_y);
2099 }
2100
2101 void YUY2ToARGBRow_Unaligned_SSSE3(const uint8* src_yuy2,
2102                                    uint8* dst_argb,
2103                                    int width) {
2104   // Allocate a rows of yuv.
2105   align_buffer_64(row_y, ((width + 63) & ~63) * 2);
2106   uint8* row_u = row_y + ((width + 63) & ~63);
2107   uint8* row_v = row_u + ((width + 63) & ~63) / 2;
2108   YUY2ToUV422Row_Unaligned_SSE2(src_yuy2, row_u, row_v, width);
2109   YUY2ToYRow_Unaligned_SSE2(src_yuy2, row_y, width);
2110   I422ToARGBRow_Unaligned_SSSE3(row_y, row_u, row_v, dst_argb, width);
2111   free_aligned_buffer_64(row_y);
2112 }
2113
2114 void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
2115                          uint8* dst_argb,
2116                          int width) {
2117   // Allocate a rows of yuv.
2118   align_buffer_64(row_y, ((width + 63) & ~63) * 2);
2119   uint8* row_u = row_y + ((width + 63) & ~63);
2120   uint8* row_v = row_u + ((width + 63) & ~63) / 2;
2121   UYVYToUV422Row_SSE2(src_uyvy, row_u, row_v, width);
2122   UYVYToYRow_SSE2(src_uyvy, row_y, width);
2123   I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, width);
2124   free_aligned_buffer_64(row_y);
2125 }
2126
2127 void UYVYToARGBRow_Unaligned_SSSE3(const uint8* src_uyvy,
2128                                    uint8* dst_argb,
2129                                    int width) {
2130   // Allocate a rows of yuv.
2131   align_buffer_64(row_y, ((width + 63) & ~63) * 2);
2132   uint8* row_u = row_y + ((width + 63) & ~63);
2133   uint8* row_v = row_u + ((width + 63) & ~63) / 2;
2134   UYVYToUV422Row_Unaligned_SSE2(src_uyvy, row_u, row_v, width);
2135   UYVYToYRow_Unaligned_SSE2(src_uyvy, row_y, width);
2136   I422ToARGBRow_Unaligned_SSSE3(row_y, row_u, row_v, dst_argb, width);
2137   free_aligned_buffer_64(row_y);
2138 }
2139
2140 #endif  // defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)
2141 #endif  // !defined(LIBYUV_DISABLE_X86)
2142
2143 void ARGBPolynomialRow_C(const uint8* src_argb,
2144                          uint8* dst_argb, const float* poly,
2145                          int width) {
2146   int i;
2147   for (i = 0; i < width; ++i) {
2148     float b = (float)(src_argb[0]);
2149     float g = (float)(src_argb[1]);
2150     float r = (float)(src_argb[2]);
2151     float a = (float)(src_argb[3]);
2152     float b2 = b * b;
2153     float g2 = g * g;
2154     float r2 = r * r;
2155     float a2 = a * a;
2156     float db = poly[0] + poly[4] * b;
2157     float dg = poly[1] + poly[5] * g;
2158     float dr = poly[2] + poly[6] * r;
2159     float da = poly[3] + poly[7] * a;
2160     float b3 = b2 * b;
2161     float g3 = g2 * g;
2162     float r3 = r2 * r;
2163     float a3 = a2 * a;
2164     db += poly[8] * b2;
2165     dg += poly[9] * g2;
2166     dr += poly[10] * r2;
2167     da += poly[11] * a2;
2168     db += poly[12] * b3;
2169     dg += poly[13] * g3;
2170     dr += poly[14] * r3;
2171     da += poly[15] * a3;
2172
2173     dst_argb[0] = Clamp((int32)(db));
2174     dst_argb[1] = Clamp((int32)(dg));
2175     dst_argb[2] = Clamp((int32)(dr));
2176     dst_argb[3] = Clamp((int32)(da));
2177     src_argb += 4;
2178     dst_argb += 4;
2179   }
2180 }
2181
2182 void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
2183                              const uint8* luma, uint32 lumacoeff) {
2184   uint32 bc = lumacoeff & 0xff;
2185   uint32 gc = (lumacoeff >> 8) & 0xff;
2186   uint32 rc = (lumacoeff >> 16) & 0xff;
2187
2188   int i;
2189   for (i = 0; i < width - 1; i += 2) {
2190     // Luminance in rows, color values in columns.
2191     const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc +
2192                            src_argb[2] * rc) & 0x7F00u) + luma;
2193     const uint8* luma1;
2194     dst_argb[0] = luma0[src_argb[0]];
2195     dst_argb[1] = luma0[src_argb[1]];
2196     dst_argb[2] = luma0[src_argb[2]];
2197     dst_argb[3] = src_argb[3];
2198     luma1 = ((src_argb[4] * bc + src_argb[5] * gc +
2199               src_argb[6] * rc) & 0x7F00u) + luma;
2200     dst_argb[4] = luma1[src_argb[4]];
2201     dst_argb[5] = luma1[src_argb[5]];
2202     dst_argb[6] = luma1[src_argb[6]];
2203     dst_argb[7] = src_argb[7];
2204     src_argb += 8;
2205     dst_argb += 8;
2206   }
2207   if (width & 1) {
2208     // Luminance in rows, color values in columns.
2209     const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc +
2210                            src_argb[2] * rc) & 0x7F00u) + luma;
2211     dst_argb[0] = luma0[src_argb[0]];
2212     dst_argb[1] = luma0[src_argb[1]];
2213     dst_argb[2] = luma0[src_argb[2]];
2214     dst_argb[3] = src_argb[3];
2215   }
2216 }
2217
2218 void ARGBCopyAlphaRow_C(const uint8* src, uint8* dst, int width) {
2219   int i;
2220   for (i = 0; i < width - 1; i += 2) {
2221     dst[3] = src[3];
2222     dst[7] = src[7];
2223     dst += 8;
2224     src += 8;
2225   }
2226   if (width & 1) {
2227     dst[3] = src[3];
2228   }
2229 }
2230
2231 void ARGBCopyYToAlphaRow_C(const uint8* src, uint8* dst, int width) {
2232   int i;
2233   for (i = 0; i < width - 1; i += 2) {
2234     dst[3] = src[0];
2235     dst[7] = src[1];
2236     dst += 8;
2237     src += 2;
2238   }
2239   if (width & 1) {
2240     dst[3] = src[0];
2241   }
2242 }
2243
2244 #ifdef __cplusplus
2245 }  // extern "C"
2246 }  // namespace libyuv
2247 #endif