src/third_party/libyuv/source/row_common.cc

   1 /*
   2  *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
   3  *
   4  *  Use of this source code is governed by a BSD-style license
   5  *  that can be found in the LICENSE file in the root of the source
   6  *  tree. An additional intellectual property rights grant can be found
   7  *  in the file PATENTS. All contributing project authors may
   8  *  be found in the AUTHORS file in the root of the source tree.
   9  */
  10
  11 #include "libyuv/row.h"
  12
  13 #include <string.h>  // For memcpy and memset.
  14
  15 #include "libyuv/basic_types.h"
  16
  17 #ifdef __cplusplus
  18 namespace libyuv {
  19 extern "C" {
  20 #endif
  21
  22 // llvm x86 is poor at ternary operator, so use branchless min/max.
  23
  24 #define USE_BRANCHLESS 1
  25 #if USE_BRANCHLESS
  26 static __inline int32 clamp0(int32 v) {
  27   return ((-(v) >> 31) & (v));
  28 }
  29
  30 static __inline int32 clamp255(int32 v) {
  31   return (((255 - (v)) >> 31) | (v)) & 255;
  32 }
  33
  34 static __inline uint32 Clamp(int32 val) {
  35   int v = clamp0(val);
  36   return (uint32)(clamp255(v));
  37 }
  38
  39 static __inline uint32 Abs(int32 v) {
  40   int m = v >> 31;
  41   return (v + m) ^ m;
  42 }
  43 #else  // USE_BRANCHLESS
  44 static __inline int32 clamp0(int32 v) {
  45   return (v < 0) ? 0 : v;
  46 }
  47
  48 static __inline int32 clamp255(int32 v) {
  49   return (v > 255) ? 255 : v;
  50 }
  51
  52 static __inline uint32 Clamp(int32 val) {
  53   int v = clamp0(val);
  54   return (uint32)(clamp255(v));
  55 }
  56
  57 static __inline uint32 Abs(int32 v) {
  58   return (v < 0) ? -v : v;
  59 }
  60 #endif  // USE_BRANCHLESS
  61
  62 #ifdef LIBYUV_LITTLE_ENDIAN
  63 #define WRITEWORD(p, v) *(uint32*)(p) = v
  64 #else
  65 static inline void WRITEWORD(uint8* p, uint32 v) {
  66   p[0] = (uint8)(v & 255);
  67   p[1] = (uint8)((v >> 8) & 255);
  68   p[2] = (uint8)((v >> 16) & 255);
  69   p[3] = (uint8)((v >> 24) & 255);
  70 }
  71 #endif
  72
  73 void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width) {
  74   int x;
  75   for (x = 0; x < width; ++x) {
  76     uint8 b = src_rgb24[0];
  77     uint8 g = src_rgb24[1];
  78     uint8 r = src_rgb24[2];
  79     dst_argb[0] = b;
  80     dst_argb[1] = g;
  81     dst_argb[2] = r;
  82     dst_argb[3] = 255u;
  83     dst_argb += 4;
  84     src_rgb24 += 3;
  85   }
  86 }
  87
  88 void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width) {
  89   int x;
  90   for (x = 0; x < width; ++x) {
  91     uint8 r = src_raw[0];
  92     uint8 g = src_raw[1];
  93     uint8 b = src_raw[2];
  94     dst_argb[0] = b;
  95     dst_argb[1] = g;
  96     dst_argb[2] = r;
  97     dst_argb[3] = 255u;
  98     dst_argb += 4;
  99     src_raw += 3;
 100   }
 101 }
 102
 103 void RGB565ToARGBRow_C(const uint8* src_rgb565, uint8* dst_argb, int width) {
 104   int x;
 105   for (x = 0; x < width; ++x) {
 106     uint8 b = src_rgb565[0] & 0x1f;
 107     uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
 108     uint8 r = src_rgb565[1] >> 3;
 109     dst_argb[0] = (b << 3) | (b >> 2);
 110     dst_argb[1] = (g << 2) | (g >> 4);
 111     dst_argb[2] = (r << 3) | (r >> 2);
 112     dst_argb[3] = 255u;
 113     dst_argb += 4;
 114     src_rgb565 += 2;
 115   }
 116 }
 117
 118 void ARGB1555ToARGBRow_C(const uint8* src_argb1555, uint8* dst_argb,
 119                          int width) {
 120   int x;
 121   for (x = 0; x < width; ++x) {
 122     uint8 b = src_argb1555[0] & 0x1f;
 123     uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
 124     uint8 r = (src_argb1555[1] & 0x7c) >> 2;
 125     uint8 a = src_argb1555[1] >> 7;
 126     dst_argb[0] = (b << 3) | (b >> 2);
 127     dst_argb[1] = (g << 3) | (g >> 2);
 128     dst_argb[2] = (r << 3) | (r >> 2);
 129     dst_argb[3] = -a;
 130     dst_argb += 4;
 131     src_argb1555 += 2;
 132   }
 133 }
 134
 135 void ARGB4444ToARGBRow_C(const uint8* src_argb4444, uint8* dst_argb,
 136                          int width) {
 137   int x;
 138   for (x = 0; x < width; ++x) {
 139     uint8 b = src_argb4444[0] & 0x0f;
 140     uint8 g = src_argb4444[0] >> 4;
 141     uint8 r = src_argb4444[1] & 0x0f;
 142     uint8 a = src_argb4444[1] >> 4;
 143     dst_argb[0] = (b << 4) | b;
 144     dst_argb[1] = (g << 4) | g;
 145     dst_argb[2] = (r << 4) | r;
 146     dst_argb[3] = (a << 4) | a;
 147     dst_argb += 4;
 148     src_argb4444 += 2;
 149   }
 150 }
 151
 152 void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
 153   int x;
 154   for (x = 0; x < width; ++x) {
 155     uint8 b = src_argb[0];
 156     uint8 g = src_argb[1];
 157     uint8 r = src_argb[2];
 158     dst_rgb[0] = b;
 159     dst_rgb[1] = g;
 160     dst_rgb[2] = r;
 161     dst_rgb += 3;
 162     src_argb += 4;
 163   }
 164 }
 165
 166 void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width) {
 167   int x;
 168   for (x = 0; x < width; ++x) {
 169     uint8 b = src_argb[0];
 170     uint8 g = src_argb[1];
 171     uint8 r = src_argb[2];
 172     dst_rgb[0] = r;
 173     dst_rgb[1] = g;
 174     dst_rgb[2] = b;
 175     dst_rgb += 3;
 176     src_argb += 4;
 177   }
 178 }
 179
 180 void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
 181   int x;
 182   for (x = 0; x < width - 1; x += 2) {
 183     uint8 b0 = src_argb[0] >> 3;
 184     uint8 g0 = src_argb[1] >> 2;
 185     uint8 r0 = src_argb[2] >> 3;
 186     uint8 b1 = src_argb[4] >> 3;
 187     uint8 g1 = src_argb[5] >> 2;
 188     uint8 r1 = src_argb[6] >> 3;
 189     WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) |
 190               (b1 << 16) | (g1 << 21) | (r1 << 27));
 191     dst_rgb += 4;
 192     src_argb += 8;
 193   }
 194   if (width & 1) {
 195     uint8 b0 = src_argb[0] >> 3;
 196     uint8 g0 = src_argb[1] >> 2;
 197     uint8 r0 = src_argb[2] >> 3;
 198     *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
 199   }
 200 }
 201
 202 void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
 203   int x;
 204   for (x = 0; x < width - 1; x += 2) {
 205     uint8 b0 = src_argb[0] >> 3;
 206     uint8 g0 = src_argb[1] >> 3;
 207     uint8 r0 = src_argb[2] >> 3;
 208     uint8 a0 = src_argb[3] >> 7;
 209     uint8 b1 = src_argb[4] >> 3;
 210     uint8 g1 = src_argb[5] >> 3;
 211     uint8 r1 = src_argb[6] >> 3;
 212     uint8 a1 = src_argb[7] >> 7;
 213     *(uint32*)(dst_rgb) =
 214         b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) |
 215         (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31);
 216     dst_rgb += 4;
 217     src_argb += 8;
 218   }
 219   if (width & 1) {
 220     uint8 b0 = src_argb[0] >> 3;
 221     uint8 g0 = src_argb[1] >> 3;
 222     uint8 r0 = src_argb[2] >> 3;
 223     uint8 a0 = src_argb[3] >> 7;
 224     *(uint16*)(dst_rgb) =
 225         b0 | (g0 << 5) | (r0 << 10) | (a0 << 15);
 226   }
 227 }
 228
 229 void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
 230   int x;
 231   for (x = 0; x < width - 1; x += 2) {
 232     uint8 b0 = src_argb[0] >> 4;
 233     uint8 g0 = src_argb[1] >> 4;
 234     uint8 r0 = src_argb[2] >> 4;
 235     uint8 a0 = src_argb[3] >> 4;
 236     uint8 b1 = src_argb[4] >> 4;
 237     uint8 g1 = src_argb[5] >> 4;
 238     uint8 r1 = src_argb[6] >> 4;
 239     uint8 a1 = src_argb[7] >> 4;
 240     *(uint32*)(dst_rgb) =
 241         b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) |
 242         (b1 << 16) | (g1 << 20) | (r1 << 24) | (a1 << 28);
 243     dst_rgb += 4;
 244     src_argb += 8;
 245   }
 246   if (width & 1) {
 247     uint8 b0 = src_argb[0] >> 4;
 248     uint8 g0 = src_argb[1] >> 4;
 249     uint8 r0 = src_argb[2] >> 4;
 250     uint8 a0 = src_argb[3] >> 4;
 251     *(uint16*)(dst_rgb) =
 252         b0 | (g0 << 4) | (r0 << 8) | (a0 << 12);
 253   }
 254 }
 255
 256 static __inline int RGBToY(uint8 r, uint8 g, uint8 b) {
 257   return (66 * r + 129 * g +  25 * b + 0x1080) >> 8;
 258 }
 259
 260 static __inline int RGBToU(uint8 r, uint8 g, uint8 b) {
 261   return (112 * b - 74 * g - 38 * r + 0x8080) >> 8;
 262 }
 263 static __inline int RGBToV(uint8 r, uint8 g, uint8 b) {
 264   return (112 * r - 94 * g - 18 * b + 0x8080) >> 8;
 265 }
 266
 267 #define MAKEROWY(NAME, R, G, B, BPP) \
 268 void NAME ## ToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) {       \
 269   int x;                                                                       \
 270   for (x = 0; x < width; ++x) {                                                \
 271     dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]);               \
 272     src_argb0 += BPP;                                                          \
 273     dst_y += 1;                                                                \
 274   }                                                                            \
 275 }                                                                              \
 276 void NAME ## ToUVRow_C(const uint8* src_rgb0, int src_stride_rgb,              \
 277                        uint8* dst_u, uint8* dst_v, int width) {                \
 278   const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;                           \
 279   int x;                                                                       \
 280   for (x = 0; x < width - 1; x += 2) {                                         \
 281     uint8 ab = (src_rgb0[B] + src_rgb0[B + BPP] +                              \
 282                src_rgb1[B] + src_rgb1[B + BPP]) >> 2;                          \
 283     uint8 ag = (src_rgb0[G] + src_rgb0[G + BPP] +                              \
 284                src_rgb1[G] + src_rgb1[G + BPP]) >> 2;                          \
 285     uint8 ar = (src_rgb0[R] + src_rgb0[R + BPP] +                              \
 286                src_rgb1[R] + src_rgb1[R + BPP]) >> 2;                          \
 287     dst_u[0] = RGBToU(ar, ag, ab);                                             \
 288     dst_v[0] = RGBToV(ar, ag, ab);                                             \
 289     src_rgb0 += BPP * 2;                                                       \
 290     src_rgb1 += BPP * 2;                                                       \
 291     dst_u += 1;                                                                \
 292     dst_v += 1;                                                                \
 293   }                                                                            \
 294   if (width & 1) {                                                             \
 295     uint8 ab = (src_rgb0[B] + src_rgb1[B]) >> 1;                               \
 296     uint8 ag = (src_rgb0[G] + src_rgb1[G]) >> 1;                               \
 297     uint8 ar = (src_rgb0[R] + src_rgb1[R]) >> 1;                               \
 298     dst_u[0] = RGBToU(ar, ag, ab);                                             \
 299     dst_v[0] = RGBToV(ar, ag, ab);                                             \
 300   }                                                                            \
 301 }
 302
 303 MAKEROWY(ARGB, 2, 1, 0, 4)
 304 MAKEROWY(BGRA, 1, 2, 3, 4)
 305 MAKEROWY(ABGR, 0, 1, 2, 4)
 306 MAKEROWY(RGBA, 3, 2, 1, 4)
 307 MAKEROWY(RGB24, 2, 1, 0, 3)
 308 MAKEROWY(RAW, 0, 1, 2, 3)
 309 #undef MAKEROWY
 310
 311 // JPeg uses a variation on BT.601-1 full range
 312 // y =  0.29900 * r + 0.58700 * g + 0.11400 * b
 313 // u = -0.16874 * r - 0.33126 * g + 0.50000 * b  + center
 314 // v =  0.50000 * r - 0.41869 * g - 0.08131 * b  + center
 315 // BT.601 Mpeg range uses:
 316 // b 0.1016 * 255 = 25.908 = 25
 317 // g 0.5078 * 255 = 129.489 = 129
 318 // r 0.2578 * 255 = 65.739 = 66
 319 // JPeg 8 bit Y (not used):
 320 // b 0.11400 * 256 = 29.184 = 29
 321 // g 0.58700 * 256 = 150.272 = 150
 322 // r 0.29900 * 256 = 76.544 = 77
 323 // JPeg 7 bit Y:
 324 // b 0.11400 * 128 = 14.592 = 15
 325 // g 0.58700 * 128 = 75.136 = 75
 326 // r 0.29900 * 128 = 38.272 = 38
 327 // JPeg 8 bit U:
 328 // b  0.50000 * 255 = 127.5 = 127
 329 // g -0.33126 * 255 = -84.4713 = -84
 330 // r -0.16874 * 255 = -43.0287 = -43
 331 // JPeg 8 bit V:
 332 // b -0.08131 * 255 = -20.73405 = -20
 333 // g -0.41869 * 255 = -106.76595 = -107
 334 // r  0.50000 * 255 = 127.5 = 127
 335
 336 static __inline int RGBToYJ(uint8 r, uint8 g, uint8 b) {
 337   return (38 * r + 75 * g +  15 * b + 64) >> 7;
 338 }
 339
 340 static __inline int RGBToUJ(uint8 r, uint8 g, uint8 b) {
 341   return (127 * b - 84 * g - 43 * r + 0x8080) >> 8;
 342 }
 343 static __inline int RGBToVJ(uint8 r, uint8 g, uint8 b) {
 344   return (127 * r - 107 * g - 20 * b + 0x8080) >> 8;
 345 }
 346
 347 #define AVGB(a, b) (((a) + (b) + 1) >> 1)
 348
 349 #define MAKEROWYJ(NAME, R, G, B, BPP) \
 350 void NAME ## ToYJRow_C(const uint8* src_argb0, uint8* dst_y, int width) {      \
 351   int x;                                                                       \
 352   for (x = 0; x < width; ++x) {                                                \
 353     dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]);              \
 354     src_argb0 += BPP;                                                          \
 355     dst_y += 1;                                                                \
 356   }                                                                            \
 357 }                                                                              \
 358 void NAME ## ToUVJRow_C(const uint8* src_rgb0, int src_stride_rgb,             \
 359                         uint8* dst_u, uint8* dst_v, int width) {               \
 360   const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;                           \
 361   int x;                                                                       \
 362   for (x = 0; x < width - 1; x += 2) {                                         \
 363     uint8 ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]),                            \
 364                     AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP]));               \
 365     uint8 ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]),                            \
 366                     AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP]));               \
 367     uint8 ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]),                            \
 368                     AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP]));               \
 369     dst_u[0] = RGBToUJ(ar, ag, ab);                                            \
 370     dst_v[0] = RGBToVJ(ar, ag, ab);                                            \
 371     src_rgb0 += BPP * 2;                                                       \
 372     src_rgb1 += BPP * 2;                                                       \
 373     dst_u += 1;                                                                \
 374     dst_v += 1;                                                                \
 375   }                                                                            \
 376   if (width & 1) {                                                             \
 377     uint8 ab = AVGB(src_rgb0[B], src_rgb1[B]);                                 \
 378     uint8 ag = AVGB(src_rgb0[G], src_rgb1[G]);                                 \
 379     uint8 ar = AVGB(src_rgb0[R], src_rgb1[R]);                                 \
 380     dst_u[0] = RGBToUJ(ar, ag, ab);                                            \
 381     dst_v[0] = RGBToVJ(ar, ag, ab);                                            \
 382   }                                                                            \
 383 }
 384
 385 MAKEROWYJ(ARGB, 2, 1, 0, 4)
 386 #undef MAKEROWYJ
 387
 388 void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width) {
 389   int x;
 390   for (x = 0; x < width; ++x) {
 391     uint8 b = src_rgb565[0] & 0x1f;
 392     uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
 393     uint8 r = src_rgb565[1] >> 3;
 394     b = (b << 3) | (b >> 2);
 395     g = (g << 2) | (g >> 4);
 396     r = (r << 3) | (r >> 2);
 397     dst_y[0] = RGBToY(r, g, b);
 398     src_rgb565 += 2;
 399     dst_y += 1;
 400   }
 401 }
 402
 403 void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int width) {
 404   int x;
 405   for (x = 0; x < width; ++x) {
 406     uint8 b = src_argb1555[0] & 0x1f;
 407     uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
 408     uint8 r = (src_argb1555[1] & 0x7c) >> 2;
 409     b = (b << 3) | (b >> 2);
 410     g = (g << 3) | (g >> 2);
 411     r = (r << 3) | (r >> 2);
 412     dst_y[0] = RGBToY(r, g, b);
 413     src_argb1555 += 2;
 414     dst_y += 1;
 415   }
 416 }
 417
 418 void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int width) {
 419   int x;
 420   for (x = 0; x < width; ++x) {
 421     uint8 b = src_argb4444[0] & 0x0f;
 422     uint8 g = src_argb4444[0] >> 4;
 423     uint8 r = src_argb4444[1] & 0x0f;
 424     b = (b << 4) | b;
 425     g = (g << 4) | g;
 426     r = (r << 4) | r;
 427     dst_y[0] = RGBToY(r, g, b);
 428     src_argb4444 += 2;
 429     dst_y += 1;
 430   }
 431 }
 432
 433 void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565,
 434                      uint8* dst_u, uint8* dst_v, int width) {
 435   const uint8* next_rgb565 = src_rgb565 + src_stride_rgb565;
 436   int x;
 437   for (x = 0; x < width - 1; x += 2) {
 438     uint8 b0 = src_rgb565[0] & 0x1f;
 439     uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
 440     uint8 r0 = src_rgb565[1] >> 3;
 441     uint8 b1 = src_rgb565[2] & 0x1f;
 442     uint8 g1 = (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3);
 443     uint8 r1 = src_rgb565[3] >> 3;
 444     uint8 b2 = next_rgb565[0] & 0x1f;
 445     uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
 446     uint8 r2 = next_rgb565[1] >> 3;
 447     uint8 b3 = next_rgb565[2] & 0x1f;
 448     uint8 g3 = (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3);
 449     uint8 r3 = next_rgb565[3] >> 3;
 450     uint8 b = (b0 + b1 + b2 + b3);  // 565 * 4 = 787.
 451     uint8 g = (g0 + g1 + g2 + g3);
 452     uint8 r = (r0 + r1 + r2 + r3);
 453     b = (b << 1) | (b >> 6);  // 787 -> 888.
 454     r = (r << 1) | (r >> 6);
 455     dst_u[0] = RGBToU(r, g, b);
 456     dst_v[0] = RGBToV(r, g, b);
 457     src_rgb565 += 4;
 458     next_rgb565 += 4;
 459     dst_u += 1;
 460     dst_v += 1;
 461   }
 462   if (width & 1) {
 463     uint8 b0 = src_rgb565[0] & 0x1f;
 464     uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
 465     uint8 r0 = src_rgb565[1] >> 3;
 466     uint8 b2 = next_rgb565[0] & 0x1f;
 467     uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
 468     uint8 r2 = next_rgb565[1] >> 3;
 469     uint8 b = (b0 + b2);  // 565 * 2 = 676.
 470     uint8 g = (g0 + g2);
 471     uint8 r = (r0 + r2);
 472     b = (b << 2) | (b >> 4);  // 676 -> 888
 473     g = (g << 1) | (g >> 6);
 474     r = (r << 2) | (r >> 4);
 475     dst_u[0] = RGBToU(r, g, b);
 476     dst_v[0] = RGBToV(r, g, b);
 477   }
 478 }
 479
 480 void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555,
 481                        uint8* dst_u, uint8* dst_v, int width) {
 482   const uint8* next_argb1555 = src_argb1555 + src_stride_argb1555;
 483   int x;
 484   for (x = 0; x < width - 1; x += 2) {
 485     uint8 b0 = src_argb1555[0] & 0x1f;
 486     uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
 487     uint8 r0 = (src_argb1555[1] & 0x7c) >> 2;
 488     uint8 b1 = src_argb1555[2] & 0x1f;
 489     uint8 g1 = (src_argb1555[2] >> 5) | ((src_argb1555[3] & 0x03) << 3);
 490     uint8 r1 = (src_argb1555[3] & 0x7c) >> 2;
 491     uint8 b2 = next_argb1555[0] & 0x1f;
 492     uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
 493     uint8 r2 = (next_argb1555[1] & 0x7c) >> 2;
 494     uint8 b3 = next_argb1555[2] & 0x1f;
 495     uint8 g3 = (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3);
 496     uint8 r3 = (next_argb1555[3] & 0x7c) >> 2;
 497     uint8 b = (b0 + b1 + b2 + b3);  // 555 * 4 = 777.
 498     uint8 g = (g0 + g1 + g2 + g3);
 499     uint8 r = (r0 + r1 + r2 + r3);
 500     b = (b << 1) | (b >> 6);  // 777 -> 888.
 501     g = (g << 1) | (g >> 6);
 502     r = (r << 1) | (r >> 6);
 503     dst_u[0] = RGBToU(r, g, b);
 504     dst_v[0] = RGBToV(r, g, b);
 505     src_argb1555 += 4;
 506     next_argb1555 += 4;
 507     dst_u += 1;
 508     dst_v += 1;
 509   }
 510   if (width & 1) {
 511     uint8 b0 = src_argb1555[0] & 0x1f;
 512     uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
 513     uint8 r0 = (src_argb1555[1] & 0x7c) >> 2;
 514     uint8 b2 = next_argb1555[0] & 0x1f;
 515     uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
 516     uint8 r2 = next_argb1555[1] >> 3;
 517     uint8 b = (b0 + b2);  // 555 * 2 = 666.
 518     uint8 g = (g0 + g2);
 519     uint8 r = (r0 + r2);
 520     b = (b << 2) | (b >> 4);  // 666 -> 888.
 521     g = (g << 2) | (g >> 4);
 522     r = (r << 2) | (r >> 4);
 523     dst_u[0] = RGBToU(r, g, b);
 524     dst_v[0] = RGBToV(r, g, b);
 525   }
 526 }
 527
 528 void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444,
 529                        uint8* dst_u, uint8* dst_v, int width) {
 530   const uint8* next_argb4444 = src_argb4444 + src_stride_argb4444;
 531   int x;
 532   for (x = 0; x < width - 1; x += 2) {
 533     uint8 b0 = src_argb4444[0] & 0x0f;
 534     uint8 g0 = src_argb4444[0] >> 4;
 535     uint8 r0 = src_argb4444[1] & 0x0f;
 536     uint8 b1 = src_argb4444[2] & 0x0f;
 537     uint8 g1 = src_argb4444[2] >> 4;
 538     uint8 r1 = src_argb4444[3] & 0x0f;
 539     uint8 b2 = next_argb4444[0] & 0x0f;
 540     uint8 g2 = next_argb4444[0] >> 4;
 541     uint8 r2 = next_argb4444[1] & 0x0f;
 542     uint8 b3 = next_argb4444[2] & 0x0f;
 543     uint8 g3 = next_argb4444[2] >> 4;
 544     uint8 r3 = next_argb4444[3] & 0x0f;
 545     uint8 b = (b0 + b1 + b2 + b3);  // 444 * 4 = 666.
 546     uint8 g = (g0 + g1 + g2 + g3);
 547     uint8 r = (r0 + r1 + r2 + r3);
 548     b = (b << 2) | (b >> 4);  // 666 -> 888.
 549     g = (g << 2) | (g >> 4);
 550     r = (r << 2) | (r >> 4);
 551     dst_u[0] = RGBToU(r, g, b);
 552     dst_v[0] = RGBToV(r, g, b);
 553     src_argb4444 += 4;
 554     next_argb4444 += 4;
 555     dst_u += 1;
 556     dst_v += 1;
 557   }
 558   if (width & 1) {
 559     uint8 b0 = src_argb4444[0] & 0x0f;
 560     uint8 g0 = src_argb4444[0] >> 4;
 561     uint8 r0 = src_argb4444[1] & 0x0f;
 562     uint8 b2 = next_argb4444[0] & 0x0f;
 563     uint8 g2 = next_argb4444[0] >> 4;
 564     uint8 r2 = next_argb4444[1] & 0x0f;
 565     uint8 b = (b0 + b2);  // 444 * 2 = 555.
 566     uint8 g = (g0 + g2);
 567     uint8 r = (r0 + r2);
 568     b = (b << 3) | (b >> 2);  // 555 -> 888.
 569     g = (g << 3) | (g >> 2);
 570     r = (r << 3) | (r >> 2);
 571     dst_u[0] = RGBToU(r, g, b);
 572     dst_v[0] = RGBToV(r, g, b);
 573   }
 574 }
 575
 576 void ARGBToUV444Row_C(const uint8* src_argb,
 577                       uint8* dst_u, uint8* dst_v, int width) {
 578   int x;
 579   for (x = 0; x < width; ++x) {
 580     uint8 ab = src_argb[0];
 581     uint8 ag = src_argb[1];
 582     uint8 ar = src_argb[2];
 583     dst_u[0] = RGBToU(ar, ag, ab);
 584     dst_v[0] = RGBToV(ar, ag, ab);
 585     src_argb += 4;
 586     dst_u += 1;
 587     dst_v += 1;
 588   }
 589 }
 590
 591 void ARGBToUV422Row_C(const uint8* src_argb,
 592                       uint8* dst_u, uint8* dst_v, int width) {
 593   int x;
 594   for (x = 0; x < width - 1; x += 2) {
 595     uint8 ab = (src_argb[0] + src_argb[4]) >> 1;
 596     uint8 ag = (src_argb[1] + src_argb[5]) >> 1;
 597     uint8 ar = (src_argb[2] + src_argb[6]) >> 1;
 598     dst_u[0] = RGBToU(ar, ag, ab);
 599     dst_v[0] = RGBToV(ar, ag, ab);
 600     src_argb += 8;
 601     dst_u += 1;
 602     dst_v += 1;
 603   }
 604   if (width & 1) {
 605     uint8 ab = src_argb[0];
 606     uint8 ag = src_argb[1];
 607     uint8 ar = src_argb[2];
 608     dst_u[0] = RGBToU(ar, ag, ab);
 609     dst_v[0] = RGBToV(ar, ag, ab);
 610   }
 611 }
 612
 613 void ARGBToUV411Row_C(const uint8* src_argb,
 614                       uint8* dst_u, uint8* dst_v, int width) {
 615   int x;
 616   for (x = 0; x < width - 3; x += 4) {
 617     uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8] + src_argb[12]) >> 2;
 618     uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9] + src_argb[13]) >> 2;
 619     uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10] + src_argb[14]) >> 2;
 620     dst_u[0] = RGBToU(ar, ag, ab);
 621     dst_v[0] = RGBToV(ar, ag, ab);
 622     src_argb += 16;
 623     dst_u += 1;
 624     dst_v += 1;
 625   }
 626   if ((width & 3) == 3) {
 627     uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8]) / 3;
 628     uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9]) / 3;
 629     uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10]) / 3;
 630     dst_u[0] = RGBToU(ar, ag, ab);
 631     dst_v[0] = RGBToV(ar, ag, ab);
 632   } else if ((width & 3) == 2) {
 633     uint8 ab = (src_argb[0] + src_argb[4]) >> 1;
 634     uint8 ag = (src_argb[1] + src_argb[5]) >> 1;
 635     uint8 ar = (src_argb[2] + src_argb[6]) >> 1;
 636     dst_u[0] = RGBToU(ar, ag, ab);
 637     dst_v[0] = RGBToV(ar, ag, ab);
 638   } else if ((width & 3) == 1) {
 639     uint8 ab = src_argb[0];
 640     uint8 ag = src_argb[1];
 641     uint8 ar = src_argb[2];
 642     dst_u[0] = RGBToU(ar, ag, ab);
 643     dst_v[0] = RGBToV(ar, ag, ab);
 644   }
 645 }
 646
 647 void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
 648   int x;
 649   for (x = 0; x < width; ++x) {
 650     uint8 y = RGBToYJ(src_argb[2], src_argb[1], src_argb[0]);
 651     dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
 652     dst_argb[3] = src_argb[3];
 653     dst_argb += 4;
 654     src_argb += 4;
 655   }
 656 }
 657
 658 // Convert a row of image to Sepia tone.
 659 void ARGBSepiaRow_C(uint8* dst_argb, int width) {
 660   int x;
 661   for (x = 0; x < width; ++x) {
 662     int b = dst_argb[0];
 663     int g = dst_argb[1];
 664     int r = dst_argb[2];
 665     int sb = (b * 17 + g * 68 + r * 35) >> 7;
 666     int sg = (b * 22 + g * 88 + r * 45) >> 7;
 667     int sr = (b * 24 + g * 98 + r * 50) >> 7;
 668     // b does not over flow. a is preserved from original.
 669     dst_argb[0] = sb;
 670     dst_argb[1] = clamp255(sg);
 671     dst_argb[2] = clamp255(sr);
 672     dst_argb += 4;
 673   }
 674 }
 675
 676 // Apply color matrix to a row of image. Matrix is signed.
 677 // TODO(fbarchard): Consider adding rounding (+32).
 678 void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb,
 679                           const int8* matrix_argb, int width) {
 680   int x;
 681   for (x = 0; x < width; ++x) {
 682     int b = src_argb[0];
 683     int g = src_argb[1];
 684     int r = src_argb[2];
 685     int a = src_argb[3];
 686     int sb = (b * matrix_argb[0] + g * matrix_argb[1] +
 687               r * matrix_argb[2] + a * matrix_argb[3]) >> 6;
 688     int sg = (b * matrix_argb[4] + g * matrix_argb[5] +
 689               r * matrix_argb[6] + a * matrix_argb[7]) >> 6;
 690     int sr = (b * matrix_argb[8] + g * matrix_argb[9] +
 691               r * matrix_argb[10] + a * matrix_argb[11]) >> 6;
 692     int sa = (b * matrix_argb[12] + g * matrix_argb[13] +
 693               r * matrix_argb[14] + a * matrix_argb[15]) >> 6;
 694     dst_argb[0] = Clamp(sb);
 695     dst_argb[1] = Clamp(sg);
 696     dst_argb[2] = Clamp(sr);
 697     dst_argb[3] = Clamp(sa);
 698     src_argb += 4;
 699     dst_argb += 4;
 700   }
 701 }
 702
 703 // Apply color table to a row of image.
 704 void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) {
 705   int x;
 706   for (x = 0; x < width; ++x) {
 707     int b = dst_argb[0];
 708     int g = dst_argb[1];
 709     int r = dst_argb[2];
 710     int a = dst_argb[3];
 711     dst_argb[0] = table_argb[b * 4 + 0];
 712     dst_argb[1] = table_argb[g * 4 + 1];
 713     dst_argb[2] = table_argb[r * 4 + 2];
 714     dst_argb[3] = table_argb[a * 4 + 3];
 715     dst_argb += 4;
 716   }
 717 }
 718
 719 // Apply color table to a row of image.
 720 void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) {
 721   int x;
 722   for (x = 0; x < width; ++x) {
 723     int b = dst_argb[0];
 724     int g = dst_argb[1];
 725     int r = dst_argb[2];
 726     dst_argb[0] = table_argb[b * 4 + 0];
 727     dst_argb[1] = table_argb[g * 4 + 1];
 728     dst_argb[2] = table_argb[r * 4 + 2];
 729     dst_argb += 4;
 730   }
 731 }
 732
 733 void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size,
 734                        int interval_offset, int width) {
 735   int x;
 736   for (x = 0; x < width; ++x) {
 737     int b = dst_argb[0];
 738     int g = dst_argb[1];
 739     int r = dst_argb[2];
 740     dst_argb[0] = (b * scale >> 16) * interval_size + interval_offset;
 741     dst_argb[1] = (g * scale >> 16) * interval_size + interval_offset;
 742     dst_argb[2] = (r * scale >> 16) * interval_size + interval_offset;
 743     dst_argb += 4;
 744   }
 745 }
 746
 747 #define REPEAT8(v) (v) | ((v) << 8)
 748 #define SHADE(f, v) v * f >> 24
 749
 750 void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
 751                     uint32 value) {
 752   const uint32 b_scale = REPEAT8(value & 0xff);
 753   const uint32 g_scale = REPEAT8((value >> 8) & 0xff);
 754   const uint32 r_scale = REPEAT8((value >> 16) & 0xff);
 755   const uint32 a_scale = REPEAT8(value >> 24);
 756
 757   int i;
 758   for (i = 0; i < width; ++i) {
 759     const uint32 b = REPEAT8(src_argb[0]);
 760     const uint32 g = REPEAT8(src_argb[1]);
 761     const uint32 r = REPEAT8(src_argb[2]);
 762     const uint32 a = REPEAT8(src_argb[3]);
 763     dst_argb[0] = SHADE(b, b_scale);
 764     dst_argb[1] = SHADE(g, g_scale);
 765     dst_argb[2] = SHADE(r, r_scale);
 766     dst_argb[3] = SHADE(a, a_scale);
 767     src_argb += 4;
 768     dst_argb += 4;
 769   }
 770 }
 771 #undef REPEAT8
 772 #undef SHADE
 773
 774 #define REPEAT8(v) (v) | ((v) << 8)
 775 #define SHADE(f, v) v * f >> 16
 776
 777 void ARGBMultiplyRow_C(const uint8* src_argb0, const uint8* src_argb1,
 778                        uint8* dst_argb, int width) {
 779   int i;
 780   for (i = 0; i < width; ++i) {
 781     const uint32 b = REPEAT8(src_argb0[0]);
 782     const uint32 g = REPEAT8(src_argb0[1]);
 783     const uint32 r = REPEAT8(src_argb0[2]);
 784     const uint32 a = REPEAT8(src_argb0[3]);
 785     const uint32 b_scale = src_argb1[0];
 786     const uint32 g_scale = src_argb1[1];
 787     const uint32 r_scale = src_argb1[2];
 788     const uint32 a_scale = src_argb1[3];
 789     dst_argb[0] = SHADE(b, b_scale);
 790     dst_argb[1] = SHADE(g, g_scale);
 791     dst_argb[2] = SHADE(r, r_scale);
 792     dst_argb[3] = SHADE(a, a_scale);
 793     src_argb0 += 4;
 794     src_argb1 += 4;
 795     dst_argb += 4;
 796   }
 797 }
 798 #undef REPEAT8
 799 #undef SHADE
 800
 801 #define SHADE(f, v) clamp255(v + f)
 802
 803 void ARGBAddRow_C(const uint8* src_argb0, const uint8* src_argb1,
 804                   uint8* dst_argb, int width) {
 805   int i;
 806   for (i = 0; i < width; ++i) {
 807     const int b = src_argb0[0];
 808     const int g = src_argb0[1];
 809     const int r = src_argb0[2];
 810     const int a = src_argb0[3];
 811     const int b_add = src_argb1[0];
 812     const int g_add = src_argb1[1];
 813     const int r_add = src_argb1[2];
 814     const int a_add = src_argb1[3];
 815     dst_argb[0] = SHADE(b, b_add);
 816     dst_argb[1] = SHADE(g, g_add);
 817     dst_argb[2] = SHADE(r, r_add);
 818     dst_argb[3] = SHADE(a, a_add);
 819     src_argb0 += 4;
 820     src_argb1 += 4;
 821     dst_argb += 4;
 822   }
 823 }
 824 #undef SHADE
 825
 826 #define SHADE(f, v) clamp0(f - v)
 827
 828 void ARGBSubtractRow_C(const uint8* src_argb0, const uint8* src_argb1,
 829                        uint8* dst_argb, int width) {
 830   int i;
 831   for (i = 0; i < width; ++i) {
 832     const int b = src_argb0[0];
 833     const int g = src_argb0[1];
 834     const int r = src_argb0[2];
 835     const int a = src_argb0[3];
 836     const int b_sub = src_argb1[0];
 837     const int g_sub = src_argb1[1];
 838     const int r_sub = src_argb1[2];
 839     const int a_sub = src_argb1[3];
 840     dst_argb[0] = SHADE(b, b_sub);
 841     dst_argb[1] = SHADE(g, g_sub);
 842     dst_argb[2] = SHADE(r, r_sub);
 843     dst_argb[3] = SHADE(a, a_sub);
 844     src_argb0 += 4;
 845     src_argb1 += 4;
 846     dst_argb += 4;
 847   }
 848 }
 849 #undef SHADE
 850
 851 // Sobel functions which mimics SSSE3.
 852 void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2,
 853                  uint8* dst_sobelx, int width) {
 854   int i;
 855   for (i = 0; i < width; ++i) {
 856     int a = src_y0[i];
 857     int b = src_y1[i];
 858     int c = src_y2[i];
 859     int a_sub = src_y0[i + 2];
 860     int b_sub = src_y1[i + 2];
 861     int c_sub = src_y2[i + 2];
 862     int a_diff = a - a_sub;
 863     int b_diff = b - b_sub;
 864     int c_diff = c - c_sub;
 865     int sobel = Abs(a_diff + b_diff * 2 + c_diff);
 866     dst_sobelx[i] = (uint8)(clamp255(sobel));
 867   }
 868 }
 869
 870 void SobelYRow_C(const uint8* src_y0, const uint8* src_y1,
 871                  uint8* dst_sobely, int width) {
 872   int i;
 873   for (i = 0; i < width; ++i) {
 874     int a = src_y0[i + 0];
 875     int b = src_y0[i + 1];
 876     int c = src_y0[i + 2];
 877     int a_sub = src_y1[i + 0];
 878     int b_sub = src_y1[i + 1];
 879     int c_sub = src_y1[i + 2];
 880     int a_diff = a - a_sub;
 881     int b_diff = b - b_sub;
 882     int c_diff = c - c_sub;
 883     int sobel = Abs(a_diff + b_diff * 2 + c_diff);
 884     dst_sobely[i] = (uint8)(clamp255(sobel));
 885   }
 886 }
 887
 888 void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely,
 889                 uint8* dst_argb, int width) {
 890   int i;
 891   for (i = 0; i < width; ++i) {
 892     int r = src_sobelx[i];
 893     int b = src_sobely[i];
 894     int s = clamp255(r + b);
 895     dst_argb[0] = (uint8)(s);
 896     dst_argb[1] = (uint8)(s);
 897     dst_argb[2] = (uint8)(s);
 898     dst_argb[3] = (uint8)(255u);
 899     dst_argb += 4;
 900   }
 901 }
 902
 903 void SobelToPlaneRow_C(const uint8* src_sobelx, const uint8* src_sobely,
 904                        uint8* dst_y, int width) {
 905   int i;
 906   for (i = 0; i < width; ++i) {
 907     int r = src_sobelx[i];
 908     int b = src_sobely[i];
 909     int s = clamp255(r + b);
 910     dst_y[i] = (uint8)(s);
 911   }
 912 }
 913
 914 void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely,
 915                   uint8* dst_argb, int width) {
 916   int i;
 917   for (i = 0; i < width; ++i) {
 918     int r = src_sobelx[i];
 919     int b = src_sobely[i];
 920     int g = clamp255(r + b);
 921     dst_argb[0] = (uint8)(b);
 922     dst_argb[1] = (uint8)(g);
 923     dst_argb[2] = (uint8)(r);
 924     dst_argb[3] = (uint8)(255u);
 925     dst_argb += 4;
 926   }
 927 }
 928
 929 void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
 930   // Copy a Y to RGB.
 931   int x;
 932   for (x = 0; x < width; ++x) {
 933     uint8 y = src_y[0];
 934     dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
 935     dst_argb[3] = 255u;
 936     dst_argb += 4;
 937     ++src_y;
 938   }
 939 }
 940
 941 // C reference code that mimics the YUV assembly.
 942
 943 #define YG 74 /* (int8)(1.164 * 64 + 0.5) */
 944
 945 #define UB 127 /* min(63,(int8)(2.018 * 64)) */
 946 #define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
 947 #define UR 0
 948
 949 #define VB 0
 950 #define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
 951 #define VR 102 /* (int8)(1.596 * 64 + 0.5) */
 952
 953 // Bias
 954 #define BB UB * 128 + VB * 128
 955 #define BG UG * 128 + VG * 128
 956 #define BR UR * 128 + VR * 128
 957
 958 static __inline void YuvPixel(uint8 y, uint8 u, uint8 v,
 959                               uint8* b, uint8* g, uint8* r) {
 960   int32 y1 = ((int32)(y) - 16) * YG;
 961   *b = Clamp((int32)((u * UB + v * VB) - (BB) + y1) >> 6);
 962   *g = Clamp((int32)((u * UG + v * VG) - (BG) + y1) >> 6);
 963   *r = Clamp((int32)((u * UR + v * VR) - (BR) + y1) >> 6);
 964 }
 965
 966 #if !defined(LIBYUV_DISABLE_NEON) && \
 967     (defined(__ARM_NEON__) || defined(__aarch64__) || defined(LIBYUV_NEON))
 968 // C mimic assembly.
 969 // TODO(fbarchard): Remove subsampling from Neon.
 970 void I444ToARGBRow_C(const uint8* src_y,
 971                      const uint8* src_u,
 972                      const uint8* src_v,
 973                      uint8* rgb_buf,
 974                      int width) {
 975   int x;
 976   for (x = 0; x < width - 1; x += 2) {
 977     uint8 u = (src_u[0] + src_u[1] + 1) >> 1;
 978     uint8 v = (src_v[0] + src_v[1] + 1) >> 1;
 979     YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
 980     rgb_buf[3] = 255;
 981     YuvPixel(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
 982     rgb_buf[7] = 255;
 983     src_y += 2;
 984     src_u += 2;
 985     src_v += 2;
 986     rgb_buf += 8;  // Advance 2 pixels.
 987   }
 988   if (width & 1) {
 989     YuvPixel(src_y[0], src_u[0], src_v[0],
 990              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
 991   }
 992 }
 993 #else
 994 void I444ToARGBRow_C(const uint8* src_y,
 995                      const uint8* src_u,
 996                      const uint8* src_v,
 997                      uint8* rgb_buf,
 998                      int width) {
 999   int x;
1000   for (x = 0; x < width; ++x) {
1001     YuvPixel(src_y[0], src_u[0], src_v[0],
1002              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1003     rgb_buf[3] = 255;
1004     src_y += 1;
1005     src_u += 1;
1006     src_v += 1;
1007     rgb_buf += 4;  // Advance 1 pixel.
1008   }
1009 }
1010 #endif
1011 // Also used for 420
1012 void I422ToARGBRow_C(const uint8* src_y,
1013                      const uint8* src_u,
1014                      const uint8* src_v,
1015                      uint8* rgb_buf,
1016                      int width) {
1017   int x;
1018   for (x = 0; x < width - 1; x += 2) {
1019     YuvPixel(src_y[0], src_u[0], src_v[0],
1020              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1021     rgb_buf[3] = 255;
1022     YuvPixel(src_y[1], src_u[0], src_v[0],
1023              rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
1024     rgb_buf[7] = 255;
1025     src_y += 2;
1026     src_u += 1;
1027     src_v += 1;
1028     rgb_buf += 8;  // Advance 2 pixels.
1029   }
1030   if (width & 1) {
1031     YuvPixel(src_y[0], src_u[0], src_v[0],
1032              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1033     rgb_buf[3] = 255;
1034   }
1035 }
1036
1037 void I422ToRGB24Row_C(const uint8* src_y,
1038                       const uint8* src_u,
1039                       const uint8* src_v,
1040                       uint8* rgb_buf,
1041                       int width) {
1042   int x;
1043   for (x = 0; x < width - 1; x += 2) {
1044     YuvPixel(src_y[0], src_u[0], src_v[0],
1045              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1046     YuvPixel(src_y[1], src_u[0], src_v[0],
1047              rgb_buf + 3, rgb_buf + 4, rgb_buf + 5);
1048     src_y += 2;
1049     src_u += 1;
1050     src_v += 1;
1051     rgb_buf += 6;  // Advance 2 pixels.
1052   }
1053   if (width & 1) {
1054     YuvPixel(src_y[0], src_u[0], src_v[0],
1055              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1056   }
1057 }
1058
1059 void I422ToRAWRow_C(const uint8* src_y,
1060                     const uint8* src_u,
1061                     const uint8* src_v,
1062                     uint8* rgb_buf,
1063                     int width) {
1064   int x;
1065   for (x = 0; x < width - 1; x += 2) {
1066     YuvPixel(src_y[0], src_u[0], src_v[0],
1067              rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
1068     YuvPixel(src_y[1], src_u[0], src_v[0],
1069              rgb_buf + 5, rgb_buf + 4, rgb_buf + 3);
1070     src_y += 2;
1071     src_u += 1;
1072     src_v += 1;
1073     rgb_buf += 6;  // Advance 2 pixels.
1074   }
1075   if (width & 1) {
1076     YuvPixel(src_y[0], src_u[0], src_v[0],
1077              rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
1078   }
1079 }
1080
1081 void I422ToARGB4444Row_C(const uint8* src_y,
1082                          const uint8* src_u,
1083                          const uint8* src_v,
1084                          uint8* dst_argb4444,
1085                          int width) {
1086   uint8 b0;
1087   uint8 g0;
1088   uint8 r0;
1089   uint8 b1;
1090   uint8 g1;
1091   uint8 r1;
1092   int x;
1093   for (x = 0; x < width - 1; x += 2) {
1094     YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
1095     YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1);
1096     b0 = b0 >> 4;
1097     g0 = g0 >> 4;
1098     r0 = r0 >> 4;
1099     b1 = b1 >> 4;
1100     g1 = g1 >> 4;
1101     r1 = r1 >> 4;
1102     *(uint32*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) |
1103         (b1 << 16) | (g1 << 20) | (r1 << 24) | 0xf000f000;
1104     src_y += 2;
1105     src_u += 1;
1106     src_v += 1;
1107     dst_argb4444 += 4;  // Advance 2 pixels.
1108   }
1109   if (width & 1) {
1110     YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
1111     b0 = b0 >> 4;
1112     g0 = g0 >> 4;
1113     r0 = r0 >> 4;
1114     *(uint16*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) |
1115         0xf000;
1116   }
1117 }
1118
1119 void I422ToARGB1555Row_C(const uint8* src_y,
1120                          const uint8* src_u,
1121                          const uint8* src_v,
1122                          uint8* dst_argb1555,
1123                          int width) {
1124   uint8 b0;
1125   uint8 g0;
1126   uint8 r0;
1127   uint8 b1;
1128   uint8 g1;
1129   uint8 r1;
1130   int x;
1131   for (x = 0; x < width - 1; x += 2) {
1132     YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
1133     YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1);
1134     b0 = b0 >> 3;
1135     g0 = g0 >> 3;
1136     r0 = r0 >> 3;
1137     b1 = b1 >> 3;
1138     g1 = g1 >> 3;
1139     r1 = r1 >> 3;
1140     *(uint32*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) |
1141         (b1 << 16) | (g1 << 21) | (r1 << 26) | 0x80008000;
1142     src_y += 2;
1143     src_u += 1;
1144     src_v += 1;
1145     dst_argb1555 += 4;  // Advance 2 pixels.
1146   }
1147   if (width & 1) {
1148     YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
1149     b0 = b0 >> 3;
1150     g0 = g0 >> 3;
1151     r0 = r0 >> 3;
1152     *(uint16*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) |
1153         0x8000;
1154   }
1155 }
1156
1157 void I422ToRGB565Row_C(const uint8* src_y,
1158                        const uint8* src_u,
1159                        const uint8* src_v,
1160                        uint8* dst_rgb565,
1161                        int width) {
1162   uint8 b0;
1163   uint8 g0;
1164   uint8 r0;
1165   uint8 b1;
1166   uint8 g1;
1167   uint8 r1;
1168   int x;
1169   for (x = 0; x < width - 1; x += 2) {
1170     YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
1171     YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1);
1172     b0 = b0 >> 3;
1173     g0 = g0 >> 2;
1174     r0 = r0 >> 3;
1175     b1 = b1 >> 3;
1176     g1 = g1 >> 2;
1177     r1 = r1 >> 3;
1178     *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) |
1179         (b1 << 16) | (g1 << 21) | (r1 << 27);
1180     src_y += 2;
1181     src_u += 1;
1182     src_v += 1;
1183     dst_rgb565 += 4;  // Advance 2 pixels.
1184   }
1185   if (width & 1) {
1186     YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
1187     b0 = b0 >> 3;
1188     g0 = g0 >> 2;
1189     r0 = r0 >> 3;
1190     *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
1191   }
1192 }
1193
1194 void I411ToARGBRow_C(const uint8* src_y,
1195                      const uint8* src_u,
1196                      const uint8* src_v,
1197                      uint8* rgb_buf,
1198                      int width) {
1199   int x;
1200   for (x = 0; x < width - 3; x += 4) {
1201     YuvPixel(src_y[0], src_u[0], src_v[0],
1202              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1203     rgb_buf[3] = 255;
1204     YuvPixel(src_y[1], src_u[0], src_v[0],
1205              rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
1206     rgb_buf[7] = 255;
1207     YuvPixel(src_y[2], src_u[0], src_v[0],
1208              rgb_buf + 8, rgb_buf + 9, rgb_buf + 10);
1209     rgb_buf[11] = 255;
1210     YuvPixel(src_y[3], src_u[0], src_v[0],
1211              rgb_buf + 12, rgb_buf + 13, rgb_buf + 14);
1212     rgb_buf[15] = 255;
1213     src_y += 4;
1214     src_u += 1;
1215     src_v += 1;
1216     rgb_buf += 16;  // Advance 4 pixels.
1217   }
1218   if (width & 2) {
1219     YuvPixel(src_y[0], src_u[0], src_v[0],
1220              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1221     rgb_buf[3] = 255;
1222     YuvPixel(src_y[1], src_u[0], src_v[0],
1223              rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
1224     rgb_buf[7] = 255;
1225     src_y += 2;
1226     rgb_buf += 8;  // Advance 2 pixels.
1227   }
1228   if (width & 1) {
1229     YuvPixel(src_y[0], src_u[0], src_v[0],
1230              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1231     rgb_buf[3] = 255;
1232   }
1233 }
1234
1235 void NV12ToARGBRow_C(const uint8* src_y,
1236                      const uint8* usrc_v,
1237                      uint8* rgb_buf,
1238                      int width) {
1239   int x;
1240   for (x = 0; x < width - 1; x += 2) {
1241     YuvPixel(src_y[0], usrc_v[0], usrc_v[1],
1242              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1243     rgb_buf[3] = 255;
1244     YuvPixel(src_y[1], usrc_v[0], usrc_v[1],
1245              rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
1246     rgb_buf[7] = 255;
1247     src_y += 2;
1248     usrc_v += 2;
1249     rgb_buf += 8;  // Advance 2 pixels.
1250   }
1251   if (width & 1) {
1252     YuvPixel(src_y[0], usrc_v[0], usrc_v[1],
1253              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1254     rgb_buf[3] = 255;
1255   }
1256 }
1257
1258 void NV21ToARGBRow_C(const uint8* src_y,
1259                      const uint8* src_vu,
1260                      uint8* rgb_buf,
1261                      int width) {
1262   int x;
1263   for (x = 0; x < width - 1; x += 2) {
1264     YuvPixel(src_y[0], src_vu[1], src_vu[0],
1265              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1266     rgb_buf[3] = 255;
1267
1268     YuvPixel(src_y[1], src_vu[1], src_vu[0],
1269              rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
1270     rgb_buf[7] = 255;
1271
1272     src_y += 2;
1273     src_vu += 2;
1274     rgb_buf += 8;  // Advance 2 pixels.
1275   }
1276   if (width & 1) {
1277     YuvPixel(src_y[0], src_vu[1], src_vu[0],
1278              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1279     rgb_buf[3] = 255;
1280   }
1281 }
1282
1283 void NV12ToRGB565Row_C(const uint8* src_y,
1284                        const uint8* usrc_v,
1285                        uint8* dst_rgb565,
1286                        int width) {
1287   uint8 b0;
1288   uint8 g0;
1289   uint8 r0;
1290   uint8 b1;
1291   uint8 g1;
1292   uint8 r1;
1293   int x;
1294   for (x = 0; x < width - 1; x += 2) {
1295     YuvPixel(src_y[0], usrc_v[0], usrc_v[1], &b0, &g0, &r0);
1296     YuvPixel(src_y[1], usrc_v[0], usrc_v[1], &b1, &g1, &r1);
1297     b0 = b0 >> 3;
1298     g0 = g0 >> 2;
1299     r0 = r0 >> 3;
1300     b1 = b1 >> 3;
1301     g1 = g1 >> 2;
1302     r1 = r1 >> 3;
1303     *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) |
1304         (b1 << 16) | (g1 << 21) | (r1 << 27);
1305     src_y += 2;
1306     usrc_v += 2;
1307     dst_rgb565 += 4;  // Advance 2 pixels.
1308   }
1309   if (width & 1) {
1310     YuvPixel(src_y[0], usrc_v[0], usrc_v[1], &b0, &g0, &r0);
1311     b0 = b0 >> 3;
1312     g0 = g0 >> 2;
1313     r0 = r0 >> 3;
1314     *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
1315   }
1316 }
1317
1318 void NV21ToRGB565Row_C(const uint8* src_y,
1319                        const uint8* vsrc_u,
1320                        uint8* dst_rgb565,
1321                        int width) {
1322   uint8 b0;
1323   uint8 g0;
1324   uint8 r0;
1325   uint8 b1;
1326   uint8 g1;
1327   uint8 r1;
1328   int x;
1329   for (x = 0; x < width - 1; x += 2) {
1330     YuvPixel(src_y[0], vsrc_u[1], vsrc_u[0], &b0, &g0, &r0);
1331     YuvPixel(src_y[1], vsrc_u[1], vsrc_u[0], &b1, &g1, &r1);
1332     b0 = b0 >> 3;
1333     g0 = g0 >> 2;
1334     r0 = r0 >> 3;
1335     b1 = b1 >> 3;
1336     g1 = g1 >> 2;
1337     r1 = r1 >> 3;
1338     *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) |
1339         (b1 << 16) | (g1 << 21) | (r1 << 27);
1340     src_y += 2;
1341     vsrc_u += 2;
1342     dst_rgb565 += 4;  // Advance 2 pixels.
1343   }
1344   if (width & 1) {
1345     YuvPixel(src_y[0], vsrc_u[1], vsrc_u[0], &b0, &g0, &r0);
1346     b0 = b0 >> 3;
1347     g0 = g0 >> 2;
1348     r0 = r0 >> 3;
1349     *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
1350   }
1351 }
1352
1353 void YUY2ToARGBRow_C(const uint8* src_yuy2,
1354                      uint8* rgb_buf,
1355                      int width) {
1356   int x;
1357   for (x = 0; x < width - 1; x += 2) {
1358     YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3],
1359              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1360     rgb_buf[3] = 255;
1361     YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3],
1362              rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
1363     rgb_buf[7] = 255;
1364     src_yuy2 += 4;
1365     rgb_buf += 8;  // Advance 2 pixels.
1366   }
1367   if (width & 1) {
1368     YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3],
1369              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1370     rgb_buf[3] = 255;
1371   }
1372 }
1373
1374 void UYVYToARGBRow_C(const uint8* src_uyvy,
1375                      uint8* rgb_buf,
1376                      int width) {
1377   int x;
1378   for (x = 0; x < width - 1; x += 2) {
1379     YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2],
1380              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1381     rgb_buf[3] = 255;
1382     YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2],
1383              rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
1384     rgb_buf[7] = 255;
1385     src_uyvy += 4;
1386     rgb_buf += 8;  // Advance 2 pixels.
1387   }
1388   if (width & 1) {
1389     YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2],
1390              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1391     rgb_buf[3] = 255;
1392   }
1393 }
1394
1395 void I422ToBGRARow_C(const uint8* src_y,
1396                      const uint8* src_u,
1397                      const uint8* src_v,
1398                      uint8* rgb_buf,
1399                      int width) {
1400   int x;
1401   for (x = 0; x < width - 1; x += 2) {
1402     YuvPixel(src_y[0], src_u[0], src_v[0],
1403              rgb_buf + 3, rgb_buf + 2, rgb_buf + 1);
1404     rgb_buf[0] = 255;
1405     YuvPixel(src_y[1], src_u[0], src_v[0],
1406              rgb_buf + 7, rgb_buf + 6, rgb_buf + 5);
1407     rgb_buf[4] = 255;
1408     src_y += 2;
1409     src_u += 1;
1410     src_v += 1;
1411     rgb_buf += 8;  // Advance 2 pixels.
1412   }
1413   if (width & 1) {
1414     YuvPixel(src_y[0], src_u[0], src_v[0],
1415              rgb_buf + 3, rgb_buf + 2, rgb_buf + 1);
1416     rgb_buf[0] = 255;
1417   }
1418 }
1419
1420 void I422ToABGRRow_C(const uint8* src_y,
1421                      const uint8* src_u,
1422                      const uint8* src_v,
1423                      uint8* rgb_buf,
1424                      int width) {
1425   int x;
1426   for (x = 0; x < width - 1; x += 2) {
1427     YuvPixel(src_y[0], src_u[0], src_v[0],
1428              rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
1429     rgb_buf[3] = 255;
1430     YuvPixel(src_y[1], src_u[0], src_v[0],
1431              rgb_buf + 6, rgb_buf + 5, rgb_buf + 4);
1432     rgb_buf[7] = 255;
1433     src_y += 2;
1434     src_u += 1;
1435     src_v += 1;
1436     rgb_buf += 8;  // Advance 2 pixels.
1437   }
1438   if (width & 1) {
1439     YuvPixel(src_y[0], src_u[0], src_v[0],
1440              rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
1441     rgb_buf[3] = 255;
1442   }
1443 }
1444
1445 void I422ToRGBARow_C(const uint8* src_y,
1446                      const uint8* src_u,
1447                      const uint8* src_v,
1448                      uint8* rgb_buf,
1449                      int width) {
1450   int x;
1451   for (x = 0; x < width - 1; x += 2) {
1452     YuvPixel(src_y[0], src_u[0], src_v[0],
1453              rgb_buf + 1, rgb_buf + 2, rgb_buf + 3);
1454     rgb_buf[0] = 255;
1455     YuvPixel(src_y[1], src_u[0], src_v[0],
1456              rgb_buf + 5, rgb_buf + 6, rgb_buf + 7);
1457     rgb_buf[4] = 255;
1458     src_y += 2;
1459     src_u += 1;
1460     src_v += 1;
1461     rgb_buf += 8;  // Advance 2 pixels.
1462   }
1463   if (width & 1) {
1464     YuvPixel(src_y[0], src_u[0], src_v[0],
1465              rgb_buf + 1, rgb_buf + 2, rgb_buf + 3);
1466     rgb_buf[0] = 255;
1467   }
1468 }
1469
1470 void YToARGBRow_C(const uint8* src_y, uint8* rgb_buf, int width) {
1471   int x;
1472   for (x = 0; x < width - 1; x += 2) {
1473     YuvPixel(src_y[0], 128, 128,
1474              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1475     rgb_buf[3] = 255;
1476     YuvPixel(src_y[1], 128, 128,
1477              rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
1478     rgb_buf[7] = 255;
1479     src_y += 2;
1480     rgb_buf += 8;  // Advance 2 pixels.
1481   }
1482   if (width & 1) {
1483     YuvPixel(src_y[0], 128, 128,
1484              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
1485     rgb_buf[3] = 255;
1486   }
1487 }
1488
1489 void MirrorRow_C(const uint8* src, uint8* dst, int width) {
1490   int x;
1491   src += width - 1;
1492   for (x = 0; x < width - 1; x += 2) {
1493     dst[x] = src[0];
1494     dst[x + 1] = src[-1];
1495     src -= 2;
1496   }
1497   if (width & 1) {
1498     dst[width - 1] = src[0];
1499   }
1500 }
1501
1502 void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
1503   int x;
1504   src_uv += (width - 1) << 1;
1505   for (x = 0; x < width - 1; x += 2) {
1506     dst_u[x] = src_uv[0];
1507     dst_u[x + 1] = src_uv[-2];
1508     dst_v[x] = src_uv[1];
1509     dst_v[x + 1] = src_uv[-2 + 1];
1510     src_uv -= 4;
1511   }
1512   if (width & 1) {
1513     dst_u[width - 1] = src_uv[0];
1514     dst_v[width - 1] = src_uv[1];
1515   }
1516 }
1517
1518 void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width) {
1519   int x;
1520   const uint32* src32 = (const uint32*)(src);
1521   uint32* dst32 = (uint32*)(dst);
1522   src32 += width - 1;
1523   for (x = 0; x < width - 1; x += 2) {
1524     dst32[x] = src32[0];
1525     dst32[x + 1] = src32[-1];
1526     src32 -= 2;
1527   }
1528   if (width & 1) {
1529     dst32[width - 1] = src32[0];
1530   }
1531 }
1532
1533 void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
1534   int x;
1535   for (x = 0; x < width - 1; x += 2) {
1536     dst_u[x] = src_uv[0];
1537     dst_u[x + 1] = src_uv[2];
1538     dst_v[x] = src_uv[1];
1539     dst_v[x + 1] = src_uv[3];
1540     src_uv += 4;
1541   }
1542   if (width & 1) {
1543     dst_u[width - 1] = src_uv[0];
1544     dst_v[width - 1] = src_uv[1];
1545   }
1546 }
1547
1548 void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
1549                   int width) {
1550   int x;
1551   for (x = 0; x < width - 1; x += 2) {
1552     dst_uv[0] = src_u[x];
1553     dst_uv[1] = src_v[x];
1554     dst_uv[2] = src_u[x + 1];
1555     dst_uv[3] = src_v[x + 1];
1556     dst_uv += 4;
1557   }
1558   if (width & 1) {
1559     dst_uv[0] = src_u[width - 1];
1560     dst_uv[1] = src_v[width - 1];
1561   }
1562 }
1563
1564 void CopyRow_C(const uint8* src, uint8* dst, int count) {
1565   memcpy(dst, src, count);
1566 }
1567
1568 void CopyRow_16_C(const uint16* src, uint16* dst, int count) {
1569   memcpy(dst, src, count * 2);
1570 }
1571
1572 void SetRow_C(uint8* dst, uint32 v8, int count) {
1573 #ifdef _MSC_VER
1574   // VC will generate rep stosb.
1575   int x;
1576   for (x = 0; x < count; ++x) {
1577     dst[x] = v8;
1578   }
1579 #else
1580   memset(dst, v8, count);
1581 #endif
1582 }
1583
1584 void ARGBSetRows_C(uint8* dst, uint32 v32, int width,
1585                  int dst_stride, int height) {
1586   int y;
1587   for (y = 0; y < height; ++y) {
1588     uint32* d = (uint32*)(dst);
1589     int x;
1590     for (x = 0; x < width; ++x) {
1591       d[x] = v32;
1592     }
1593     dst += dst_stride;
1594   }
1595 }
1596
1597 // Filter 2 rows of YUY2 UV's (422) into U and V (420).
1598 void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2,
1599                    uint8* dst_u, uint8* dst_v, int width) {
1600   // Output a row of UV values, filtering 2 rows of YUY2.
1601   int x;
1602   for (x = 0; x < width; x += 2) {
1603     dst_u[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1;
1604     dst_v[0] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1;
1605     src_yuy2 += 4;
1606     dst_u += 1;
1607     dst_v += 1;
1608   }
1609 }
1610
1611 // Copy row of YUY2 UV's (422) into U and V (422).
1612 void YUY2ToUV422Row_C(const uint8* src_yuy2,
1613                       uint8* dst_u, uint8* dst_v, int width) {
1614   // Output a row of UV values.
1615   int x;
1616   for (x = 0; x < width; x += 2) {
1617     dst_u[0] = src_yuy2[1];
1618     dst_v[0] = src_yuy2[3];
1619     src_yuy2 += 4;
1620     dst_u += 1;
1621     dst_v += 1;
1622   }
1623 }
1624
1625 // Copy row of YUY2 Y's (422) into Y (420/422).
1626 void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) {
1627   // Output a row of Y values.
1628   int x;
1629   for (x = 0; x < width - 1; x += 2) {
1630     dst_y[x] = src_yuy2[0];
1631     dst_y[x + 1] = src_yuy2[2];
1632     src_yuy2 += 4;
1633   }
1634   if (width & 1) {
1635     dst_y[width - 1] = src_yuy2[0];
1636   }
1637 }
1638
1639 // Filter 2 rows of UYVY UV's (422) into U and V (420).
1640 void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy,
1641                    uint8* dst_u, uint8* dst_v, int width) {
1642   // Output a row of UV values.
1643   int x;
1644   for (x = 0; x < width; x += 2) {
1645     dst_u[0] = (src_uyvy[0] + src_uyvy[src_stride_uyvy + 0] + 1) >> 1;
1646     dst_v[0] = (src_uyvy[2] + src_uyvy[src_stride_uyvy + 2] + 1) >> 1;
1647     src_uyvy += 4;
1648     dst_u += 1;
1649     dst_v += 1;
1650   }
1651 }
1652
1653 // Copy row of UYVY UV's (422) into U and V (422).
1654 void UYVYToUV422Row_C(const uint8* src_uyvy,
1655                       uint8* dst_u, uint8* dst_v, int width) {
1656   // Output a row of UV values.
1657   int x;
1658   for (x = 0; x < width; x += 2) {
1659     dst_u[0] = src_uyvy[0];
1660     dst_v[0] = src_uyvy[2];
1661     src_uyvy += 4;
1662     dst_u += 1;
1663     dst_v += 1;
1664   }
1665 }
1666
1667 // Copy row of UYVY Y's (422) into Y (420/422).
1668 void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width) {
1669   // Output a row of Y values.
1670   int x;
1671   for (x = 0; x < width - 1; x += 2) {
1672     dst_y[x] = src_uyvy[1];
1673     dst_y[x + 1] = src_uyvy[3];
1674     src_uyvy += 4;
1675   }
1676   if (width & 1) {
1677     dst_y[width - 1] = src_uyvy[1];
1678   }
1679 }
1680
1681 #define BLEND(f, b, a) (((256 - a) * b) >> 8) + f
1682
1683 // Blend src_argb0 over src_argb1 and store to dst_argb.
1684 // dst_argb may be src_argb0 or src_argb1.
1685 // This code mimics the SSSE3 version for better testability.
1686 void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1,
1687                     uint8* dst_argb, int width) {
1688   int x;
1689   for (x = 0; x < width - 1; x += 2) {
1690     uint32 fb = src_argb0[0];
1691     uint32 fg = src_argb0[1];
1692     uint32 fr = src_argb0[2];
1693     uint32 a = src_argb0[3];
1694     uint32 bb = src_argb1[0];
1695     uint32 bg = src_argb1[1];
1696     uint32 br = src_argb1[2];
1697     dst_argb[0] = BLEND(fb, bb, a);
1698     dst_argb[1] = BLEND(fg, bg, a);
1699     dst_argb[2] = BLEND(fr, br, a);
1700     dst_argb[3] = 255u;
1701
1702     fb = src_argb0[4 + 0];
1703     fg = src_argb0[4 + 1];
1704     fr = src_argb0[4 + 2];
1705     a = src_argb0[4 + 3];
1706     bb = src_argb1[4 + 0];
1707     bg = src_argb1[4 + 1];
1708     br = src_argb1[4 + 2];
1709     dst_argb[4 + 0] = BLEND(fb, bb, a);
1710     dst_argb[4 + 1] = BLEND(fg, bg, a);
1711     dst_argb[4 + 2] = BLEND(fr, br, a);
1712     dst_argb[4 + 3] = 255u;
1713     src_argb0 += 8;
1714     src_argb1 += 8;
1715     dst_argb += 8;
1716   }
1717
1718   if (width & 1) {
1719     uint32 fb = src_argb0[0];
1720     uint32 fg = src_argb0[1];
1721     uint32 fr = src_argb0[2];
1722     uint32 a = src_argb0[3];
1723     uint32 bb = src_argb1[0];
1724     uint32 bg = src_argb1[1];
1725     uint32 br = src_argb1[2];
1726     dst_argb[0] = BLEND(fb, bb, a);
1727     dst_argb[1] = BLEND(fg, bg, a);
1728     dst_argb[2] = BLEND(fr, br, a);
1729     dst_argb[3] = 255u;
1730   }
1731 }
1732 #undef BLEND
1733 #define ATTENUATE(f, a) (a | (a << 8)) * (f | (f << 8)) >> 24
1734
1735 // Multiply source RGB by alpha and store to destination.
1736 // This code mimics the SSSE3 version for better testability.
1737 void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
1738   int i;
1739   for (i = 0; i < width - 1; i += 2) {
1740     uint32 b = src_argb[0];
1741     uint32 g = src_argb[1];
1742     uint32 r = src_argb[2];
1743     uint32 a = src_argb[3];
1744     dst_argb[0] = ATTENUATE(b, a);
1745     dst_argb[1] = ATTENUATE(g, a);
1746     dst_argb[2] = ATTENUATE(r, a);
1747     dst_argb[3] = a;
1748     b = src_argb[4];
1749     g = src_argb[5];
1750     r = src_argb[6];
1751     a = src_argb[7];
1752     dst_argb[4] = ATTENUATE(b, a);
1753     dst_argb[5] = ATTENUATE(g, a);
1754     dst_argb[6] = ATTENUATE(r, a);
1755     dst_argb[7] = a;
1756     src_argb += 8;
1757     dst_argb += 8;
1758   }
1759
1760   if (width & 1) {
1761     const uint32 b = src_argb[0];
1762     const uint32 g = src_argb[1];
1763     const uint32 r = src_argb[2];
1764     const uint32 a = src_argb[3];
1765     dst_argb[0] = ATTENUATE(b, a);
1766     dst_argb[1] = ATTENUATE(g, a);
1767     dst_argb[2] = ATTENUATE(r, a);
1768     dst_argb[3] = a;
1769   }
1770 }
1771 #undef ATTENUATE
1772
1773 // Divide source RGB by alpha and store to destination.
1774 // b = (b * 255 + (a / 2)) / a;
1775 // g = (g * 255 + (a / 2)) / a;
1776 // r = (r * 255 + (a / 2)) / a;
1777 // Reciprocal method is off by 1 on some values. ie 125
1778 // 8.8 fixed point inverse table with 1.0 in upper short and 1 / a in lower.
1779 #define T(a) 0x01000000 + (0x10000 / a)
1780 const uint32 fixed_invtbl8[256] = {
1781   0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07),
1782   T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f),
1783   T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17),
1784   T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f),
1785   T(0x20), T(0x21), T(0x22), T(0x23), T(0x24), T(0x25), T(0x26), T(0x27),
1786   T(0x28), T(0x29), T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f),
1787   T(0x30), T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37),
1788   T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), T(0x3f),
1789   T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), T(0x46), T(0x47),
1790   T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), T(0x4d), T(0x4e), T(0x4f),
1791   T(0x50), T(0x51), T(0x52), T(0x53), T(0x54), T(0x55), T(0x56), T(0x57),
1792   T(0x58), T(0x59), T(0x5a), T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f),
1793   T(0x60), T(0x61), T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67),
1794   T(0x68), T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f),
1795   T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), T(0x77),
1796   T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), T(0x7e), T(0x7f),
1797   T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), T(0x85), T(0x86), T(0x87),
1798   T(0x88), T(0x89), T(0x8a), T(0x8b), T(0x8c), T(0x8d), T(0x8e), T(0x8f),
1799   T(0x90), T(0x91), T(0x92), T(0x93), T(0x94), T(0x95), T(0x96), T(0x97),
1800   T(0x98), T(0x99), T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f),
1801   T(0xa0), T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7),
1802   T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), T(0xaf),
1803   T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), T(0xb6), T(0xb7),
1804   T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), T(0xbd), T(0xbe), T(0xbf),
1805   T(0xc0), T(0xc1), T(0xc2), T(0xc3), T(0xc4), T(0xc5), T(0xc6), T(0xc7),
1806   T(0xc8), T(0xc9), T(0xca), T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf),
1807   T(0xd0), T(0xd1), T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7),
1808   T(0xd8), T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf),
1809   T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), T(0xe7),
1810   T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), T(0xee), T(0xef),
1811   T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), T(0xf5), T(0xf6), T(0xf7),
1812   T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), 0x01000100 };
1813 #undef T
1814
1815 void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
1816   int i;
1817   for (i = 0; i < width; ++i) {
1818     uint32 b = src_argb[0];
1819     uint32 g = src_argb[1];
1820     uint32 r = src_argb[2];
1821     const uint32 a = src_argb[3];
1822     const uint32 ia = fixed_invtbl8[a] & 0xffff;  // 8.8 fixed point
1823     b = (b * ia) >> 8;
1824     g = (g * ia) >> 8;
1825     r = (r * ia) >> 8;
1826     // Clamping should not be necessary but is free in assembly.
1827     dst_argb[0] = clamp255(b);
1828     dst_argb[1] = clamp255(g);
1829     dst_argb[2] = clamp255(r);
1830     dst_argb[3] = a;
1831     src_argb += 4;
1832     dst_argb += 4;
1833   }
1834 }
1835
1836 void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
1837                                const int32* previous_cumsum, int width) {
1838   int32 row_sum[4] = {0, 0, 0, 0};
1839   int x;
1840   for (x = 0; x < width; ++x) {
1841     row_sum[0] += row[x * 4 + 0];
1842     row_sum[1] += row[x * 4 + 1];
1843     row_sum[2] += row[x * 4 + 2];
1844     row_sum[3] += row[x * 4 + 3];
1845     cumsum[x * 4 + 0] = row_sum[0]  + previous_cumsum[x * 4 + 0];
1846     cumsum[x * 4 + 1] = row_sum[1]  + previous_cumsum[x * 4 + 1];
1847     cumsum[x * 4 + 2] = row_sum[2]  + previous_cumsum[x * 4 + 2];
1848     cumsum[x * 4 + 3] = row_sum[3]  + previous_cumsum[x * 4 + 3];
1849   }
1850 }
1851
1852 void CumulativeSumToAverageRow_C(const int32* tl, const int32* bl,
1853                                 int w, int area, uint8* dst, int count) {
1854   float ooa = 1.0f / area;
1855   int i;
1856   for (i = 0; i < count; ++i) {
1857     dst[0] = (uint8)((bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * ooa);
1858     dst[1] = (uint8)((bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * ooa);
1859     dst[2] = (uint8)((bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * ooa);
1860     dst[3] = (uint8)((bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * ooa);
1861     dst += 4;
1862     tl += 4;
1863     bl += 4;
1864   }
1865 }
1866
1867 // Copy pixels from rotated source to destination row with a slope.
1868 LIBYUV_API
1869 void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
1870                      uint8* dst_argb, const float* uv_dudv, int width) {
1871   int i;
1872   // Render a row of pixels from source into a buffer.
1873   float uv[2];
1874   uv[0] = uv_dudv[0];
1875   uv[1] = uv_dudv[1];
1876   for (i = 0; i < width; ++i) {
1877     int x = (int)(uv[0]);
1878     int y = (int)(uv[1]);
1879     *(uint32*)(dst_argb) =
1880         *(const uint32*)(src_argb + y * src_argb_stride +
1881                                          x * 4);
1882     dst_argb += 4;
1883     uv[0] += uv_dudv[2];
1884     uv[1] += uv_dudv[3];
1885   }
1886 }
1887
1888 // Blend 2 rows into 1.
1889 static void HalfRow_C(const uint8* src_uv, int src_uv_stride,
1890                       uint8* dst_uv, int pix) {
1891   int x;
1892   for (x = 0; x < pix; ++x) {
1893     dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
1894   }
1895 }
1896
1897 static void HalfRow_16_C(const uint16* src_uv, int src_uv_stride,
1898                          uint16* dst_uv, int pix) {
1899   int x;
1900   for (x = 0; x < pix; ++x) {
1901     dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
1902   }
1903 }
1904
1905 // C version 2x2 -> 2x1.
1906 void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
1907                       ptrdiff_t src_stride,
1908                       int width, int source_y_fraction) {
1909   int y1_fraction = source_y_fraction;
1910   int y0_fraction = 256 - y1_fraction;
1911   const uint8* src_ptr1 = src_ptr + src_stride;
1912   int x;
1913   if (source_y_fraction == 0) {
1914     memcpy(dst_ptr, src_ptr, width);
1915     return;
1916   }
1917   if (source_y_fraction == 128) {
1918     HalfRow_C(src_ptr, (int)(src_stride), dst_ptr, width);
1919     return;
1920   }
1921   for (x = 0; x < width - 1; x += 2) {
1922     dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
1923     dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
1924     src_ptr += 2;
1925     src_ptr1 += 2;
1926     dst_ptr += 2;
1927   }
1928   if (width & 1) {
1929     dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
1930   }
1931 }
1932
1933 void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr,
1934                          ptrdiff_t src_stride,
1935                          int width, int source_y_fraction) {
1936   int y1_fraction = source_y_fraction;
1937   int y0_fraction = 256 - y1_fraction;
1938   const uint16* src_ptr1 = src_ptr + src_stride;
1939   int x;
1940   if (source_y_fraction == 0) {
1941     memcpy(dst_ptr, src_ptr, width * 2);
1942     return;
1943   }
1944   if (source_y_fraction == 128) {
1945     HalfRow_16_C(src_ptr, (int)(src_stride), dst_ptr, width);
1946     return;
1947   }
1948   for (x = 0; x < width - 1; x += 2) {
1949     dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
1950     dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
1951     src_ptr += 2;
1952     src_ptr1 += 2;
1953     dst_ptr += 2;
1954   }
1955   if (width & 1) {
1956     dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
1957   }
1958 }
1959
1960 // Select 2 channels from ARGB on alternating pixels.  e.g.  BGBGBGBG
1961 void ARGBToBayerRow_C(const uint8* src_argb,
1962                       uint8* dst_bayer, uint32 selector, int pix) {
1963   int index0 = selector & 0xff;
1964   int index1 = (selector >> 8) & 0xff;
1965   // Copy a row of Bayer.
1966   int x;
1967   for (x = 0; x < pix - 1; x += 2) {
1968     dst_bayer[0] = src_argb[index0];
1969     dst_bayer[1] = src_argb[index1];
1970     src_argb += 8;
1971     dst_bayer += 2;
1972   }
1973   if (pix & 1) {
1974     dst_bayer[0] = src_argb[index0];
1975   }
1976 }
1977
1978 // Select G channel from ARGB.  e.g.  GGGGGGGG
1979 void ARGBToBayerGGRow_C(const uint8* src_argb,
1980                         uint8* dst_bayer, uint32 selector, int pix) {
1981   // Copy a row of G.
1982   int x;
1983   for (x = 0; x < pix - 1; x += 2) {
1984     dst_bayer[0] = src_argb[1];
1985     dst_bayer[1] = src_argb[5];
1986     src_argb += 8;
1987     dst_bayer += 2;
1988   }
1989   if (pix & 1) {
1990     dst_bayer[0] = src_argb[1];
1991   }
1992 }
1993
1994 // Use first 4 shuffler values to reorder ARGB channels.
1995 void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,
1996                       const uint8* shuffler, int pix) {
1997   int index0 = shuffler[0];
1998   int index1 = shuffler[1];
1999   int index2 = shuffler[2];
2000   int index3 = shuffler[3];
2001   // Shuffle a row of ARGB.
2002   int x;
2003   for (x = 0; x < pix; ++x) {
2004     // To support in-place conversion.
2005     uint8 b = src_argb[index0];
2006     uint8 g = src_argb[index1];
2007     uint8 r = src_argb[index2];
2008     uint8 a = src_argb[index3];
2009     dst_argb[0] = b;
2010     dst_argb[1] = g;
2011     dst_argb[2] = r;
2012     dst_argb[3] = a;
2013     src_argb += 4;
2014     dst_argb += 4;
2015   }
2016 }
2017
2018 void I422ToYUY2Row_C(const uint8* src_y,
2019                      const uint8* src_u,
2020                      const uint8* src_v,
2021                      uint8* dst_frame, int width) {
2022   int x;
2023   for (x = 0; x < width - 1; x += 2) {
2024     dst_frame[0] = src_y[0];
2025     dst_frame[1] = src_u[0];
2026     dst_frame[2] = src_y[1];
2027     dst_frame[3] = src_v[0];
2028     dst_frame += 4;
2029     src_y += 2;
2030     src_u += 1;
2031     src_v += 1;
2032   }
2033   if (width & 1) {
2034     dst_frame[0] = src_y[0];
2035     dst_frame[1] = src_u[0];
2036     dst_frame[2] = src_y[0];  // duplicate last y
2037     dst_frame[3] = src_v[0];
2038   }
2039 }
2040
2041 void I422ToUYVYRow_C(const uint8* src_y,
2042                      const uint8* src_u,
2043                      const uint8* src_v,
2044                      uint8* dst_frame, int width) {
2045   int x;
2046   for (x = 0; x < width - 1; x += 2) {
2047     dst_frame[0] = src_u[0];
2048     dst_frame[1] = src_y[0];
2049     dst_frame[2] = src_v[0];
2050     dst_frame[3] = src_y[1];
2051     dst_frame += 4;
2052     src_y += 2;
2053     src_u += 1;
2054     src_v += 1;
2055   }
2056   if (width & 1) {
2057     dst_frame[0] = src_u[0];
2058     dst_frame[1] = src_y[0];
2059     dst_frame[2] = src_v[0];
2060     dst_frame[3] = src_y[0];  // duplicate last y
2061   }
2062 }
2063
2064 #if !defined(LIBYUV_DISABLE_X86) && defined(HAS_I422TOARGBROW_SSSE3)
2065 // row_win.cc has asm version, but GCC uses 2 step wrapper.
2066 #if !defined(_MSC_VER) && (defined(__x86_64__) || defined(__i386__))
2067 void I422ToRGB565Row_SSSE3(const uint8* src_y,
2068                            const uint8* src_u,
2069                            const uint8* src_v,
2070                            uint8* rgb_buf,
2071                            int width) {
2072   // Allocate a row of ARGB.
2073   align_buffer_64(row, width * 4);
2074   I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width);
2075   ARGBToRGB565Row_SSE2(row, rgb_buf, width);
2076   free_aligned_buffer_64(row);
2077 }
2078 #endif  // !defined(_MSC_VER) && (defined(__x86_64__) || defined(__i386__))
2079
2080 #if defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)
2081 void I422ToARGB1555Row_SSSE3(const uint8* src_y,
2082                              const uint8* src_u,
2083                              const uint8* src_v,
2084                              uint8* rgb_buf,
2085                              int width) {
2086   // Allocate a row of ARGB.
2087   align_buffer_64(row, width * 4);
2088   I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width);
2089   ARGBToARGB1555Row_SSE2(row, rgb_buf, width);
2090   free_aligned_buffer_64(row);
2091 }
2092
2093 void I422ToARGB4444Row_SSSE3(const uint8* src_y,
2094                              const uint8* src_u,
2095                              const uint8* src_v,
2096                              uint8* rgb_buf,
2097                              int width) {
2098   // Allocate a row of ARGB.
2099   align_buffer_64(row, width * 4);
2100   I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width);
2101   ARGBToARGB4444Row_SSE2(row, rgb_buf, width);
2102   free_aligned_buffer_64(row);
2103 }
2104
2105 void NV12ToRGB565Row_SSSE3(const uint8* src_y,
2106                            const uint8* src_uv,
2107                            uint8* dst_rgb565,
2108                            int width) {
2109   // Allocate a row of ARGB.
2110   align_buffer_64(row, width * 4);
2111   NV12ToARGBRow_SSSE3(src_y, src_uv, row, width);
2112   ARGBToRGB565Row_SSE2(row, dst_rgb565, width);
2113   free_aligned_buffer_64(row);
2114 }
2115
2116 void NV21ToRGB565Row_SSSE3(const uint8* src_y,
2117                            const uint8* src_vu,
2118                            uint8* dst_rgb565,
2119                            int width) {
2120   // Allocate a row of ARGB.
2121   align_buffer_64(row, width * 4);
2122   NV21ToARGBRow_SSSE3(src_y, src_vu, row, width);
2123   ARGBToRGB565Row_SSE2(row, dst_rgb565, width);
2124   free_aligned_buffer_64(row);
2125 }
2126
2127 void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
2128                          uint8* dst_argb,
2129                          int width) {
2130   // Allocate a rows of yuv.
2131   align_buffer_64(row_y, ((width + 63) & ~63) * 2);
2132   uint8* row_u = row_y + ((width + 63) & ~63);
2133   uint8* row_v = row_u + ((width + 63) & ~63) / 2;
2134   YUY2ToUV422Row_SSE2(src_yuy2, row_u, row_v, width);
2135   YUY2ToYRow_SSE2(src_yuy2, row_y, width);
2136   I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, width);
2137   free_aligned_buffer_64(row_y);
2138 }
2139
2140 void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
2141                          uint8* dst_argb,
2142                          int width) {
2143   // Allocate a rows of yuv.
2144   align_buffer_64(row_y, ((width + 63) & ~63) * 2);
2145   uint8* row_u = row_y + ((width + 63) & ~63);
2146   uint8* row_v = row_u + ((width + 63) & ~63) / 2;
2147   UYVYToUV422Row_SSE2(src_uyvy, row_u, row_v, width);
2148   UYVYToYRow_SSE2(src_uyvy, row_y, width);
2149   I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, width);
2150   free_aligned_buffer_64(row_y);
2151 }
2152
2153 #endif  // defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)
2154 #endif  // !defined(LIBYUV_DISABLE_X86)
2155
2156 void ARGBPolynomialRow_C(const uint8* src_argb,
2157                          uint8* dst_argb, const float* poly,
2158                          int width) {
2159   int i;
2160   for (i = 0; i < width; ++i) {
2161     float b = (float)(src_argb[0]);
2162     float g = (float)(src_argb[1]);
2163     float r = (float)(src_argb[2]);
2164     float a = (float)(src_argb[3]);
2165     float b2 = b * b;
2166     float g2 = g * g;
2167     float r2 = r * r;
2168     float a2 = a * a;
2169     float db = poly[0] + poly[4] * b;
2170     float dg = poly[1] + poly[5] * g;
2171     float dr = poly[2] + poly[6] * r;
2172     float da = poly[3] + poly[7] * a;
2173     float b3 = b2 * b;
2174     float g3 = g2 * g;
2175     float r3 = r2 * r;
2176     float a3 = a2 * a;
2177     db += poly[8] * b2;
2178     dg += poly[9] * g2;
2179     dr += poly[10] * r2;
2180     da += poly[11] * a2;
2181     db += poly[12] * b3;
2182     dg += poly[13] * g3;
2183     dr += poly[14] * r3;
2184     da += poly[15] * a3;
2185
2186     dst_argb[0] = Clamp((int32)(db));
2187     dst_argb[1] = Clamp((int32)(dg));
2188     dst_argb[2] = Clamp((int32)(dr));
2189     dst_argb[3] = Clamp((int32)(da));
2190     src_argb += 4;
2191     dst_argb += 4;
2192   }
2193 }
2194
2195 void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
2196                              const uint8* luma, uint32 lumacoeff) {
2197   uint32 bc = lumacoeff & 0xff;
2198   uint32 gc = (lumacoeff >> 8) & 0xff;
2199   uint32 rc = (lumacoeff >> 16) & 0xff;
2200
2201   int i;
2202   for (i = 0; i < width - 1; i += 2) {
2203     // Luminance in rows, color values in columns.
2204     const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc +
2205                            src_argb[2] * rc) & 0x7F00u) + luma;
2206     const uint8* luma1;
2207     dst_argb[0] = luma0[src_argb[0]];
2208     dst_argb[1] = luma0[src_argb[1]];
2209     dst_argb[2] = luma0[src_argb[2]];
2210     dst_argb[3] = src_argb[3];
2211     luma1 = ((src_argb[4] * bc + src_argb[5] * gc +
2212               src_argb[6] * rc) & 0x7F00u) + luma;
2213     dst_argb[4] = luma1[src_argb[4]];
2214     dst_argb[5] = luma1[src_argb[5]];
2215     dst_argb[6] = luma1[src_argb[6]];
2216     dst_argb[7] = src_argb[7];
2217     src_argb += 8;
2218     dst_argb += 8;
2219   }
2220   if (width & 1) {
2221     // Luminance in rows, color values in columns.
2222     const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc +
2223                            src_argb[2] * rc) & 0x7F00u) + luma;
2224     dst_argb[0] = luma0[src_argb[0]];
2225     dst_argb[1] = luma0[src_argb[1]];
2226     dst_argb[2] = luma0[src_argb[2]];
2227     dst_argb[3] = src_argb[3];
2228   }
2229 }
2230
2231 void ARGBCopyAlphaRow_C(const uint8* src, uint8* dst, int width) {
2232   int i;
2233   for (i = 0; i < width - 1; i += 2) {
2234     dst[3] = src[3];
2235     dst[7] = src[7];
2236     dst += 8;
2237     src += 8;
2238   }
2239   if (width & 1) {
2240     dst[3] = src[3];
2241   }
2242 }
2243
2244 void ARGBCopyYToAlphaRow_C(const uint8* src, uint8* dst, int width) {
2245   int i;
2246   for (i = 0; i < width - 1; i += 2) {
2247     dst[3] = src[0];
2248     dst[7] = src[1];
2249     dst += 8;
2250     src += 2;
2251   }
2252   if (width & 1) {
2253     dst[3] = src[0];
2254   }
2255 }
2256
2257 #ifdef __cplusplus
2258 }  // extern "C"
2259 }  // namespace libyuv
2260 #endif