src/lib/engines/common/evas_convert_yuv.c

   1 /*
   2  * vim:ts=8:sw=3:sts=8:noexpandtab:cino=>5n-3f0^-2{2
   3  */
   4
   5 #include "evas_common.h"
   6 #include "evas_convert_yuv.h"
   7
   8 #if defined BUILD_MMX || defined BUILD_SSE
   9 # include "evas_mmx.h"
  10 #endif
  11
  12 #if defined HAVE_ALTIVEC_H
  13 # include <altivec.h>
  14 #ifdef CONFIG_DARWIN
  15 #define AVV(x...) (x)
  16 #else
  17 #define AVV(x...) {x}
  18 #endif
  19
  20 #endif
  21
  22 #ifdef BUILD_CONVERT_YUV
  23
  24 static void _evas_yuv_init         (void);
  25 static void _evas_yv12torgb_sse    (unsigned char **yuv, unsigned char *rgb, int w, int h);
  26 static void _evas_yv12torgb_mmx    (unsigned char **yuv, unsigned char *rgb, int w, int h);
  27 static void _evas_yv12torgb_altivec(unsigned char **yuv, unsigned char *rgb, int w, int h);
  28 static void _evas_yv12torgb_raster (unsigned char **yuv, unsigned char *rgb, int w, int h);
  29 static void _evas_yv12torgb_diz    (unsigned char **yuv, unsigned char *rgb, int w, int h);
  30
  31 #define CRV    104595
  32 #define CBU    132251
  33 #define CGU    25624
  34 #define CGV    53280
  35 #define YMUL   76283
  36 #define OFF    32768
  37 #define BITRES 16
  38
  39 /* calculation float resolution in bits */
  40 /* ie RES = 6 is 10.6 fixed point */
  41 /*    RES = 8 is 8.8 fixed point */
  42 /*    RES = 4 is 12.4 fixed point */
  43 /* NB: going above 6 will lead to overflow... :( */
  44 #define RES    6
  45
  46 #define RZ(i)  (i >> (BITRES - RES))
  47 #define FOUR(i) {i, i, i, i}
  48
  49 #if defined BUILD_MMX || defined BUILD_SSE
  50 __attribute__ ((aligned (8))) const volatile unsigned short _const_crvcrv[4] = FOUR(RZ(CRV));
  51 __attribute__ ((aligned (8))) const volatile unsigned short _const_cbucbu[4] = FOUR(RZ(CBU));
  52 __attribute__ ((aligned (8))) const volatile unsigned short _const_cgucgu[4] = FOUR(RZ(CGU));
  53 __attribute__ ((aligned (8))) const volatile unsigned short _const_cgvcgv[4] = FOUR(RZ(CGV));
  54 __attribute__ ((aligned (8))) const volatile unsigned short _const_ymul  [4] = FOUR(RZ(YMUL));
  55 __attribute__ ((aligned (8))) const volatile unsigned short _const_128   [4] = FOUR(128);
  56 __attribute__ ((aligned (8))) const volatile unsigned short _const_32    [4] = FOUR(RZ(OFF));
  57 __attribute__ ((aligned (8))) const volatile unsigned short _const_16    [4] = FOUR(16);
  58 __attribute__ ((aligned (8))) const volatile unsigned short _const_ff    [4] = FOUR(-1);
  59
  60 #define CONST_CRVCRV *_const_crvcrv
  61 #define CONST_CBUCBU *_const_cbucbu
  62 #define CONST_CGUCGU *_const_cgucgu
  63 #define CONST_CGVCGV *_const_cgvcgv
  64 #define CONST_YMUL   *_const_ymul
  65 #define CONST_128    *_const_128
  66 #define CONST_32     *_const_32
  67 #define CONST_16     *_const_16
  68 #define CONST_FF     *_const_ff
  69
  70 /* for C non aligned cleanup */
  71 const int _crv = RZ(CRV);   /* 1.596 */
  72 const int _cbu = RZ(CBU);   /* 2.018 */
  73 const int _cgu = RZ(CGU);   /* 0.391 */
  74 const int _cgv = RZ(CGV);   /* 0.813 */
  75
  76 #endif
  77
  78 #ifdef BUILD_ALTIVEC
  79 #ifdef __VEC__
  80 const vector unsigned short res     = AVV(RES);
  81 const vector signed short crv       = AVV(RZ(CRV));
  82 const vector signed short cbu       = AVV(RZ(CBU));
  83 const vector signed short cgu       = AVV(RZ(CGU));
  84 const vector signed short cgv       = AVV(RZ(CGV));
  85 const vector signed short ymul      = AVV(RZ(YMUL));
  86 const vector signed short c128      = AVV(128);
  87 const vector signed short c32       = AVV(RZ(OFF));
  88 const vector signed short c16       = AVV(16);
  89 const vector unsigned char zero     = AVV(0);
  90 const vector signed short maxchar   = AVV(255);
  91 const vector unsigned char pickrg1  = AVV(0, 0x1, 0x11, 0,
  92                                           0, 0x3, 0x13, 0,
  93                                           0, 0x5, 0x15, 0,
  94                                           0, 0x7, 0x17, 0);
  95 const vector unsigned char pickrg2  = AVV(0, 0x9, 0x19, 0,
  96                                           0, 0xb, 0x1b, 0,
  97                                           0, 0xd, 0x1d, 0,
  98                                           0, 0xf, 0x1f, 0);
  99 const vector unsigned char pickrgb1 = AVV(0x3, 0x1, 0x2, 0x11,
 100                                           0x7, 0x5, 0x6, 0x13,
 101                                           0xb, 0x9, 0xa, 0x15,
 102                                           0xf, 0xd, 0xe, 0x17);
 103 const vector unsigned char pickrgb2 = AVV(0x3, 0x1, 0x2, 0x19,
 104                                           0x7, 0x5, 0x6, 0x1b,
 105                                           0xb, 0x9, 0xa, 0x1d,
 106                                           0xf, 0xd, 0xe, 0x1f);
 107 #endif
 108 #endif
 109
 110 #ifdef BUILD_C
 111
 112 /* shortcut speedup lookup-tables */
 113 static short _v1164[256];
 114 static short _v1596[256];
 115 static short _v813[256];
 116 static short _v391[256];
 117 static short _v2018[256];
 118
 119 static unsigned char _clip_lut[1024];
 120 #define LUT_CLIP(i) ((_clip_lut+384)[(i)])
 121
 122 #define CMP_CLIP(i) ((i&256)? (~(i>>10)) : i);
 123
 124 #endif
 125
 126 void
 127 evas_common_convert_yuv_420p_601_rgba(DATA8 **src, DATA8 *dst, int w, int h)
 128 {
 129    int mmx, sse, sse2;
 130
 131 #if defined BUILD_MMX || defined BUILD_SSE
 132    evas_common_cpu_can_do(&mmx, &sse, &sse2);
 133 #endif
 134 #ifndef BUILD_SSE
 135    sse = 0;
 136    sse2 = 0;
 137 #endif
 138 #ifndef BUILD_MMX
 139    mmx = 0;
 140 #endif
 141    if (evas_common_cpu_has_feature(CPU_FEATURE_MMX2))
 142      _evas_yv12torgb_sse(src, dst, w, h);
 143    else if (evas_common_cpu_has_feature(CPU_FEATURE_MMX))
 144      _evas_yv12torgb_mmx(src, dst, w, h);
 145 #ifdef BUILD_ALTIVEC
 146    if (evas_common_cpu_has_feature(CPU_FEATURE_ALTIVEC))
 147      _evas_yv12torgb_altivec(src, dst, w, h);
 148 #endif
 149    else
 150      {
 151 #ifdef BUILD_C
 152         static int initted = 0;
 153
 154         if (!initted) _evas_yuv_init();
 155         initted = 1;
 156         /* FIXME: diz may be faster sometimes */
 157         _evas_yv12torgb_raster(src, dst, w, h);
 158 #endif
 159      }
 160 }
 161
 162 /* Thanks to Diz for this code. i've munged it a little and turned it into */
 163 /* inline macros. I tried beating it with a different algorithm using MMX */
 164 /* but failed. So here we are. This is the fastest YUV->RGB i know of for */
 165 /* x86. It has an issue that it doesnt convert colours accurately so the */
 166 /* image looks a little "yellowy". This is a result of only 10.6 fixed point */
 167 /* resolution as opposed to 16.16 in the C code. This could be fixed by */
 168 /* processing half the number of pixels per cycle and going up to 32bits */
 169 /* per element during compute, but it would all but negate the speedup */
 170 /* from mmx I think :( It might be possible to use SSE and SSE2 here, but */
 171 /* I haven't tried yet. Let's see. */
 172
 173 /* NB: XviD has almost the same code in it's assembly YV12->RGB code. same */
 174 /* algorithm, same constants, same all over actually, except it actually */
 175 /* does a few extra memory accesses that this one doesn't, so in theory */
 176 /* this code should be faster. In the end it's all just an mmx version of */
 177 /* the reference implimentation done with fixed point math */
 178
 179 static void
 180 _evas_yv12torgb_sse(unsigned char **yuv, unsigned char *rgb, int w, int h)
 181 {
 182 #ifdef BUILD_SSE
 183    int xx, yy;
 184    register unsigned char *yp1, *up, *vp;
 185    unsigned char *dp1;
 186
 187    /* destination pointers */
 188    dp1 = rgb;
 189
 190    for (yy = 0; yy < h; yy++)
 191      {
 192         /* plane pointers */
 193         yp1 = yuv[yy];
 194         up = yuv[h + (yy / 2)];
 195         vp = yuv[h + (h / 2) + (yy / 2)];
 196         for (xx = 0; xx < (w - 7); xx += 8)
 197           {
 198              movd_m2r(*up, mm3);
 199              movd_m2r(*vp, mm2);
 200              movq_m2r(*yp1, mm0);
 201
 202              pxor_r2r(mm7, mm7);
 203              punpcklbw_r2r(mm7, mm2);
 204              punpcklbw_r2r(mm7, mm3);
 205
 206              movq_r2r(mm0, mm1);
 207              psrlw_i2r(8, mm0);
 208              psllw_i2r(8, mm1);
 209              psrlw_i2r(8, mm1);
 210
 211              movq_m2r(CONST_16, mm4);
 212              psubsw_r2r(mm4, mm0);
 213              psubsw_r2r(mm4, mm1);
 214
 215              movq_m2r(CONST_128, mm5);
 216              psubsw_r2r(mm5, mm2);
 217              psubsw_r2r(mm5, mm3);
 218
 219              movq_m2r(CONST_YMUL, mm4);
 220              pmullw_r2r(mm4, mm0);
 221              pmullw_r2r(mm4, mm1);
 222
 223              movq_m2r(CONST_CRVCRV, mm7);
 224              pmullw_r2r(mm3, mm7);
 225              movq_m2r(CONST_CBUCBU, mm6);
 226              pmullw_r2r(mm2, mm6);
 227              movq_m2r(CONST_CGUCGU, mm5);
 228              pmullw_r2r(mm2, mm5);
 229              movq_m2r(CONST_CGVCGV, mm4);
 230              pmullw_r2r(mm3, mm4);
 231
 232              movq_r2r(mm0, mm2);
 233              paddsw_r2r(mm7, mm2);
 234              paddsw_r2r(mm1, mm7);
 235
 236              psraw_i2r(RES, mm2);
 237              psraw_i2r(RES, mm7);
 238              packuswb_r2r(mm7, mm2);
 239
 240              pxor_r2r(mm7, mm7);
 241              movq_r2r(mm2, mm3);
 242              punpckhbw_r2r(mm7, mm2);
 243              punpcklbw_r2r(mm3, mm7);
 244              por_r2r(mm7, mm2);
 245
 246              movq_r2r(mm0, mm3);
 247              psubsw_r2r(mm5, mm3);
 248              psubsw_r2r(mm4, mm3);
 249              paddsw_m2r(CONST_32, mm3);
 250
 251              movq_r2r(mm1, mm7);
 252              psubsw_r2r(mm5, mm7);
 253              psubsw_r2r(mm4, mm7);
 254              paddsw_m2r(CONST_32, mm7);
 255
 256              psraw_i2r(RES, mm3);
 257              psraw_i2r(RES, mm7);
 258              packuswb_r2r(mm7, mm3);
 259
 260              pxor_r2r(mm7, mm7);
 261              movq_r2r(mm3, mm4);
 262              punpckhbw_r2r(mm7, mm3);
 263              punpcklbw_r2r(mm4, mm7);
 264              por_r2r(mm7, mm3);
 265
 266              movq_m2r(CONST_32, mm4);
 267              paddsw_r2r(mm6, mm0);
 268              paddsw_r2r(mm6, mm1);
 269              paddsw_r2r(mm4, mm0);
 270              paddsw_r2r(mm4, mm1);
 271              psraw_i2r(RES, mm0);
 272              psraw_i2r(RES, mm1);
 273              packuswb_r2r(mm1, mm0);
 274
 275              pxor_r2r(mm7, mm7);
 276              movq_r2r(mm0, mm5);
 277              punpckhbw_r2r(mm7, mm0);
 278              punpcklbw_r2r(mm5, mm7);
 279              por_r2r(mm7, mm0);
 280
 281              movq_m2r(CONST_FF, mm1);
 282              movq_r2r(mm0, mm5);
 283              movq_r2r(mm3, mm6);
 284              movq_r2r(mm2, mm7);
 285              punpckhbw_r2r(mm3, mm2);
 286              punpcklbw_r2r(mm6, mm7);
 287              punpckhbw_r2r(mm1, mm0);
 288              punpcklbw_r2r(mm1, mm5);
 289
 290              movq_r2r(mm7, mm1);
 291              punpckhwd_r2r(mm5, mm7);
 292              punpcklwd_r2r(mm5, mm1);
 293
 294              movq_r2r(mm2, mm4);
 295              punpckhwd_r2r(mm0, mm2);
 296              punpcklwd_r2r(mm0, mm4);
 297
 298              movntq_r2m(mm1, *(dp1));
 299              movntq_r2m(mm7, *(dp1 + 8));
 300              movntq_r2m(mm4, *(dp1 + 16));
 301              movntq_r2m(mm2, *(dp1 + 24));
 302
 303              yp1 += 8;
 304              up += 4;
 305              vp += 4;
 306              dp1 += 8 * 4;
 307           }
 308         /* cleanup pixles that arent a multiple of 8 pixels wide */
 309         if (xx < w)
 310           {
 311              int y, u, v, r, g, b;
 312
 313              for (; xx < w; xx += 2)
 314                {
 315                   u = (*up++) - 128;
 316                   v = (*vp++) - 128;
 317
 318                   A_VAL(dp1) = 0xff;
 319                   y = RZ(YMUL) * ((*yp1++) - 16);
 320                   r = (y + (_crv * v)) >> RES;
 321                   r = LUT_CLIP(r);
 322                   R_VAL(dp1) = r;
 323                   g = (y - (_cgu * u) - (_cgv * v) + RZ(OFF)) >> RES;
 324                   g = LUT_CLIP(g);
 325                   G_VAL(dp1) = g;
 326                   b = (y + (_cbu * u) + RZ(OFF)) >> RES;
 327                   b = LUT_CLIP(b);
 328                   B_VAL(dp1) = b;
 329
 330                   dp1 += 4;
 331
 332                   A_VAL(dp1) = 0xff;
 333                   y = RZ(YMUL) * ((*yp1++) - 16);
 334                   r = (y + (_crv * v)) >> RES;
 335                   r = LUT_CLIP(r);
 336                   R_VAL(dp1) = r;
 337                   g = (y - (_cgu * u) - (_cgv * v) + RZ(OFF)) >> RES;
 338                   g = LUT_CLIP(g);
 339                   G_VAL(dp1) = g;
 340                   b = (y + (_cbu * u) + RZ(OFF)) >> RES;
 341                   b = LUT_CLIP(b);
 342                   B_VAL(dp1) = b;
 343
 344                   dp1 += 4;
 345                }
 346           }
 347      }
 348    emms();
 349 #else
 350    _evas_yv12torgb_mmx(yuv, rgb, w, h);
 351 #endif
 352 }
 353
 354 static void
 355 _evas_yv12torgb_mmx(unsigned char **yuv, unsigned char *rgb, int w, int h)
 356 {
 357 #ifdef BUILD_MMX
 358    int xx, yy;
 359    register unsigned char *yp1, *up, *vp;
 360    unsigned char *dp1;
 361
 362    /* destination pointers */
 363    dp1 = rgb;
 364
 365    for (yy = 0; yy < h; yy++)
 366      {
 367         /* plane pointers */
 368         yp1 = yuv[yy];
 369         up = yuv[h + (yy / 2)];
 370         vp = yuv[h + (h / 2) + (yy / 2)];
 371         for (xx = 0; xx < (w - 7); xx += 8)
 372           {
 373              movd_m2r(*up, mm3);
 374              movd_m2r(*vp, mm2);
 375              movq_m2r(*yp1, mm0);
 376
 377              pxor_r2r(mm7, mm7);
 378              punpcklbw_r2r(mm7, mm2);
 379              punpcklbw_r2r(mm7, mm3);
 380
 381              movq_r2r(mm0, mm1);
 382              psrlw_i2r(8, mm0);
 383              psllw_i2r(8, mm1);
 384              psrlw_i2r(8, mm1);
 385
 386              movq_m2r(CONST_16, mm4);
 387              psubsw_r2r(mm4, mm0);
 388              psubsw_r2r(mm4, mm1);
 389
 390              movq_m2r(CONST_128, mm5);
 391              psubsw_r2r(mm5, mm2);
 392              psubsw_r2r(mm5, mm3);
 393
 394              movq_m2r(CONST_YMUL, mm4);
 395              pmullw_r2r(mm4, mm0);
 396              pmullw_r2r(mm4, mm1);
 397
 398              movq_m2r(CONST_CRVCRV, mm7);
 399              pmullw_r2r(mm3, mm7);
 400              movq_m2r(CONST_CBUCBU, mm6);
 401              pmullw_r2r(mm2, mm6);
 402              movq_m2r(CONST_CGUCGU, mm5);
 403              pmullw_r2r(mm2, mm5);
 404              movq_m2r(CONST_CGVCGV, mm4);
 405              pmullw_r2r(mm3, mm4);
 406
 407              movq_r2r(mm0, mm2);
 408              paddsw_r2r(mm7, mm2);
 409              paddsw_r2r(mm1, mm7);
 410
 411              psraw_i2r(RES, mm2);
 412              psraw_i2r(RES, mm7);
 413              packuswb_r2r(mm7, mm2);
 414
 415              pxor_r2r(mm7, mm7);
 416              movq_r2r(mm2, mm3);
 417              punpckhbw_r2r(mm7, mm2);
 418              punpcklbw_r2r(mm3, mm7);
 419              por_r2r(mm7, mm2);
 420
 421              movq_r2r(mm0, mm3);
 422              psubsw_r2r(mm5, mm3);
 423              psubsw_r2r(mm4, mm3);
 424              paddsw_m2r(CONST_32, mm3);
 425
 426              movq_r2r(mm1, mm7);
 427              psubsw_r2r(mm5, mm7);
 428              psubsw_r2r(mm4, mm7);
 429              paddsw_m2r(CONST_32, mm7);
 430
 431              psraw_i2r(RES, mm3);
 432              psraw_i2r(RES, mm7);
 433              packuswb_r2r(mm7, mm3);
 434
 435              pxor_r2r(mm7, mm7);
 436              movq_r2r(mm3, mm4);
 437              punpckhbw_r2r(mm7, mm3);
 438              punpcklbw_r2r(mm4, mm7);
 439              por_r2r(mm7, mm3);
 440
 441              movq_m2r(CONST_32, mm4);
 442              paddsw_r2r(mm6, mm0);
 443              paddsw_r2r(mm6, mm1);
 444              paddsw_r2r(mm4, mm0);
 445              paddsw_r2r(mm4, mm1);
 446              psraw_i2r(RES, mm0);
 447              psraw_i2r(RES, mm1);
 448              packuswb_r2r(mm1, mm0);
 449
 450              pxor_r2r(mm7, mm7);
 451              movq_r2r(mm0, mm5);
 452              punpckhbw_r2r(mm7, mm0);
 453              punpcklbw_r2r(mm5, mm7);
 454              por_r2r(mm7, mm0);
 455
 456              movq_m2r(CONST_FF, mm1);
 457              movq_r2r(mm0, mm5);
 458              movq_r2r(mm3, mm6);
 459              movq_r2r(mm2, mm7);
 460              punpckhbw_r2r(mm3, mm2);
 461              punpcklbw_r2r(mm6, mm7);
 462              punpckhbw_r2r(mm1, mm0);
 463              punpcklbw_r2r(mm1, mm5);
 464
 465              movq_r2r(mm7, mm1);
 466              punpckhwd_r2r(mm5, mm7);
 467              punpcklwd_r2r(mm5, mm1);
 468
 469              movq_r2r(mm2, mm4);
 470              punpckhwd_r2r(mm0, mm2);
 471              punpcklwd_r2r(mm0, mm4);
 472
 473              movq_r2m(mm1, *(dp1));
 474              movq_r2m(mm7, *(dp1 + 8));
 475              movq_r2m(mm4, *(dp1 + 16));
 476              movq_r2m(mm2, *(dp1 + 24));
 477
 478              yp1 += 8;
 479              up += 4;
 480              vp += 4;
 481              dp1 += 8 * 4;
 482           }
 483         /* cleanup pixles that arent a multiple of 8 pixels wide */
 484         if (xx < w)
 485           {
 486              int y, u, v, r, g, b;
 487
 488              for (; xx < w; xx += 2)
 489                {
 490                   u = (*up++) - 128;
 491                   v = (*vp++) - 128;
 492
 493                   A_VAL(dp1) = 0xff;
 494                   y = RZ(YMUL) * ((*yp1++) - 16);
 495                   r = (y + (_crv * v)) >> RES;
 496                   r = LUT_CLIP(r);
 497                   R_VAL(dp1) = r;
 498                   g = (y - (_cgu * u) - (_cgv * v) + RZ(OFF)) >> RES;
 499                   g = LUT_CLIP(g);
 500                   G_VAL(dp1) = g;
 501                   b = (y + (_cbu * u) + RZ(OFF)) >> RES;
 502                   b = LUT_CLIP(b);
 503                   B_VAL(dp1) = b;
 504
 505                   dp1 += 4;
 506
 507                   A_VAL(dp1) = 0xff;
 508                   y = RZ(YMUL) * ((*yp1++) - 16);
 509                   r = (y + (_crv * v)) >> RES;
 510                   r = LUT_CLIP(r);
 511                   R_VAL(dp1) = r;
 512                   g = (y - (_cgu * u) - (_cgv * v) + RZ(OFF)) >> RES;
 513                   g = LUT_CLIP(g);
 514                   G_VAL(dp1) = g;
 515                   b = (y + (_cbu * u) + RZ(OFF)) >> RES;
 516                   b = LUT_CLIP(b);
 517                   B_VAL(dp1) = b;
 518
 519                   dp1 += 4;
 520                }
 521           }
 522      }
 523    emms();
 524 #else
 525    _evas_yv12torgb_raster(yuv, rgb, w, h);
 526 #endif
 527 }
 528
 529 static void
 530 _evas_yv12torgb_altivec(unsigned char **yuv, unsigned char *rgb, int w, int h)
 531 {
 532 #ifdef BUILD_ALTIVEC
 533 #ifdef __VEC__
 534    int xx, yy;
 535    int w2, h2;
 536    unsigned char *yp1, *yp2, *up, *vp;
 537    unsigned char *dp1, *dp2;
 538    vector signed short y, u, v;
 539    vector signed short r, g, b;
 540    vector signed short tmp1, tmp2, tmp3;
 541    vector unsigned char yperm, uperm, vperm, rgb1, rgb2;
 542    vector unsigned char alpha;
 543
 544    /* handy halved w & h */
 545    w2 = w / 2;
 546    h2 = h / 2;
 547    /* plane pointers */
 548    yp1 = yuv;
 549    yp2 = yuv + w;
 550    up = yuv + (w * h);
 551    vp = up + (w2 * h2);
 552    /* destination pointers */
 553    dp1 = rgb;
 554    dp2 = rgb + (w * 4);
 555
 556    alpha = vec_mergeh((vector unsigned char)AVV(255), zero);
 557    alpha = (vector unsigned char)vec_mergeh((vector unsigned short)alpha,
 558                                             (vector unsigned short)zero);
 559
 560    for (yy = 0; yy < h2; yy++)
 561      {
 562         for (xx = 0; xx < w2; xx += 4)
 563           {
 564 /* Cycles */
 565              /*
 566               * Load 4 y and 4 u & v pixels for the 8x2 pixel block.
 567               */
 568 /* 3 */      tmp3 = (vector signed short)vec_lde(0, (unsigned int *)yp1);
 569 /* 3 */      tmp1 = (vector signed short)vec_lde(0, (unsigned int *)up);
 570 /* 3 */      tmp2 = (vector signed short)vec_lde(0, (unsigned int *)vp);
 571
 572              /* Prepare for aligning the data in their vectors */
 573 /* 3 */      yperm = vec_lvsl(0, yp1);
 574 /* 3 */      uperm = vec_lvsl(0, up);
 575 /* 3 */      vperm = vec_lvsl(0, vp);
 576              yp1 += 4;
 577
 578              /* Save y and load the next 4 y pixels for a total of 8 */
 579 /* 2 */      y = vec_perm(tmp3, tmp3, yperm);
 580 /* 3 */      tmp3 = (vector signed short)vec_lde(0, (unsigned int *)yp1);
 581
 582              /* Setup and calculate the 4 u pixels */
 583 /* 2 */      tmp1 = vec_perm(tmp1, tmp1, uperm);
 584 /* 2 */      tmp2 = vec_perm(tmp2, tmp2, vperm);
 585
 586              /* Avoid dependancy stalls on yperm and calculate the 4 u values */
 587 /* 3 */      yperm = vec_lvsr(12, yp1);
 588 /* 1 */      tmp1 = (vector signed short)vec_mergeh((vector unsigned char)tmp1,
 589                                                     (vector unsigned char)tmp1);
 590 /* 1 */      u = (vector signed short)vec_mergeh(zero,
 591                                                  (vector unsigned char)tmp1);
 592
 593 /* 1 */      u = vec_sub(u, c128);
 594 /* 2 */      tmp3 = vec_perm(tmp3, tmp3, yperm);
 595
 596              /* Setup and calculate the 4 v values */
 597 /* 1 */      tmp2 = (vector signed short)vec_mergeh((vector unsigned char)tmp2,
 598                                                     (vector unsigned char)tmp2);
 599 /* 1 */      v = (vector signed short)vec_mergeh(zero,
 600                                                  (vector unsigned char)tmp2);
 601 /* 4 */      tmp2 = vec_mladd(cgu, u, (vector signed short)zero);
 602 /* 1 */      v = vec_sub(v, c128);
 603
 604              /* Move the data into y and start loading the next 4 pixels */
 605 /* 1 */      y = (vector signed short)vec_mergeh(zero,
 606                                                  (vector unsigned char)y);
 607 /* 1 */      tmp3 = (vector signed short)vec_mergeh(zero,
 608                                                     (vector unsigned char)tmp3);
 609 /* 1 */      y = vec_or(y, tmp3);
 610
 611              /* Finish calculating y */
 612 /* 1 */      y = vec_sub(y, c16);
 613 /* 4 */      y = vec_mladd(ymul, y, (vector signed short)zero);
 614
 615              /* Perform non-dependant multiplies first. */
 616 /* 4 */      tmp1 = vec_mladd(crv, v, y);
 617 /* 4 */      tmp2 = vec_mladd(cgv, v, tmp2);
 618 /* 4 */      tmp3 = vec_mladd(cbu, u, y);
 619
 620              /* Calculate rgb values */
 621 /* 1 */      r = vec_sra(tmp1, res);
 622
 623 /* 1 */      tmp2 = vec_sub(y, tmp2);
 624 /* 1 */      tmp2 = vec_add(tmp2, c32);
 625 /* 1 */      g = vec_sra(tmp2, res);
 626
 627 /* 1 */      tmp3 = vec_add(tmp3, c32);
 628 /* 1 */      b = vec_sra(tmp3, res);
 629
 630              /* Bound to 0 <= x <= 255 */
 631 /* 1 */      r = vec_min(r, maxchar);
 632 /* 1 */      g = vec_min(g, maxchar);
 633 /* 1 */      b = vec_min(b, maxchar);
 634 /* 1 */      r = vec_max(r, (vector signed short)zero);
 635 /* 1 */      g = vec_max(g, (vector signed short)zero);
 636 /* 1 */      b = vec_max(b, (vector signed short)zero);
 637
 638              /* Combine r, g and b. */
 639 /* 2 */      rgb1 = vec_perm((vector unsigned char)r, (vector unsigned char)g,
 640                              pickrg1);
 641 /* 2 */      rgb2 = vec_perm((vector unsigned char)r, (vector unsigned char)g,
 642                             pickrg2);
 643
 644 /* 2 */      rgb1 = vec_perm(rgb1, (vector unsigned char)b, pickrgb1);
 645 /* 2 */      rgb2 = vec_perm(rgb2, (vector unsigned char)b, pickrgb2);
 646
 647 /* 1 */      rgb1 = vec_or(alpha, rgb1);
 648 /* 1 */      rgb2 = vec_or(alpha, rgb2);
 649
 650 /* 3 */      vec_stl(rgb1, 0, dp1);
 651              dp1 += 16;
 652 /* 3 */      vec_stl(rgb2, 0, dp1);
 653
 654              /*
 655               * Begin the second row calculations
 656               */
 657
 658              /*
 659               * Load 4 y pixels for the 8x2 pixel block.
 660               */
 661 /* 3 */      yperm = vec_lvsl(0, yp2);
 662 /* 3 */      tmp3 = (vector signed short)vec_lde(0, (unsigned int *)yp2);
 663              yp2 += 4;
 664
 665              /* Save y and load the next 4 y pixels for a total of 8 */
 666 /* 2 */      y = vec_perm(tmp3, tmp3, yperm);
 667 /* 3 */      yperm = vec_lvsr(12, yp2);
 668 /* 3 */      tmp3 = (vector signed short)vec_lde(0, (unsigned int *)yp2);
 669 /* 1 */      y = (vector signed short)vec_mergeh(zero,
 670                                                  (vector unsigned char)y);
 671
 672              /* Avoid dependancy stalls on yperm */
 673 /* 2 */      tmp3 = vec_perm(tmp3, tmp3, yperm);
 674 /* 1 */      tmp3 = (vector signed short)vec_mergeh(zero,
 675                                                     (vector unsigned char)tmp3);
 676 /* 1 */      y = vec_or(y, tmp3);
 677
 678              /* Start the calculation for g */
 679 /* 4 */      tmp2 = vec_mladd(cgu, u, (vector signed short)zero);
 680
 681              /* Finish calculating y */
 682 /* 1 */      y = vec_sub(y, c16);
 683 /* 4 */      y = vec_mladd(ymul, y, (vector signed short)zero);
 684
 685              /* Perform non-dependant multiplies first. */
 686 /* 4 */      tmp2 = vec_mladd(cgv, v, tmp2);
 687 /* 4 */      tmp1 = vec_mladd(crv, v, y);
 688 /* 4 */      tmp3 = vec_mladd(cbu, u, y);
 689
 690              /* Calculate rgb values */
 691 /* 1 */      r = vec_sra(tmp1, res);
 692
 693 /* 1 */      tmp2 = vec_sub(y, tmp2);
 694 /* 1 */      tmp2 = vec_add(tmp2, c32);
 695 /* 1 */      g = vec_sra(tmp2, res);
 696
 697 /* 1 */      tmp3 = vec_add(tmp3, c32);
 698 /* 1 */      b = vec_sra(tmp3, res);
 699
 700              /* Bound to 0 <= x <= 255 */
 701 /* 1 */      r = vec_min(r, maxchar);
 702 /* 1 */      g = vec_min(g, maxchar);
 703 /* 1 */      b = vec_min(b, maxchar);
 704 /* 1 */      r = vec_max(r, (vector signed short)zero);
 705 /* 1 */      g = vec_max(g, (vector signed short)zero);
 706 /* 1 */      b = vec_max(b, (vector signed short)zero);
 707
 708              /* Combine r, g and b. */
 709 /* 2 */      rgb1 = vec_perm((vector unsigned char)r, (vector unsigned char)g,
 710                             pickrg1);
 711 /* 2 */      rgb2 = vec_perm((vector unsigned char)r, (vector unsigned char)g,
 712                             pickrg2);
 713
 714 /* 2 */      rgb1 = vec_perm(rgb1, (vector unsigned char)b, pickrgb1);
 715 /* 2 */      rgb2 = vec_perm(rgb2, (vector unsigned char)b, pickrgb2);
 716
 717 /* 1 */      rgb1 = vec_or(alpha, rgb1);
 718 /* 1 */      rgb2 = vec_or(alpha, rgb2);
 719
 720 /* 3 */      vec_stl(rgb1, 0, dp2);
 721              dp2 += 16;
 722 /* 3 */      vec_stl(rgb2, 0, dp2);
 723
 724              /* Increment the YUV data pointers to the next set of pixels. */
 725              yp1 += 4;
 726              yp2 += 4;
 727              up += 4;
 728              vp += 4;
 729
 730              /* Move the destination pointers to the next set of pixels. */
 731              dp1 += 16;
 732              dp2 += 16;
 733           }
 734
 735         /* jump down one line since we are doing 2 at once */
 736         yp1 += w;
 737         yp2 += w;
 738         dp1 += (w * 4);
 739         dp2 += (w * 4);
 740      }
 741 #endif
 742 #else
 743    _evas_yv12torgb_diz(yuv, rgb, w, h);
 744 #endif
 745 }
 746
 747 static void
 748 _evas_yuv_init(void)
 749 {
 750 #ifdef BUILD_C
 751    int i;
 752
 753    for (i = 0; i < 256; i++)
 754      {
 755         _v1164[i] = (int)(((float)(i - 16 )) * 1.164);
 756
 757         _v1596[i] = (int)(((float)(i - 128)) * 1.596);
 758         _v813[i]  = (int)(((float)(i - 128)) * 0.813);
 759
 760         _v391[i]  = (int)(((float)(i - 128)) * 0.391);
 761         _v2018[i] = (int)(((float)(i - 128)) * 2.018);
 762      }
 763
 764    for (i = -384; i < 640; i++)
 765      {
 766         _clip_lut[i+384] = i < 0 ? 0 : (i > 255) ? 255 : i;
 767      }
 768 #endif
 769 }
 770
 771 static void
 772 _evas_yv12torgb_diz(unsigned char **yuv, unsigned char *rgb, int w, int h)
 773 {
 774 #ifdef BUILD_C
 775    int xx, yy;
 776    int y, u, v, r, g, b;
 777    unsigned char *yp1, *yp2, *up, *vp;
 778    unsigned char *dp1, *dp2;
 779    int crv, cbu, cgu, cgv;
 780
 781    /* destination pointers */
 782    dp1 = rgb;
 783    dp2 = rgb + (w * 4);
 784
 785    crv = CRV;   /* 1.596 */
 786    cbu = CBU;   /* 2.018 */
 787    cgu = CGU;   /* 0.391 */
 788    cgv = CGU;   /* 0.813 */
 789
 790    for (yy = 0; yy < h; yy += 2)
 791      {
 792         /* plane pointers */
 793         yp1 = yuv[yy];
 794         yp2 = yuv[yy + 1];
 795         up = yuv[h + (yy / 2)];
 796         vp = yuv[h + (h / 2) + (yy / 2)];
 797         for (xx = 0; xx < w; xx += 2)
 798           {
 799              /* collect u & v for 2x2 pixel block */
 800              u = (*up++) - 128;
 801              v = (*vp++) - 128;
 802
 803              /* do the top 2 pixels of the 2x2 block whcih shared u & v */
 804              /* yuv to rgb */
 805              A_VAL(dp1) = 0xff;
 806              y = YMUL * ((*yp1++) - 16);
 807              r = (y + (crv * v)) >> 16;
 808              r = LUT_CLIP(r);
 809              R_VAL(dp1) = r;
 810              g = (y - (cgu * u) - (cgv * v) + OFF) >>16;
 811              g = LUT_CLIP(g);
 812              G_VAL(dp1) = g;
 813              b = (y + (cbu * u) + OFF) >> 16;
 814              b = LUT_CLIP(b);
 815              B_VAL(dp1) = b;
 816
 817              dp1 += 4;
 818
 819              /* yuv to rgb */
 820              A_VAL(dp1) = 0xff;
 821              y = YMUL * ((*yp1++) - 16);
 822              r = (y + (crv * v)) >> 16;
 823              r = LUT_CLIP(r);
 824              R_VAL(dp1) = r;
 825              g = (y - (cgu * u) - (cgv * v) + OFF) >>16;
 826              g = LUT_CLIP(g);
 827              G_VAL(dp1) = g;
 828              b = (y + (cbu * u) + OFF) >> 16;
 829              b = LUT_CLIP(b);
 830              B_VAL(dp1) = b;
 831
 832              dp1 += 4;
 833
 834              /* do the bottom 2 pixels */
 835              /* yuv to rgb */
 836              A_VAL(dp2) = 0xff;
 837              y = YMUL * ((*yp2++) - 16);
 838              r = (y + (crv * v)) >> 16;
 839              r = LUT_CLIP(r);
 840              R_VAL(dp2) = r;
 841              g = (y - (cgu * u) - (cgv * v) + OFF) >>16;
 842              g = LUT_CLIP(g);
 843              G_VAL(dp2) = g;
 844              b = (y + (cbu * u) + OFF) >> 16;
 845              b = LUT_CLIP(b);
 846              B_VAL(dp2) = b;
 847
 848              dp2 += 4;
 849
 850              /* yuv to rgb */
 851              A_VAL(dp2) = 0xff;
 852              y = YMUL * ((*yp2++) - 16);
 853              r = (y + (crv * v)) >> 16;
 854              r = LUT_CLIP(r);
 855              R_VAL(dp2) = r;
 856              g = (y - (cgu * u) - (cgv * v) + OFF) >>16;
 857              g = LUT_CLIP(g);
 858              G_VAL(dp2) = g;
 859              b = (y + (cbu * u) + OFF) >> 16;
 860              b = LUT_CLIP(b);
 861              B_VAL(dp2) = b;
 862
 863              dp2 += 4;
 864           }
 865         /* jump down one line since we are doing 2 at once */
 866         dp1 += (w * 4);
 867         dp2 += (w * 4);
 868      }
 869 #endif
 870 }
 871
 872 static void
 873 _evas_yv12torgb_raster(unsigned char **yuv, unsigned char *rgb, int w, int h)
 874 {
 875 #ifdef BUILD_C
 876    int xx, yy;
 877    int y, u, v, r, g, b;
 878    unsigned char *yp1, *yp2, *up, *vp;
 879    unsigned char *dp1, *dp2;
 880
 881    /* destination pointers */
 882    dp1 = rgb;
 883    dp2 = rgb + (w * 4);
 884
 885    for (yy = 0; yy < h; yy += 2)
 886      {
 887         /* plane pointers */
 888         yp1 = yuv[yy];
 889         yp2 = yuv[yy + 1];
 890         up = yuv[h + (yy / 2)];
 891         vp = yuv[h + (h / 2) + (yy / 2)];
 892         for (xx = 0; xx < w; xx += 2)
 893           {
 894              int vmu;
 895
 896              /* collect u & v for 2x2 pixel block */
 897              u = *up++;
 898              v = *vp++;
 899
 900              /* save lookups */
 901              vmu = _v813[v] + _v391[u];
 902              u = _v2018[u];
 903              v = _v1596[v];
 904
 905              /* do the top 2 pixels of the 2x2 block whcih shared u & v */
 906              /* yuv to rgb */
 907              y = _v1164[*yp1++];
 908              r = y + v;
 909              r = LUT_CLIP(r);
 910              g = y -  vmu;
 911              g = LUT_CLIP(g);
 912              b = y + u;
 913              b = LUT_CLIP(b);
 914
 915              A_VAL(dp1) = 0xff;
 916              R_VAL(dp1) = r;
 917              G_VAL(dp1) = g;
 918              B_VAL(dp1) = b;
 919
 920              dp1 += 4;
 921
 922              /* yuv to rgb */
 923              y = _v1164[*yp1++];
 924              r = y + v;
 925              r = LUT_CLIP(r);
 926              g = y -  vmu;
 927              g = LUT_CLIP(g);
 928              b = y + u;
 929              b = LUT_CLIP(b);
 930
 931              A_VAL(dp1) = 0xff;
 932              R_VAL(dp1) = r;
 933              G_VAL(dp1) = g;
 934              B_VAL(dp1) = b;
 935
 936              dp1 += 4;
 937
 938              /* do the bottom 2 pixels */
 939              /* yuv to rgb */
 940              y = _v1164[*yp2++];
 941              r = y + v;
 942              r = LUT_CLIP(r);
 943              g = y -  vmu;
 944              g = LUT_CLIP(g);
 945              b = y + u;
 946              b = LUT_CLIP(b);
 947
 948              A_VAL(dp2) = 0xff;
 949              R_VAL(dp2) = r;
 950              G_VAL(dp2) = g;
 951              B_VAL(dp2) = b;
 952
 953              dp2 += 4;
 954
 955              /* yuv to rgb */
 956              y = _v1164[*yp2++];
 957              r = y + v;
 958              r = LUT_CLIP(r);
 959              g = y -  vmu;
 960              g = LUT_CLIP(g);
 961              b = y + u;
 962              b = LUT_CLIP(b);
 963
 964              A_VAL(dp2) = 0xff;
 965              R_VAL(dp2) = r;
 966              G_VAL(dp2) = g;
 967              B_VAL(dp2) = b;
 968
 969              dp2 += 4;
 970           }
 971         /* jump down one line since we are doing 2 at once */
 972         dp1 += (w * 4);
 973         dp2 += (w * 4);
 974      }
 975 #endif
 976 }
 977
 978 #endif