src/lib/engines/common/evas_convert_yuv.c

   1 #include "evas_common.h"
   2 #include "evas_convert_yuv.h"
   3
   4 #if defined BUILD_MMX || defined BUILD_SSE
   5 # include "evas_mmx.h"
   6 #endif
   7
   8 #if defined HAVE_ALTIVEC_H
   9 # include <altivec.h>
  10 #ifdef CONFIG_DARWIN
  11 #define AVV(x...) (x)
  12 #else
  13 #define AVV(x...) {x}
  14 #endif
  15
  16 #endif
  17
  18 #ifdef BUILD_CONVERT_YUV
  19
  20 static void _evas_yuv_init         (void);
  21 static void _evas_yv12torgb_sse    (unsigned char **yuv, unsigned char *rgb, int w, int h);
  22 static void _evas_yv12torgb_mmx    (unsigned char **yuv, unsigned char *rgb, int w, int h);
  23 #ifdef BUILD_ALTIVEC
  24 static void _evas_yv12torgb_altivec(unsigned char **yuv, unsigned char *rgb, int w, int h);
  25 static void _evas_yv12torgb_diz    (unsigned char **yuv, unsigned char *rgb, int w, int h);
  26 #endif
  27 static void _evas_yv12torgb_raster (unsigned char **yuv, unsigned char *rgb, int w, int h);
  28 static void _evas_yuy2torgb_raster (unsigned char **yuv, unsigned char *rgb, int w, int h);
  29
  30 #define CRV    104595
  31 #define CBU    132251
  32 #define CGU    25624
  33 #define CGV    53280
  34 #define YMUL   76283
  35 #define OFF    32768
  36 #define BITRES 16
  37
  38 /* calculation float resolution in bits */
  39 /* ie RES = 6 is 10.6 fixed point */
  40 /*    RES = 8 is 8.8 fixed point */
  41 /*    RES = 4 is 12.4 fixed point */
  42 /* NB: going above 6 will lead to overflow... :( */
  43 #define RES    6
  44
  45 #define RZ(i)  (i >> (BITRES - RES))
  46 #define FOUR(i) {i, i, i, i}
  47
  48 #if defined BUILD_MMX || defined BUILD_SSE
  49 __attribute__ ((aligned (8))) const volatile unsigned short _const_crvcrv[4] = FOUR(RZ(CRV));
  50 __attribute__ ((aligned (8))) const volatile unsigned short _const_cbucbu[4] = FOUR(RZ(CBU));
  51 __attribute__ ((aligned (8))) const volatile unsigned short _const_cgucgu[4] = FOUR(RZ(CGU));
  52 __attribute__ ((aligned (8))) const volatile unsigned short _const_cgvcgv[4] = FOUR(RZ(CGV));
  53 __attribute__ ((aligned (8))) const volatile unsigned short _const_ymul  [4] = FOUR(RZ(YMUL));
  54 __attribute__ ((aligned (8))) const volatile unsigned short _const_128   [4] = FOUR(128);
  55 __attribute__ ((aligned (8))) const volatile unsigned short _const_32    [4] = FOUR(RZ(OFF));
  56 __attribute__ ((aligned (8))) const volatile unsigned short _const_16    [4] = FOUR(16);
  57 __attribute__ ((aligned (8))) const volatile unsigned short _const_ff    [4] = FOUR(-1);
  58
  59 #define CONST_CRVCRV *_const_crvcrv
  60 #define CONST_CBUCBU *_const_cbucbu
  61 #define CONST_CGUCGU *_const_cgucgu
  62 #define CONST_CGVCGV *_const_cgvcgv
  63 #define CONST_YMUL   *_const_ymul
  64 #define CONST_128    *_const_128
  65 #define CONST_32     *_const_32
  66 #define CONST_16     *_const_16
  67 #define CONST_FF     *_const_ff
  68
  69 /* for C non aligned cleanup */
  70 const int _crv = RZ(CRV);   /* 1.596 */
  71 const int _cbu = RZ(CBU);   /* 2.018 */
  72 const int _cgu = RZ(CGU);   /* 0.391 */
  73 const int _cgv = RZ(CGV);   /* 0.813 */
  74
  75 #endif
  76
  77 #ifdef BUILD_ALTIVEC
  78 #ifdef __VEC__
  79 const vector unsigned short res     = AVV(RES);
  80 const vector signed short crv       = AVV(RZ(CRV));
  81 const vector signed short cbu       = AVV(RZ(CBU));
  82 const vector signed short cgu       = AVV(RZ(CGU));
  83 const vector signed short cgv       = AVV(RZ(CGV));
  84 const vector signed short ymul      = AVV(RZ(YMUL));
  85 const vector signed short c128      = AVV(128);
  86 const vector signed short c32       = AVV(RZ(OFF));
  87 const vector signed short c16       = AVV(16);
  88 const vector unsigned char zero     = AVV(0);
  89 const vector signed short maxchar   = AVV(255);
  90 const vector unsigned char pickrg1  = AVV(0, 0x1, 0x11, 0,
  91                                           0, 0x3, 0x13, 0,
  92                                           0, 0x5, 0x15, 0,
  93                                           0, 0x7, 0x17, 0);
  94 const vector unsigned char pickrg2  = AVV(0, 0x9, 0x19, 0,
  95                                           0, 0xb, 0x1b, 0,
  96                                           0, 0xd, 0x1d, 0,
  97                                           0, 0xf, 0x1f, 0);
  98 const vector unsigned char pickrgb1 = AVV(0x3, 0x1, 0x2, 0x11,
  99                                           0x7, 0x5, 0x6, 0x13,
 100                                           0xb, 0x9, 0xa, 0x15,
 101                                           0xf, 0xd, 0xe, 0x17);
 102 const vector unsigned char pickrgb2 = AVV(0x3, 0x1, 0x2, 0x19,
 103                                           0x7, 0x5, 0x6, 0x1b,
 104                                           0xb, 0x9, 0xa, 0x1d,
 105                                           0xf, 0xd, 0xe, 0x1f);
 106 #endif
 107 #endif
 108
 109 #ifdef BUILD_C
 110
 111 /* shortcut speedup lookup-tables */
 112 static short _v1164[256];
 113 static short _v1596[256];
 114 static short _v813[256];
 115 static short _v391[256];
 116 static short _v2018[256];
 117
 118 static unsigned char _clip_lut[1024];
 119 #define LUT_CLIP(i) ((_clip_lut+384)[(i)])
 120
 121 #define CMP_CLIP(i) ((i&256)? (~(i>>10)) : i);
 122
 123 static int initted = 0;
 124
 125 #endif
 126
 127 void
 128 evas_common_convert_yuv_420p_601_rgba(DATA8 **src, DATA8 *dst, int w, int h)
 129 {
 130    int mmx, sse, sse2;
 131
 132 #if defined BUILD_MMX || defined BUILD_SSE
 133    evas_common_cpu_can_do(&mmx, &sse, &sse2);
 134 #endif
 135 #ifndef BUILD_SSE
 136    sse = 0;
 137    sse2 = 0;
 138 #endif
 139 #ifndef BUILD_MMX
 140    mmx = 0;
 141 #endif
 142    if (evas_common_cpu_has_feature(CPU_FEATURE_MMX2))
 143      _evas_yv12torgb_sse(src, dst, w, h);
 144    else if (evas_common_cpu_has_feature(CPU_FEATURE_MMX))
 145      _evas_yv12torgb_mmx(src, dst, w, h);
 146 #ifdef BUILD_ALTIVEC
 147    if (evas_common_cpu_has_feature(CPU_FEATURE_ALTIVEC))
 148      _evas_yv12torgb_altivec(src, dst, w, h);
 149 #endif
 150    else
 151      {
 152 #ifdef BUILD_C
 153         if (!initted) _evas_yuv_init();
 154         initted = 1;
 155         /* FIXME: diz may be faster sometimes */
 156         _evas_yv12torgb_raster(src, dst, w, h);
 157 #endif
 158      }
 159 }
 160
 161 /* Thanks to Diz for this code. i've munged it a little and turned it into */
 162 /* inline macros. I tried beating it with a different algorithm using MMX */
 163 /* but failed. So here we are. This is the fastest YUV->RGB i know of for */
 164 /* x86. It has an issue that it doesn't convert colours accurately so the */
 165 /* image looks a little "yellowy". This is a result of only 10.6 fixed point */
 166 /* resolution as opposed to 16.16 in the C code. This could be fixed by */
 167 /* processing half the number of pixels per cycle and going up to 32bits */
 168 /* per element during compute, but it would all but negate the speedup */
 169 /* from mmx I think :( It might be possible to use SSE and SSE2 here, but */
 170 /* I haven't tried yet. Let's see. */
 171
 172 /* NB: XviD has almost the same code in it's assembly YV12->RGB code. same */
 173 /* algorithm, same constants, same all over actually, except it actually */
 174 /* does a few extra memory accesses that this one doesn't, so in theory */
 175 /* this code should be faster. In the end it's all just an mmx version of */
 176 /* the reference implimentation done with fixed point math */
 177
 178 static void
 179 _evas_yv12torgb_sse(unsigned char **yuv, unsigned char *rgb, int w, int h)
 180 {
 181 #ifdef BUILD_SSE
 182    int xx, yy;
 183    register unsigned char *yp1, *up, *vp;
 184    unsigned char *dp1;
 185
 186    /* destination pointers */
 187    dp1 = rgb;
 188
 189    for (yy = 0; yy < h; yy++)
 190      {
 191         /* plane pointers */
 192         yp1 = yuv[yy];
 193         up = yuv[h + (yy / 2)];
 194         vp = yuv[h + (h / 2) + (yy / 2)];
 195         for (xx = 0; xx < (w - 7); xx += 8)
 196           {
 197              movd_m2r(*up, mm3);
 198              movd_m2r(*vp, mm2);
 199              movq_m2r(*yp1, mm0);
 200
 201              pxor_r2r(mm7, mm7);
 202              punpcklbw_r2r(mm7, mm2);
 203              punpcklbw_r2r(mm7, mm3);
 204
 205              movq_r2r(mm0, mm1);
 206              psrlw_i2r(8, mm0);
 207              psllw_i2r(8, mm1);
 208              psrlw_i2r(8, mm1);
 209
 210              movq_m2r(CONST_16, mm4);
 211              psubsw_r2r(mm4, mm0);
 212              psubsw_r2r(mm4, mm1);
 213
 214              movq_m2r(CONST_128, mm5);
 215              psubsw_r2r(mm5, mm2);
 216              psubsw_r2r(mm5, mm3);
 217
 218              movq_m2r(CONST_YMUL, mm4);
 219              pmullw_r2r(mm4, mm0);
 220              pmullw_r2r(mm4, mm1);
 221
 222              movq_m2r(CONST_CRVCRV, mm7);
 223              pmullw_r2r(mm3, mm7);
 224              movq_m2r(CONST_CBUCBU, mm6);
 225              pmullw_r2r(mm2, mm6);
 226              movq_m2r(CONST_CGUCGU, mm5);
 227              pmullw_r2r(mm2, mm5);
 228              movq_m2r(CONST_CGVCGV, mm4);
 229              pmullw_r2r(mm3, mm4);
 230
 231              movq_r2r(mm0, mm2);
 232              paddsw_r2r(mm7, mm2);
 233              paddsw_r2r(mm1, mm7);
 234
 235              psraw_i2r(RES, mm2);
 236              psraw_i2r(RES, mm7);
 237              packuswb_r2r(mm7, mm2);
 238
 239              pxor_r2r(mm7, mm7);
 240              movq_r2r(mm2, mm3);
 241              punpckhbw_r2r(mm7, mm2);
 242              punpcklbw_r2r(mm3, mm7);
 243              por_r2r(mm7, mm2);
 244
 245              movq_r2r(mm0, mm3);
 246              psubsw_r2r(mm5, mm3);
 247              psubsw_r2r(mm4, mm3);
 248              paddsw_m2r(CONST_32, mm3);
 249
 250              movq_r2r(mm1, mm7);
 251              psubsw_r2r(mm5, mm7);
 252              psubsw_r2r(mm4, mm7);
 253              paddsw_m2r(CONST_32, mm7);
 254
 255              psraw_i2r(RES, mm3);
 256              psraw_i2r(RES, mm7);
 257              packuswb_r2r(mm7, mm3);
 258
 259              pxor_r2r(mm7, mm7);
 260              movq_r2r(mm3, mm4);
 261              punpckhbw_r2r(mm7, mm3);
 262              punpcklbw_r2r(mm4, mm7);
 263              por_r2r(mm7, mm3);
 264
 265              movq_m2r(CONST_32, mm4);
 266              paddsw_r2r(mm6, mm0);
 267              paddsw_r2r(mm6, mm1);
 268              paddsw_r2r(mm4, mm0);
 269              paddsw_r2r(mm4, mm1);
 270              psraw_i2r(RES, mm0);
 271              psraw_i2r(RES, mm1);
 272              packuswb_r2r(mm1, mm0);
 273
 274              pxor_r2r(mm7, mm7);
 275              movq_r2r(mm0, mm5);
 276              punpckhbw_r2r(mm7, mm0);
 277              punpcklbw_r2r(mm5, mm7);
 278              por_r2r(mm7, mm0);
 279
 280              movq_m2r(CONST_FF, mm1);
 281              movq_r2r(mm0, mm5);
 282              movq_r2r(mm3, mm6);
 283              movq_r2r(mm2, mm7);
 284              punpckhbw_r2r(mm3, mm2);
 285              punpcklbw_r2r(mm6, mm7);
 286              punpckhbw_r2r(mm1, mm0);
 287              punpcklbw_r2r(mm1, mm5);
 288
 289              movq_r2r(mm7, mm1);
 290              punpckhwd_r2r(mm5, mm7);
 291              punpcklwd_r2r(mm5, mm1);
 292
 293              movq_r2r(mm2, mm4);
 294              punpckhwd_r2r(mm0, mm2);
 295              punpcklwd_r2r(mm0, mm4);
 296
 297              movntq_r2m(mm1, *(dp1));
 298              movntq_r2m(mm7, *(dp1 + 8));
 299              movntq_r2m(mm4, *(dp1 + 16));
 300              movntq_r2m(mm2, *(dp1 + 24));
 301
 302              yp1 += 8;
 303              up += 4;
 304              vp += 4;
 305              dp1 += 8 * 4;
 306           }
 307         /* cleanup pixles that arent a multiple of 8 pixels wide */
 308         if (xx < w)
 309           {
 310              int y, u, v, r, g, b;
 311
 312              for (; xx < w; xx += 2)
 313                {
 314                   u = (*up++) - 128;
 315                   v = (*vp++) - 128;
 316
 317                   y = RZ(YMUL) * ((*yp1++) - 16);
 318                   r = LUT_CLIP((y + (_crv * v)) >> RES);
 319                   g = LUT_CLIP((y - (_cgu * u) - (_cgv * v) + RZ(OFF)) >> RES);
 320                   b = LUT_CLIP((y + (_cbu * u) + RZ(OFF)) >> RES);
 321                   *((DATA32 *) dp1) = 0xff000000 + RGB_JOIN(r,g,b);
 322
 323                   dp1 += 4;
 324
 325                   y = RZ(YMUL) * ((*yp1++) - 16);
 326                   r = LUT_CLIP((y + (_crv * v)) >> RES);
 327                   g = LUT_CLIP((y - (_cgu * u) - (_cgv * v) + RZ(OFF)) >> RES);
 328                   b = LUT_CLIP((y + (_cbu * u) + RZ(OFF)) >> RES);
 329                   *((DATA32 *) dp1) = 0xff000000 + RGB_JOIN(r,g,b);
 330
 331                   dp1 += 4;
 332                }
 333           }
 334      }
 335    emms();
 336 #else
 337    _evas_yv12torgb_mmx(yuv, rgb, w, h);
 338 #endif
 339 }
 340
 341 static void
 342 _evas_yv12torgb_mmx(unsigned char **yuv, unsigned char *rgb, int w, int h)
 343 {
 344 #ifdef BUILD_MMX
 345    int xx, yy;
 346    register unsigned char *yp1, *up, *vp;
 347    unsigned char *dp1;
 348
 349    /* destination pointers */
 350    dp1 = rgb;
 351
 352    for (yy = 0; yy < h; yy++)
 353      {
 354         /* plane pointers */
 355         yp1 = yuv[yy];
 356         up = yuv[h + (yy / 2)];
 357         vp = yuv[h + (h / 2) + (yy / 2)];
 358         for (xx = 0; xx < (w - 7); xx += 8)
 359           {
 360              movd_m2r(*up, mm3);
 361              movd_m2r(*vp, mm2);
 362              movq_m2r(*yp1, mm0);
 363
 364              pxor_r2r(mm7, mm7);
 365              punpcklbw_r2r(mm7, mm2);
 366              punpcklbw_r2r(mm7, mm3);
 367
 368              movq_r2r(mm0, mm1);
 369              psrlw_i2r(8, mm0);
 370              psllw_i2r(8, mm1);
 371              psrlw_i2r(8, mm1);
 372
 373              movq_m2r(CONST_16, mm4);
 374              psubsw_r2r(mm4, mm0);
 375              psubsw_r2r(mm4, mm1);
 376
 377              movq_m2r(CONST_128, mm5);
 378              psubsw_r2r(mm5, mm2);
 379              psubsw_r2r(mm5, mm3);
 380
 381              movq_m2r(CONST_YMUL, mm4);
 382              pmullw_r2r(mm4, mm0);
 383              pmullw_r2r(mm4, mm1);
 384
 385              movq_m2r(CONST_CRVCRV, mm7);
 386              pmullw_r2r(mm3, mm7);
 387              movq_m2r(CONST_CBUCBU, mm6);
 388              pmullw_r2r(mm2, mm6);
 389              movq_m2r(CONST_CGUCGU, mm5);
 390              pmullw_r2r(mm2, mm5);
 391              movq_m2r(CONST_CGVCGV, mm4);
 392              pmullw_r2r(mm3, mm4);
 393
 394              movq_r2r(mm0, mm2);
 395              paddsw_r2r(mm7, mm2);
 396              paddsw_r2r(mm1, mm7);
 397
 398              psraw_i2r(RES, mm2);
 399              psraw_i2r(RES, mm7);
 400              packuswb_r2r(mm7, mm2);
 401
 402              pxor_r2r(mm7, mm7);
 403              movq_r2r(mm2, mm3);
 404              punpckhbw_r2r(mm7, mm2);
 405              punpcklbw_r2r(mm3, mm7);
 406              por_r2r(mm7, mm2);
 407
 408              movq_r2r(mm0, mm3);
 409              psubsw_r2r(mm5, mm3);
 410              psubsw_r2r(mm4, mm3);
 411              paddsw_m2r(CONST_32, mm3);
 412
 413              movq_r2r(mm1, mm7);
 414              psubsw_r2r(mm5, mm7);
 415              psubsw_r2r(mm4, mm7);
 416              paddsw_m2r(CONST_32, mm7);
 417
 418              psraw_i2r(RES, mm3);
 419              psraw_i2r(RES, mm7);
 420              packuswb_r2r(mm7, mm3);
 421
 422              pxor_r2r(mm7, mm7);
 423              movq_r2r(mm3, mm4);
 424              punpckhbw_r2r(mm7, mm3);
 425              punpcklbw_r2r(mm4, mm7);
 426              por_r2r(mm7, mm3);
 427
 428              movq_m2r(CONST_32, mm4);
 429              paddsw_r2r(mm6, mm0);
 430              paddsw_r2r(mm6, mm1);
 431              paddsw_r2r(mm4, mm0);
 432              paddsw_r2r(mm4, mm1);
 433              psraw_i2r(RES, mm0);
 434              psraw_i2r(RES, mm1);
 435              packuswb_r2r(mm1, mm0);
 436
 437              pxor_r2r(mm7, mm7);
 438              movq_r2r(mm0, mm5);
 439              punpckhbw_r2r(mm7, mm0);
 440              punpcklbw_r2r(mm5, mm7);
 441              por_r2r(mm7, mm0);
 442
 443              movq_m2r(CONST_FF, mm1);
 444              movq_r2r(mm0, mm5);
 445              movq_r2r(mm3, mm6);
 446              movq_r2r(mm2, mm7);
 447              punpckhbw_r2r(mm3, mm2);
 448              punpcklbw_r2r(mm6, mm7);
 449              punpckhbw_r2r(mm1, mm0);
 450              punpcklbw_r2r(mm1, mm5);
 451
 452              movq_r2r(mm7, mm1);
 453              punpckhwd_r2r(mm5, mm7);
 454              punpcklwd_r2r(mm5, mm1);
 455
 456              movq_r2r(mm2, mm4);
 457              punpckhwd_r2r(mm0, mm2);
 458              punpcklwd_r2r(mm0, mm4);
 459
 460              movq_r2m(mm1, *(dp1));
 461              movq_r2m(mm7, *(dp1 + 8));
 462              movq_r2m(mm4, *(dp1 + 16));
 463              movq_r2m(mm2, *(dp1 + 24));
 464
 465              yp1 += 8;
 466              up += 4;
 467              vp += 4;
 468              dp1 += 8 * 4;
 469           }
 470         /* cleanup pixles that arent a multiple of 8 pixels wide */
 471         if (xx < w)
 472           {
 473              int y, u, v, r, g, b;
 474
 475              for (; xx < w; xx += 2)
 476                {
 477                   u = (*up++) - 128;
 478                   v = (*vp++) - 128;
 479
 480                   y = RZ(YMUL) * ((*yp1++) - 16);
 481                   r = LUT_CLIP((y + (_crv * v)) >> RES);
 482                   g = LUT_CLIP((y - (_cgu * u) - (_cgv * v) + RZ(OFF)) >> RES);
 483                   b = LUT_CLIP((y + (_cbu * u) + RZ(OFF)) >> RES);
 484                   *((DATA32 *) dp1) = 0xff000000 + RGB_JOIN(r,g,b);
 485
 486                   dp1 += 4;
 487
 488                   y = RZ(YMUL) * ((*yp1++) - 16);
 489                   r = LUT_CLIP((y + (_crv * v)) >> RES);
 490                   g = LUT_CLIP((y - (_cgu * u) - (_cgv * v) + RZ(OFF)) >> RES);
 491                   b = LUT_CLIP((y + (_cbu * u) + RZ(OFF)) >> RES);
 492                   *((DATA32 *) dp1) = 0xff000000 + RGB_JOIN(r,g,b);
 493
 494                   dp1 += 4;
 495                }
 496           }
 497      }
 498    emms();
 499 #else
 500    _evas_yv12torgb_raster(yuv, rgb, w, h);
 501 #endif
 502 }
 503
 504 #ifdef BUILD_ALTIVEC
 505 static void
 506 _evas_yv12torgb_altivec(unsigned char **yuv, unsigned char *rgb, int w, int h)
 507 {
 508 #ifdef __VEC__
 509    int xx, yy;
 510    int w2, h2;
 511    unsigned char *yp1, *yp2, *up, *vp;
 512    unsigned char *dp1, *dp2;
 513    vector signed short y, u, v;
 514    vector signed short r, g, b;
 515    vector signed short tmp1, tmp2, tmp3;
 516    vector unsigned char yperm, uperm, vperm, rgb1, rgb2;
 517    vector unsigned char alpha;
 518
 519    /* handy halved w & h */
 520    w2 = w / 2;
 521    h2 = h / 2;
 522    /* plane pointers */
 523    yp1 = yuv;
 524    yp2 = yuv + w;
 525    up = yuv + (w * h);
 526    vp = up + (w2 * h2);
 527    /* destination pointers */
 528    dp1 = rgb;
 529    dp2 = rgb + (w * 4);
 530
 531    alpha = vec_mergeh((vector unsigned char)AVV(255), zero);
 532    alpha = (vector unsigned char)vec_mergeh((vector unsigned short)alpha,
 533                                             (vector unsigned short)zero);
 534
 535    for (yy = 0; yy < h2; yy++)
 536      {
 537         for (xx = 0; xx < w2; xx += 4)
 538           {
 539 /* Cycles */
 540              /*
 541               * Load 4 y and 4 u & v pixels for the 8x2 pixel block.
 542               */
 543 /* 3 */      tmp3 = (vector signed short)vec_lde(0, (unsigned int *)yp1);
 544 /* 3 */      tmp1 = (vector signed short)vec_lde(0, (unsigned int *)up);
 545 /* 3 */      tmp2 = (vector signed short)vec_lde(0, (unsigned int *)vp);
 546
 547              /* Prepare for aligning the data in their vectors */
 548 /* 3 */      yperm = vec_lvsl(0, yp1);
 549 /* 3 */      uperm = vec_lvsl(0, up);
 550 /* 3 */      vperm = vec_lvsl(0, vp);
 551              yp1 += 4;
 552
 553              /* Save y and load the next 4 y pixels for a total of 8 */
 554 /* 2 */      y = vec_perm(tmp3, tmp3, yperm);
 555 /* 3 */      tmp3 = (vector signed short)vec_lde(0, (unsigned int *)yp1);
 556
 557              /* Setup and calculate the 4 u pixels */
 558 /* 2 */      tmp1 = vec_perm(tmp1, tmp1, uperm);
 559 /* 2 */      tmp2 = vec_perm(tmp2, tmp2, vperm);
 560
 561              /* Avoid dependency stalls on yperm and calculate the 4 u values */
 562 /* 3 */      yperm = vec_lvsr(12, yp1);
 563 /* 1 */      tmp1 = (vector signed short)vec_mergeh((vector unsigned char)tmp1,
 564                                                     (vector unsigned char)tmp1);
 565 /* 1 */      u = (vector signed short)vec_mergeh(zero,
 566                                                  (vector unsigned char)tmp1);
 567
 568 /* 1 */      u = vec_sub(u, c128);
 569 /* 2 */      tmp3 = vec_perm(tmp3, tmp3, yperm);
 570
 571              /* Setup and calculate the 4 v values */
 572 /* 1 */      tmp2 = (vector signed short)vec_mergeh((vector unsigned char)tmp2,
 573                                                     (vector unsigned char)tmp2);
 574 /* 1 */      v = (vector signed short)vec_mergeh(zero,
 575                                                  (vector unsigned char)tmp2);
 576 /* 4 */      tmp2 = vec_mladd(cgu, u, (vector signed short)zero);
 577 /* 1 */      v = vec_sub(v, c128);
 578
 579              /* Move the data into y and start loading the next 4 pixels */
 580 /* 1 */      y = (vector signed short)vec_mergeh(zero,
 581                                                  (vector unsigned char)y);
 582 /* 1 */      tmp3 = (vector signed short)vec_mergeh(zero,
 583                                                     (vector unsigned char)tmp3);
 584 /* 1 */      y = vec_or(y, tmp3);
 585
 586              /* Finish calculating y */
 587 /* 1 */      y = vec_sub(y, c16);
 588 /* 4 */      y = vec_mladd(ymul, y, (vector signed short)zero);
 589
 590              /* Perform non-dependent multiplies first. */
 591 /* 4 */      tmp1 = vec_mladd(crv, v, y);
 592 /* 4 */      tmp2 = vec_mladd(cgv, v, tmp2);
 593 /* 4 */      tmp3 = vec_mladd(cbu, u, y);
 594
 595              /* Calculate rgb values */
 596 /* 1 */      r = vec_sra(tmp1, res);
 597
 598 /* 1 */      tmp2 = vec_sub(y, tmp2);
 599 /* 1 */      tmp2 = vec_add(tmp2, c32);
 600 /* 1 */      g = vec_sra(tmp2, res);
 601
 602 /* 1 */      tmp3 = vec_add(tmp3, c32);
 603 /* 1 */      b = vec_sra(tmp3, res);
 604
 605              /* Bound to 0 <= x <= 255 */
 606 /* 1 */      r = vec_min(r, maxchar);
 607 /* 1 */      g = vec_min(g, maxchar);
 608 /* 1 */      b = vec_min(b, maxchar);
 609 /* 1 */      r = vec_max(r, (vector signed short)zero);
 610 /* 1 */      g = vec_max(g, (vector signed short)zero);
 611 /* 1 */      b = vec_max(b, (vector signed short)zero);
 612
 613              /* Combine r, g and b. */
 614 /* 2 */      rgb1 = vec_perm((vector unsigned char)r, (vector unsigned char)g,
 615                              pickrg1);
 616 /* 2 */      rgb2 = vec_perm((vector unsigned char)r, (vector unsigned char)g,
 617                             pickrg2);
 618
 619 /* 2 */      rgb1 = vec_perm(rgb1, (vector unsigned char)b, pickrgb1);
 620 /* 2 */      rgb2 = vec_perm(rgb2, (vector unsigned char)b, pickrgb2);
 621
 622 /* 1 */      rgb1 = vec_or(alpha, rgb1);
 623 /* 1 */      rgb2 = vec_or(alpha, rgb2);
 624
 625 /* 3 */      vec_stl(rgb1, 0, dp1);
 626              dp1 += 16;
 627 /* 3 */      vec_stl(rgb2, 0, dp1);
 628
 629              /*
 630               * Begin the second row calculations
 631               */
 632
 633              /*
 634               * Load 4 y pixels for the 8x2 pixel block.
 635               */
 636 /* 3 */      yperm = vec_lvsl(0, yp2);
 637 /* 3 */      tmp3 = (vector signed short)vec_lde(0, (unsigned int *)yp2);
 638              yp2 += 4;
 639
 640              /* Save y and load the next 4 y pixels for a total of 8 */
 641 /* 2 */      y = vec_perm(tmp3, tmp3, yperm);
 642 /* 3 */      yperm = vec_lvsr(12, yp2);
 643 /* 3 */      tmp3 = (vector signed short)vec_lde(0, (unsigned int *)yp2);
 644 /* 1 */      y = (vector signed short)vec_mergeh(zero,
 645                                                  (vector unsigned char)y);
 646
 647              /* Avoid dependency stalls on yperm */
 648 /* 2 */      tmp3 = vec_perm(tmp3, tmp3, yperm);
 649 /* 1 */      tmp3 = (vector signed short)vec_mergeh(zero,
 650                                                     (vector unsigned char)tmp3);
 651 /* 1 */      y = vec_or(y, tmp3);
 652
 653              /* Start the calculation for g */
 654 /* 4 */      tmp2 = vec_mladd(cgu, u, (vector signed short)zero);
 655
 656              /* Finish calculating y */
 657 /* 1 */      y = vec_sub(y, c16);
 658 /* 4 */      y = vec_mladd(ymul, y, (vector signed short)zero);
 659
 660              /* Perform non-dependent multiplies first. */
 661 /* 4 */      tmp2 = vec_mladd(cgv, v, tmp2);
 662 /* 4 */      tmp1 = vec_mladd(crv, v, y);
 663 /* 4 */      tmp3 = vec_mladd(cbu, u, y);
 664
 665              /* Calculate rgb values */
 666 /* 1 */      r = vec_sra(tmp1, res);
 667
 668 /* 1 */      tmp2 = vec_sub(y, tmp2);
 669 /* 1 */      tmp2 = vec_add(tmp2, c32);
 670 /* 1 */      g = vec_sra(tmp2, res);
 671
 672 /* 1 */      tmp3 = vec_add(tmp3, c32);
 673 /* 1 */      b = vec_sra(tmp3, res);
 674
 675              /* Bound to 0 <= x <= 255 */
 676 /* 1 */      r = vec_min(r, maxchar);
 677 /* 1 */      g = vec_min(g, maxchar);
 678 /* 1 */      b = vec_min(b, maxchar);
 679 /* 1 */      r = vec_max(r, (vector signed short)zero);
 680 /* 1 */      g = vec_max(g, (vector signed short)zero);
 681 /* 1 */      b = vec_max(b, (vector signed short)zero);
 682
 683              /* Combine r, g and b. */
 684 /* 2 */      rgb1 = vec_perm((vector unsigned char)r, (vector unsigned char)g,
 685                             pickrg1);
 686 /* 2 */      rgb2 = vec_perm((vector unsigned char)r, (vector unsigned char)g,
 687                             pickrg2);
 688
 689 /* 2 */      rgb1 = vec_perm(rgb1, (vector unsigned char)b, pickrgb1);
 690 /* 2 */      rgb2 = vec_perm(rgb2, (vector unsigned char)b, pickrgb2);
 691
 692 /* 1 */      rgb1 = vec_or(alpha, rgb1);
 693 /* 1 */      rgb2 = vec_or(alpha, rgb2);
 694
 695 /* 3 */      vec_stl(rgb1, 0, dp2);
 696              dp2 += 16;
 697 /* 3 */      vec_stl(rgb2, 0, dp2);
 698
 699              /* Increment the YUV data pointers to the next set of pixels. */
 700              yp1 += 4;
 701              yp2 += 4;
 702              up += 4;
 703              vp += 4;
 704
 705              /* Move the destination pointers to the next set of pixels. */
 706              dp1 += 16;
 707              dp2 += 16;
 708           }
 709
 710         /* jump down one line since we are doing 2 at once */
 711         yp1 += w;
 712         yp2 += w;
 713         dp1 += (w * 4);
 714         dp2 += (w * 4);
 715      }
 716 #else
 717    _evas_yv12torgb_diz(yuv, rgb, w, h);
 718 #endif
 719 }
 720 #endif
 721
 722 static void
 723 _evas_yuv_init(void)
 724 {
 725 #ifdef BUILD_C
 726    int i;
 727
 728    for (i = 0; i < 256; i++)
 729      {
 730         _v1164[i] = (int)(((float)(i - 16 )) * 1.164);
 731
 732         _v1596[i] = (int)(((float)(i - 128)) * 1.596);
 733         _v813[i]  = (int)(((float)(i - 128)) * 0.813);
 734
 735         _v391[i]  = (int)(((float)(i - 128)) * 0.391);
 736         _v2018[i] = (int)(((float)(i - 128)) * 2.018);
 737      }
 738
 739    for (i = -384; i < 640; i++)
 740      {
 741         _clip_lut[i+384] = i < 0 ? 0 : (i > 255) ? 255 : i;
 742      }
 743 #endif
 744 }
 745
 746 #ifdef BUILD_ALTIVEC
 747 static void
 748 _evas_yv12torgb_diz(unsigned char **yuv, unsigned char *rgb, int w, int h)
 749 {
 750 #ifdef BUILD_C
 751    int xx, yy;
 752    int y, u, v, r, g, b;
 753    unsigned char *yp1, *yp2, *up, *vp;
 754    unsigned char *dp1, *dp2;
 755    int crv, cbu, cgu, cgv;
 756
 757    /* destination pointers */
 758    dp1 = rgb;
 759    dp2 = rgb + (w * 4);
 760
 761    crv = CRV;   /* 1.596 */
 762    cbu = CBU;   /* 2.018 */
 763    cgu = CGU;   /* 0.391 */
 764    cgv = CGV;   /* 0.813 */
 765
 766    for (yy = 0; yy < h; yy += 2)
 767      {
 768         /* plane pointers */
 769         yp1 = yuv[yy];
 770         yp2 = yuv[yy + 1];
 771         up = yuv[h + (yy / 2)];
 772         vp = yuv[h + (h / 2) + (yy / 2)];
 773         for (xx = 0; xx < w; xx += 2)
 774           {
 775              /* collect u & v for 2x2 pixel block */
 776              u = (*up++) - 128;
 777              v = (*vp++) - 128;
 778
 779              /* do the top 2 pixels of the 2x2 block which shared u & v */
 780              /* yuv to rgb */
 781              y = YMUL * ((*yp1++) - 16);
 782              r = LUT_CLIP((y + (crv * v)) >> 16);
 783              g = LUT_CLIP((y - (cgu * u) - (cgv * v) + OFF) >>16);
 784              b = LUT_CLIP((y + (cbu * u) + OFF) >> 16);
 785              *((DATA32 *) dp1) = 0xff000000 + RGB_JOIN(r,g,b);
 786
 787              dp1 += 4;
 788
 789              /* yuv to rgb */
 790              y = YMUL * ((*yp1++) - 16);
 791              r = LUT_CLIP((y + (crv * v)) >> 16);
 792              g = LUT_CLIP((y - (cgu * u) - (cgv * v) + OFF) >>16);
 793              b = LUT_CLIP((y + (cbu * u) + OFF) >> 16);
 794              *((DATA32 *) dp1) = 0xff000000 + RGB_JOIN(r,g,b);
 795
 796              dp1 += 4;
 797
 798              /* do the bottom 2 pixels */
 799              /* yuv to rgb */
 800              y = YMUL * ((*yp2++) - 16);
 801              r = LUT_CLIP((y + (crv * v)) >> 16);
 802              g = LUT_CLIP((y - (cgu * u) - (cgv * v) + OFF) >>16);
 803              b = LUT_CLIP((y + (cbu * u) + OFF) >> 16);
 804              *((DATA32 *) dp2) = 0xff000000 + RGB_JOIN(r,g,b);
 805
 806              dp2 += 4;
 807
 808              /* yuv to rgb */
 809              y = YMUL * ((*yp2++) - 16);
 810              r = LUT_CLIP((y + (crv * v)) >> 16);
 811              g = LUT_CLIP((y - (cgu * u) - (cgv * v) + OFF) >>16);
 812              b = LUT_CLIP((y + (cbu * u) + OFF) >> 16);
 813              *((DATA32 *) dp2) = 0xff000000 + RGB_JOIN(r,g,b);
 814
 815              dp2 += 4;
 816           }
 817         /* jump down one line since we are doing 2 at once */
 818         dp1 += (w * 4);
 819         dp2 += (w * 4);
 820      }
 821 #endif
 822 }
 823 #endif
 824
 825 static void
 826 _evas_yv12torgb_raster(unsigned char **yuv, unsigned char *rgb, int w, int h)
 827 {
 828 #ifdef BUILD_C
 829    int xx, yy;
 830    int y, u, v;
 831    unsigned char *yp1, *yp2, *up, *vp;
 832    unsigned char *dp1, *dp2;
 833
 834    /* destination pointers */
 835    dp1 = rgb;
 836    dp2 = rgb + (w * 4);
 837
 838    for (yy = 0; yy < h; yy += 2)
 839      {
 840         /* plane pointers */
 841         yp1 = yuv[yy];
 842         yp2 = yuv[yy + 1];
 843         up = yuv[h + (yy / 2)];
 844         vp = yuv[h + (h / 2) + (yy / 2)];
 845         for (xx = 0; xx < w; xx += 2)
 846           {
 847              int vmu;
 848
 849              /* collect u & v for 2x2 pixel block */
 850              u = *up++;
 851              v = *vp++;
 852
 853              /* save lookups */
 854              vmu = _v813[v] + _v391[u];
 855              u = _v2018[u];
 856              v = _v1596[v];
 857
 858              /* do the top 2 pixels of the 2x2 block which shared u & v */
 859              /* yuv to rgb */
 860              y = _v1164[*yp1++];
 861              *((DATA32 *) dp1) = 0xff000000 + RGB_JOIN(LUT_CLIP(y + v), LUT_CLIP(y - vmu), LUT_CLIP(y + u));
 862
 863              dp1 += 4;
 864
 865              /* yuv to rgb */
 866              y = _v1164[*yp1++];
 867              *((DATA32 *) dp1) = 0xff000000 + RGB_JOIN(LUT_CLIP(y + v), LUT_CLIP(y - vmu), LUT_CLIP(y + u));
 868
 869              dp1 += 4;
 870
 871              /* do the bottom 2 pixels */
 872              /* yuv to rgb */
 873              y = _v1164[*yp2++];
 874              *((DATA32 *) dp2) = 0xff000000 + RGB_JOIN(LUT_CLIP(y + v), LUT_CLIP(y - vmu), LUT_CLIP(y + u));
 875
 876              dp2 += 4;
 877
 878              /* yuv to rgb */
 879              y = _v1164[*yp2++];
 880              *((DATA32 *) dp2) = 0xff000000 + RGB_JOIN(LUT_CLIP(y + v), LUT_CLIP(y - vmu), LUT_CLIP(y + u));
 881
 882              dp2 += 4;
 883           }
 884         /* jump down one line since we are doing 2 at once */
 885         dp1 += (w * 4);
 886         dp2 += (w * 4);
 887      }
 888 #endif
 889 }
 890
 891 void
 892 evas_common_convert_yuv_422_601_rgba(DATA8 **src, DATA8 *dst, int w, int h)
 893 {
 894 #ifdef BUILD_C
 895    if (!initted) _evas_yuv_init();
 896    initted = 1;
 897    _evas_yuy2torgb_raster(src, dst, w, h);
 898 #endif
 899 }
 900
 901 static void
 902 _evas_yuy2torgb_raster(unsigned char **yuv, unsigned char *rgb, int w, int h)
 903 {
 904 #ifdef BUILD_C
 905    int xx, yy;
 906    int y, u, v;
 907    unsigned char *yp1, *yp2, *up, *vp;
 908    unsigned char *dp1;
 909
 910    dp1 = rgb;
 911
 912    /* destination pointers */
 913    for (yy = 0; yy < h; yy++)
 914      {
 915         /* plane pointers */
 916         unsigned char *line;
 917
 918         line = yuv[yy];
 919         yp1 = line + 0;
 920         up = line + 1;
 921         yp2 = line + 2;
 922         vp = line + 3;
 923
 924         for (xx = 0; xx < w; xx += 2)
 925           {
 926              int vmu;
 927
 928              /* collect u & v for 2 pixels block */
 929              u = *up;
 930              v = *vp;
 931
 932              /* save lookups */
 933              vmu = _v813[v] + _v391[u];
 934              u = _v2018[u];
 935              v = _v1596[v];
 936
 937              /* do the top 2 pixels of the 2x2 block which shared u & v */
 938              /* yuv to rgb */
 939              y = _v1164[*yp1];
 940              *((DATA32 *) dp1) = 0xff000000 + RGB_JOIN(LUT_CLIP(y + v), LUT_CLIP(y - vmu), LUT_CLIP(y + u));
 941
 942              dp1 += 4;
 943
 944              /* yuv to rgb */
 945              y = _v1164[*yp2];
 946              *((DATA32 *) dp1) = 0xff000000 + RGB_JOIN(LUT_CLIP(y + v), LUT_CLIP(y - vmu), LUT_CLIP(y + u));
 947
 948              dp1 += 4;
 949
 950              yp1 += 4; yp2 += 4; up += 4; vp += 4;
 951           }
 952      }
 953 #endif
 954 }
 955
 956 #endif
 957