src/lib/engines/common/evas_convert_yuv.c

   1 #include "evas_common.h"
   2 #include "evas_convert_yuv.h"
   3
   4 #if defined BUILD_MMX || defined BUILD_SSE
   5 # include "evas_mmx.h"
   6 #endif
   7
   8 #if defined HAVE_ALTIVEC_H
   9 # include <altivec.h>
  10 #ifdef CONFIG_DARWIN
  11 #define AVV(x...) (x)
  12 #else
  13 #define AVV(x...) {x}
  14 #endif
  15
  16 #endif
  17
  18 #ifdef BUILD_CONVERT_YUV
  19
  20 static void _evas_yuv_init         (void);
  21 static void _evas_yv12torgb_sse    (unsigned char **yuv, unsigned char *rgb, int w, int h);
  22 static void _evas_yv12torgb_mmx    (unsigned char **yuv, unsigned char *rgb, int w, int h);
  23 #ifdef BUILD_ALTIVEC
  24 static void _evas_yv12torgb_altivec(unsigned char **yuv, unsigned char *rgb, int w, int h);
  25 static void _evas_yv12torgb_diz    (unsigned char **yuv, unsigned char *rgb, int w, int h);
  26 #endif
  27 static void _evas_yv12torgb_raster (unsigned char **yuv, unsigned char *rgb, int w, int h);
  28 static void _evas_yuy2torgb_raster (unsigned char **yuv, unsigned char *rgb, int w, int h);
  29 static void _evas_nv12torgb_raster (unsigned char **yuv, unsigned char *rgb, int w, int h);
  30 static void _evas_nv12tiledtorgb_raster(unsigned char **yuv, unsigned char *rgb, int w, int h);
  31
  32 #define CRV    104595
  33 #define CBU    132251
  34 #define CGU    25624
  35 #define CGV    53280
  36 #define YMUL   76283
  37 #define OFF    32768
  38 #define BITRES 16
  39
  40 /* calculation float resolution in bits */
  41 /* ie RES = 6 is 10.6 fixed point */
  42 /*    RES = 8 is 8.8 fixed point */
  43 /*    RES = 4 is 12.4 fixed point */
  44 /* NB: going above 6 will lead to overflow... :( */
  45 #define RES    6
  46
  47 #define RZ(i)  (i >> (BITRES - RES))
  48 #define FOUR(i) {i, i, i, i}
  49
  50 #if defined BUILD_MMX || defined BUILD_SSE
  51 __attribute__ ((aligned (8))) const volatile unsigned short _const_crvcrv[4] = FOUR(RZ(CRV));
  52 __attribute__ ((aligned (8))) const volatile unsigned short _const_cbucbu[4] = FOUR(RZ(CBU));
  53 __attribute__ ((aligned (8))) const volatile unsigned short _const_cgucgu[4] = FOUR(RZ(CGU));
  54 __attribute__ ((aligned (8))) const volatile unsigned short _const_cgvcgv[4] = FOUR(RZ(CGV));
  55 __attribute__ ((aligned (8))) const volatile unsigned short _const_ymul  [4] = FOUR(RZ(YMUL));
  56 __attribute__ ((aligned (8))) const volatile unsigned short _const_128   [4] = FOUR(128);
  57 __attribute__ ((aligned (8))) const volatile unsigned short _const_32    [4] = FOUR(RZ(OFF));
  58 __attribute__ ((aligned (8))) const volatile unsigned short _const_16    [4] = FOUR(16);
  59 __attribute__ ((aligned (8))) const volatile unsigned short _const_ff    [4] = FOUR(-1);
  60
  61 #define CONST_CRVCRV *_const_crvcrv
  62 #define CONST_CBUCBU *_const_cbucbu
  63 #define CONST_CGUCGU *_const_cgucgu
  64 #define CONST_CGVCGV *_const_cgvcgv
  65 #define CONST_YMUL   *_const_ymul
  66 #define CONST_128    *_const_128
  67 #define CONST_32     *_const_32
  68 #define CONST_16     *_const_16
  69 #define CONST_FF     *_const_ff
  70
  71 /* for C non aligned cleanup */
  72 const int _crv = RZ(CRV);   /* 1.596 */
  73 const int _cbu = RZ(CBU);   /* 2.018 */
  74 const int _cgu = RZ(CGU);   /* 0.391 */
  75 const int _cgv = RZ(CGV);   /* 0.813 */
  76
  77 #endif
  78
  79 #ifdef BUILD_ALTIVEC
  80 #ifdef __VEC__
  81 const vector unsigned short res     = AVV(RES);
  82 const vector signed short crv       = AVV(RZ(CRV));
  83 const vector signed short cbu       = AVV(RZ(CBU));
  84 const vector signed short cgu       = AVV(RZ(CGU));
  85 const vector signed short cgv       = AVV(RZ(CGV));
  86 const vector signed short ymul      = AVV(RZ(YMUL));
  87 const vector signed short c128      = AVV(128);
  88 const vector signed short c32       = AVV(RZ(OFF));
  89 const vector signed short c16       = AVV(16);
  90 const vector unsigned char zero     = AVV(0);
  91 const vector signed short maxchar   = AVV(255);
  92 const vector unsigned char pickrg1  = AVV(0, 0x1, 0x11, 0,
  93                                           0, 0x3, 0x13, 0,
  94                                           0, 0x5, 0x15, 0,
  95                                           0, 0x7, 0x17, 0);
  96 const vector unsigned char pickrg2  = AVV(0, 0x9, 0x19, 0,
  97                                           0, 0xb, 0x1b, 0,
  98                                           0, 0xd, 0x1d, 0,
  99                                           0, 0xf, 0x1f, 0);
 100 const vector unsigned char pickrgb1 = AVV(0x3, 0x1, 0x2, 0x11,
 101                                           0x7, 0x5, 0x6, 0x13,
 102                                           0xb, 0x9, 0xa, 0x15,
 103                                           0xf, 0xd, 0xe, 0x17);
 104 const vector unsigned char pickrgb2 = AVV(0x3, 0x1, 0x2, 0x19,
 105                                           0x7, 0x5, 0x6, 0x1b,
 106                                           0xb, 0x9, 0xa, 0x1d,
 107                                           0xf, 0xd, 0xe, 0x1f);
 108 #endif
 109 #endif
 110
 111 #ifdef BUILD_C
 112
 113 /* shortcut speedup lookup-tables */
 114 static short _v1164[256];
 115 static short _v1596[256];
 116 static short _v813[256];
 117 static short _v391[256];
 118 static short _v2018[256];
 119
 120 static unsigned char _clip_lut[1024];
 121 #define LUT_CLIP(i) ((_clip_lut+384)[(i)])
 122
 123 #define CMP_CLIP(i) ((i&256)? (~(i>>10)) : i);
 124
 125 static int initted = 0;
 126
 127 #endif
 128
 129 void
 130 evas_common_convert_yuv_420p_601_rgba(DATA8 **src, DATA8 *dst, int w, int h)
 131 {
 132    int mmx, sse, sse2;
 133
 134 #if defined BUILD_MMX || defined BUILD_SSE
 135    evas_common_cpu_can_do(&mmx, &sse, &sse2);
 136 #endif
 137 #ifndef BUILD_SSE
 138    sse = 0;
 139    sse2 = 0;
 140 #endif
 141 #ifndef BUILD_MMX
 142    mmx = 0;
 143 #endif
 144    if (evas_common_cpu_has_feature(CPU_FEATURE_MMX2))
 145      _evas_yv12torgb_sse(src, dst, w, h);
 146    else if (evas_common_cpu_has_feature(CPU_FEATURE_MMX))
 147      _evas_yv12torgb_mmx(src, dst, w, h);
 148 #ifdef BUILD_ALTIVEC
 149    if (evas_common_cpu_has_feature(CPU_FEATURE_ALTIVEC))
 150      _evas_yv12torgb_altivec(src, dst, w, h);
 151 #endif
 152    else
 153      {
 154 #ifdef BUILD_C
 155         if (!initted) _evas_yuv_init();
 156         initted = 1;
 157         /* FIXME: diz may be faster sometimes */
 158         _evas_yv12torgb_raster(src, dst, w, h);
 159 #endif
 160      }
 161 }
 162
 163 /* Thanks to Diz for this code. i've munged it a little and turned it into */
 164 /* inline macros. I tried beating it with a different algorithm using MMX */
 165 /* but failed. So here we are. This is the fastest YUV->RGB i know of for */
 166 /* x86. It has an issue that it doesn't convert colours accurately so the */
 167 /* image looks a little "yellowy". This is a result of only 10.6 fixed point */
 168 /* resolution as opposed to 16.16 in the C code. This could be fixed by */
 169 /* processing half the number of pixels per cycle and going up to 32bits */
 170 /* per element during compute, but it would all but negate the speedup */
 171 /* from mmx I think :( It might be possible to use SSE and SSE2 here, but */
 172 /* I haven't tried yet. Let's see. */
 173
 174 /* NB: XviD has almost the same code in it's assembly YV12->RGB code. same */
 175 /* algorithm, same constants, same all over actually, except it actually */
 176 /* does a few extra memory accesses that this one doesn't, so in theory */
 177 /* this code should be faster. In the end it's all just an mmx version of */
 178 /* the reference implimentation done with fixed point math */
 179
 180 static void
 181 _evas_yv12torgb_sse(unsigned char **yuv, unsigned char *rgb, int w, int h)
 182 {
 183 #ifdef BUILD_SSE
 184    int xx, yy;
 185    register unsigned char *yp1, *up, *vp;
 186    unsigned char *dp1;
 187
 188    /* destination pointers */
 189    dp1 = rgb;
 190
 191    for (yy = 0; yy < h; yy++)
 192      {
 193         /* plane pointers */
 194         yp1 = yuv[yy];
 195         up = yuv[h + (yy / 2)];
 196         vp = yuv[h + (h / 2) + (yy / 2)];
 197         for (xx = 0; xx < (w - 7); xx += 8)
 198           {
 199              movd_m2r(*up, mm3);
 200              movd_m2r(*vp, mm2);
 201              movq_m2r(*yp1, mm0);
 202
 203              pxor_r2r(mm7, mm7);
 204              punpcklbw_r2r(mm7, mm2);
 205              punpcklbw_r2r(mm7, mm3);
 206
 207              movq_r2r(mm0, mm1);
 208              psrlw_i2r(8, mm0);
 209              psllw_i2r(8, mm1);
 210              psrlw_i2r(8, mm1);
 211
 212              movq_m2r(CONST_16, mm4);
 213              psubsw_r2r(mm4, mm0);
 214              psubsw_r2r(mm4, mm1);
 215
 216              movq_m2r(CONST_128, mm5);
 217              psubsw_r2r(mm5, mm2);
 218              psubsw_r2r(mm5, mm3);
 219
 220              movq_m2r(CONST_YMUL, mm4);
 221              pmullw_r2r(mm4, mm0);
 222              pmullw_r2r(mm4, mm1);
 223
 224              movq_m2r(CONST_CRVCRV, mm7);
 225              pmullw_r2r(mm3, mm7);
 226              movq_m2r(CONST_CBUCBU, mm6);
 227              pmullw_r2r(mm2, mm6);
 228              movq_m2r(CONST_CGUCGU, mm5);
 229              pmullw_r2r(mm2, mm5);
 230              movq_m2r(CONST_CGVCGV, mm4);
 231              pmullw_r2r(mm3, mm4);
 232
 233              movq_r2r(mm0, mm2);
 234              paddsw_r2r(mm7, mm2);
 235              paddsw_r2r(mm1, mm7);
 236
 237              psraw_i2r(RES, mm2);
 238              psraw_i2r(RES, mm7);
 239              packuswb_r2r(mm7, mm2);
 240
 241              pxor_r2r(mm7, mm7);
 242              movq_r2r(mm2, mm3);
 243              punpckhbw_r2r(mm7, mm2);
 244              punpcklbw_r2r(mm3, mm7);
 245              por_r2r(mm7, mm2);
 246
 247              movq_r2r(mm0, mm3);
 248              psubsw_r2r(mm5, mm3);
 249              psubsw_r2r(mm4, mm3);
 250              paddsw_m2r(CONST_32, mm3);
 251
 252              movq_r2r(mm1, mm7);
 253              psubsw_r2r(mm5, mm7);
 254              psubsw_r2r(mm4, mm7);
 255              paddsw_m2r(CONST_32, mm7);
 256
 257              psraw_i2r(RES, mm3);
 258              psraw_i2r(RES, mm7);
 259              packuswb_r2r(mm7, mm3);
 260
 261              pxor_r2r(mm7, mm7);
 262              movq_r2r(mm3, mm4);
 263              punpckhbw_r2r(mm7, mm3);
 264              punpcklbw_r2r(mm4, mm7);
 265              por_r2r(mm7, mm3);
 266
 267              movq_m2r(CONST_32, mm4);
 268              paddsw_r2r(mm6, mm0);
 269              paddsw_r2r(mm6, mm1);
 270              paddsw_r2r(mm4, mm0);
 271              paddsw_r2r(mm4, mm1);
 272              psraw_i2r(RES, mm0);
 273              psraw_i2r(RES, mm1);
 274              packuswb_r2r(mm1, mm0);
 275
 276              pxor_r2r(mm7, mm7);
 277              movq_r2r(mm0, mm5);
 278              punpckhbw_r2r(mm7, mm0);
 279              punpcklbw_r2r(mm5, mm7);
 280              por_r2r(mm7, mm0);
 281
 282              movq_m2r(CONST_FF, mm1);
 283              movq_r2r(mm0, mm5);
 284              movq_r2r(mm3, mm6);
 285              movq_r2r(mm2, mm7);
 286              punpckhbw_r2r(mm3, mm2);
 287              punpcklbw_r2r(mm6, mm7);
 288              punpckhbw_r2r(mm1, mm0);
 289              punpcklbw_r2r(mm1, mm5);
 290
 291              movq_r2r(mm7, mm1);
 292              punpckhwd_r2r(mm5, mm7);
 293              punpcklwd_r2r(mm5, mm1);
 294
 295              movq_r2r(mm2, mm4);
 296              punpckhwd_r2r(mm0, mm2);
 297              punpcklwd_r2r(mm0, mm4);
 298
 299              movntq_r2m(mm1, *(dp1));
 300              movntq_r2m(mm7, *(dp1 + 8));
 301              movntq_r2m(mm4, *(dp1 + 16));
 302              movntq_r2m(mm2, *(dp1 + 24));
 303
 304              yp1 += 8;
 305              up += 4;
 306              vp += 4;
 307              dp1 += 8 * 4;
 308           }
 309         /* cleanup pixles that arent a multiple of 8 pixels wide */
 310         if (xx < w)
 311           {
 312              int y, u, v, r, g, b;
 313
 314              for (; xx < w; xx += 2)
 315                {
 316                   u = (*up++) - 128;
 317                   v = (*vp++) - 128;
 318
 319                   y = RZ(YMUL) * ((*yp1++) - 16);
 320                   r = LUT_CLIP((y + (_crv * v)) >> RES);
 321                   g = LUT_CLIP((y - (_cgu * u) - (_cgv * v) + RZ(OFF)) >> RES);
 322                   b = LUT_CLIP((y + (_cbu * u) + RZ(OFF)) >> RES);
 323                   *((DATA32 *) dp1) = 0xff000000 + RGB_JOIN(r,g,b);
 324
 325                   dp1 += 4;
 326
 327                   y = RZ(YMUL) * ((*yp1++) - 16);
 328                   r = LUT_CLIP((y + (_crv * v)) >> RES);
 329                   g = LUT_CLIP((y - (_cgu * u) - (_cgv * v) + RZ(OFF)) >> RES);
 330                   b = LUT_CLIP((y + (_cbu * u) + RZ(OFF)) >> RES);
 331                   *((DATA32 *) dp1) = 0xff000000 + RGB_JOIN(r,g,b);
 332
 333                   dp1 += 4;
 334                }
 335           }
 336      }
 337    emms();
 338 #else
 339    _evas_yv12torgb_mmx(yuv, rgb, w, h);
 340 #endif
 341 }
 342
 343 static void
 344 _evas_yv12torgb_mmx(unsigned char **yuv, unsigned char *rgb, int w, int h)
 345 {
 346 #ifdef BUILD_MMX
 347    int xx, yy;
 348    register unsigned char *yp1, *up, *vp;
 349    unsigned char *dp1;
 350
 351    /* destination pointers */
 352    dp1 = rgb;
 353
 354    for (yy = 0; yy < h; yy++)
 355      {
 356         /* plane pointers */
 357         yp1 = yuv[yy];
 358         up = yuv[h + (yy / 2)];
 359         vp = yuv[h + (h / 2) + (yy / 2)];
 360         for (xx = 0; xx < (w - 7); xx += 8)
 361           {
 362              movd_m2r(*up, mm3);
 363              movd_m2r(*vp, mm2);
 364              movq_m2r(*yp1, mm0);
 365
 366              pxor_r2r(mm7, mm7);
 367              punpcklbw_r2r(mm7, mm2);
 368              punpcklbw_r2r(mm7, mm3);
 369
 370              movq_r2r(mm0, mm1);
 371              psrlw_i2r(8, mm0);
 372              psllw_i2r(8, mm1);
 373              psrlw_i2r(8, mm1);
 374
 375              movq_m2r(CONST_16, mm4);
 376              psubsw_r2r(mm4, mm0);
 377              psubsw_r2r(mm4, mm1);
 378
 379              movq_m2r(CONST_128, mm5);
 380              psubsw_r2r(mm5, mm2);
 381              psubsw_r2r(mm5, mm3);
 382
 383              movq_m2r(CONST_YMUL, mm4);
 384              pmullw_r2r(mm4, mm0);
 385              pmullw_r2r(mm4, mm1);
 386
 387              movq_m2r(CONST_CRVCRV, mm7);
 388              pmullw_r2r(mm3, mm7);
 389              movq_m2r(CONST_CBUCBU, mm6);
 390              pmullw_r2r(mm2, mm6);
 391              movq_m2r(CONST_CGUCGU, mm5);
 392              pmullw_r2r(mm2, mm5);
 393              movq_m2r(CONST_CGVCGV, mm4);
 394              pmullw_r2r(mm3, mm4);
 395
 396              movq_r2r(mm0, mm2);
 397              paddsw_r2r(mm7, mm2);
 398              paddsw_r2r(mm1, mm7);
 399
 400              psraw_i2r(RES, mm2);
 401              psraw_i2r(RES, mm7);
 402              packuswb_r2r(mm7, mm2);
 403
 404              pxor_r2r(mm7, mm7);
 405              movq_r2r(mm2, mm3);
 406              punpckhbw_r2r(mm7, mm2);
 407              punpcklbw_r2r(mm3, mm7);
 408              por_r2r(mm7, mm2);
 409
 410              movq_r2r(mm0, mm3);
 411              psubsw_r2r(mm5, mm3);
 412              psubsw_r2r(mm4, mm3);
 413              paddsw_m2r(CONST_32, mm3);
 414
 415              movq_r2r(mm1, mm7);
 416              psubsw_r2r(mm5, mm7);
 417              psubsw_r2r(mm4, mm7);
 418              paddsw_m2r(CONST_32, mm7);
 419
 420              psraw_i2r(RES, mm3);
 421              psraw_i2r(RES, mm7);
 422              packuswb_r2r(mm7, mm3);
 423
 424              pxor_r2r(mm7, mm7);
 425              movq_r2r(mm3, mm4);
 426              punpckhbw_r2r(mm7, mm3);
 427              punpcklbw_r2r(mm4, mm7);
 428              por_r2r(mm7, mm3);
 429
 430              movq_m2r(CONST_32, mm4);
 431              paddsw_r2r(mm6, mm0);
 432              paddsw_r2r(mm6, mm1);
 433              paddsw_r2r(mm4, mm0);
 434              paddsw_r2r(mm4, mm1);
 435              psraw_i2r(RES, mm0);
 436              psraw_i2r(RES, mm1);
 437              packuswb_r2r(mm1, mm0);
 438
 439              pxor_r2r(mm7, mm7);
 440              movq_r2r(mm0, mm5);
 441              punpckhbw_r2r(mm7, mm0);
 442              punpcklbw_r2r(mm5, mm7);
 443              por_r2r(mm7, mm0);
 444
 445              movq_m2r(CONST_FF, mm1);
 446              movq_r2r(mm0, mm5);
 447              movq_r2r(mm3, mm6);
 448              movq_r2r(mm2, mm7);
 449              punpckhbw_r2r(mm3, mm2);
 450              punpcklbw_r2r(mm6, mm7);
 451              punpckhbw_r2r(mm1, mm0);
 452              punpcklbw_r2r(mm1, mm5);
 453
 454              movq_r2r(mm7, mm1);
 455              punpckhwd_r2r(mm5, mm7);
 456              punpcklwd_r2r(mm5, mm1);
 457
 458              movq_r2r(mm2, mm4);
 459              punpckhwd_r2r(mm0, mm2);
 460              punpcklwd_r2r(mm0, mm4);
 461
 462              movq_r2m(mm1, *(dp1));
 463              movq_r2m(mm7, *(dp1 + 8));
 464              movq_r2m(mm4, *(dp1 + 16));
 465              movq_r2m(mm2, *(dp1 + 24));
 466
 467              yp1 += 8;
 468              up += 4;
 469              vp += 4;
 470              dp1 += 8 * 4;
 471           }
 472         /* cleanup pixles that arent a multiple of 8 pixels wide */
 473         if (xx < w)
 474           {
 475              int y, u, v, r, g, b;
 476
 477              for (; xx < w; xx += 2)
 478                {
 479                   u = (*up++) - 128;
 480                   v = (*vp++) - 128;
 481
 482                   y = RZ(YMUL) * ((*yp1++) - 16);
 483                   r = LUT_CLIP((y + (_crv * v)) >> RES);
 484                   g = LUT_CLIP((y - (_cgu * u) - (_cgv * v) + RZ(OFF)) >> RES);
 485                   b = LUT_CLIP((y + (_cbu * u) + RZ(OFF)) >> RES);
 486                   *((DATA32 *) dp1) = 0xff000000 + RGB_JOIN(r,g,b);
 487
 488                   dp1 += 4;
 489
 490                   y = RZ(YMUL) * ((*yp1++) - 16);
 491                   r = LUT_CLIP((y + (_crv * v)) >> RES);
 492                   g = LUT_CLIP((y - (_cgu * u) - (_cgv * v) + RZ(OFF)) >> RES);
 493                   b = LUT_CLIP((y + (_cbu * u) + RZ(OFF)) >> RES);
 494                   *((DATA32 *) dp1) = 0xff000000 + RGB_JOIN(r,g,b);
 495
 496                   dp1 += 4;
 497                }
 498           }
 499      }
 500    emms();
 501 #else
 502    _evas_yv12torgb_raster(yuv, rgb, w, h);
 503 #endif
 504 }
 505
 506 #ifdef BUILD_ALTIVEC
 507 static void
 508 _evas_yv12torgb_altivec(unsigned char **yuv, unsigned char *rgb, int w, int h)
 509 {
 510 #ifdef __VEC__
 511    int xx, yy;
 512    int w2, h2;
 513    unsigned char *yp1, *yp2, *up, *vp;
 514    unsigned char *dp1, *dp2;
 515    vector signed short y, u, v;
 516    vector signed short r, g, b;
 517    vector signed short tmp1, tmp2, tmp3;
 518    vector unsigned char yperm, uperm, vperm, rgb1, rgb2;
 519    vector unsigned char alpha;
 520
 521    /* handy halved w & h */
 522    w2 = w / 2;
 523    h2 = h / 2;
 524    /* plane pointers */
 525    yp1 = yuv;
 526    yp2 = yuv + w;
 527    up = yuv + (w * h);
 528    vp = up + (w2 * h2);
 529    /* destination pointers */
 530    dp1 = rgb;
 531    dp2 = rgb + (w * 4);
 532
 533    alpha = vec_mergeh((vector unsigned char)AVV(255), zero);
 534    alpha = (vector unsigned char)vec_mergeh((vector unsigned short)alpha,
 535                                             (vector unsigned short)zero);
 536
 537    for (yy = 0; yy < h2; yy++)
 538      {
 539         for (xx = 0; xx < w2; xx += 4)
 540           {
 541 /* Cycles */
 542              /*
 543               * Load 4 y and 4 u & v pixels for the 8x2 pixel block.
 544               */
 545 /* 3 */      tmp3 = (vector signed short)vec_lde(0, (unsigned int *)yp1);
 546 /* 3 */      tmp1 = (vector signed short)vec_lde(0, (unsigned int *)up);
 547 /* 3 */      tmp2 = (vector signed short)vec_lde(0, (unsigned int *)vp);
 548
 549              /* Prepare for aligning the data in their vectors */
 550 /* 3 */      yperm = vec_lvsl(0, yp1);
 551 /* 3 */      uperm = vec_lvsl(0, up);
 552 /* 3 */      vperm = vec_lvsl(0, vp);
 553              yp1 += 4;
 554
 555              /* Save y and load the next 4 y pixels for a total of 8 */
 556 /* 2 */      y = vec_perm(tmp3, tmp3, yperm);
 557 /* 3 */      tmp3 = (vector signed short)vec_lde(0, (unsigned int *)yp1);
 558
 559              /* Setup and calculate the 4 u pixels */
 560 /* 2 */      tmp1 = vec_perm(tmp1, tmp1, uperm);
 561 /* 2 */      tmp2 = vec_perm(tmp2, tmp2, vperm);
 562
 563              /* Avoid dependency stalls on yperm and calculate the 4 u values */
 564 /* 3 */      yperm = vec_lvsr(12, yp1);
 565 /* 1 */      tmp1 = (vector signed short)vec_mergeh((vector unsigned char)tmp1,
 566                                                     (vector unsigned char)tmp1);
 567 /* 1 */      u = (vector signed short)vec_mergeh(zero,
 568                                                  (vector unsigned char)tmp1);
 569
 570 /* 1 */      u = vec_sub(u, c128);
 571 /* 2 */      tmp3 = vec_perm(tmp3, tmp3, yperm);
 572
 573              /* Setup and calculate the 4 v values */
 574 /* 1 */      tmp2 = (vector signed short)vec_mergeh((vector unsigned char)tmp2,
 575                                                     (vector unsigned char)tmp2);
 576 /* 1 */      v = (vector signed short)vec_mergeh(zero,
 577                                                  (vector unsigned char)tmp2);
 578 /* 4 */      tmp2 = vec_mladd(cgu, u, (vector signed short)zero);
 579 /* 1 */      v = vec_sub(v, c128);
 580
 581              /* Move the data into y and start loading the next 4 pixels */
 582 /* 1 */      y = (vector signed short)vec_mergeh(zero,
 583                                                  (vector unsigned char)y);
 584 /* 1 */      tmp3 = (vector signed short)vec_mergeh(zero,
 585                                                     (vector unsigned char)tmp3);
 586 /* 1 */      y = vec_or(y, tmp3);
 587
 588              /* Finish calculating y */
 589 /* 1 */      y = vec_sub(y, c16);
 590 /* 4 */      y = vec_mladd(ymul, y, (vector signed short)zero);
 591
 592              /* Perform non-dependent multiplies first. */
 593 /* 4 */      tmp1 = vec_mladd(crv, v, y);
 594 /* 4 */      tmp2 = vec_mladd(cgv, v, tmp2);
 595 /* 4 */      tmp3 = vec_mladd(cbu, u, y);
 596
 597              /* Calculate rgb values */
 598 /* 1 */      r = vec_sra(tmp1, res);
 599
 600 /* 1 */      tmp2 = vec_sub(y, tmp2);
 601 /* 1 */      tmp2 = vec_add(tmp2, c32);
 602 /* 1 */      g = vec_sra(tmp2, res);
 603
 604 /* 1 */      tmp3 = vec_add(tmp3, c32);
 605 /* 1 */      b = vec_sra(tmp3, res);
 606
 607              /* Bound to 0 <= x <= 255 */
 608 /* 1 */      r = vec_min(r, maxchar);
 609 /* 1 */      g = vec_min(g, maxchar);
 610 /* 1 */      b = vec_min(b, maxchar);
 611 /* 1 */      r = vec_max(r, (vector signed short)zero);
 612 /* 1 */      g = vec_max(g, (vector signed short)zero);
 613 /* 1 */      b = vec_max(b, (vector signed short)zero);
 614
 615              /* Combine r, g and b. */
 616 /* 2 */      rgb1 = vec_perm((vector unsigned char)r, (vector unsigned char)g,
 617                              pickrg1);
 618 /* 2 */      rgb2 = vec_perm((vector unsigned char)r, (vector unsigned char)g,
 619                             pickrg2);
 620
 621 /* 2 */      rgb1 = vec_perm(rgb1, (vector unsigned char)b, pickrgb1);
 622 /* 2 */      rgb2 = vec_perm(rgb2, (vector unsigned char)b, pickrgb2);
 623
 624 /* 1 */      rgb1 = vec_or(alpha, rgb1);
 625 /* 1 */      rgb2 = vec_or(alpha, rgb2);
 626
 627 /* 3 */      vec_stl(rgb1, 0, dp1);
 628              dp1 += 16;
 629 /* 3 */      vec_stl(rgb2, 0, dp1);
 630
 631              /*
 632               * Begin the second row calculations
 633               */
 634
 635              /*
 636               * Load 4 y pixels for the 8x2 pixel block.
 637               */
 638 /* 3 */      yperm = vec_lvsl(0, yp2);
 639 /* 3 */      tmp3 = (vector signed short)vec_lde(0, (unsigned int *)yp2);
 640              yp2 += 4;
 641
 642              /* Save y and load the next 4 y pixels for a total of 8 */
 643 /* 2 */      y = vec_perm(tmp3, tmp3, yperm);
 644 /* 3 */      yperm = vec_lvsr(12, yp2);
 645 /* 3 */      tmp3 = (vector signed short)vec_lde(0, (unsigned int *)yp2);
 646 /* 1 */      y = (vector signed short)vec_mergeh(zero,
 647                                                  (vector unsigned char)y);
 648
 649              /* Avoid dependency stalls on yperm */
 650 /* 2 */      tmp3 = vec_perm(tmp3, tmp3, yperm);
 651 /* 1 */      tmp3 = (vector signed short)vec_mergeh(zero,
 652                                                     (vector unsigned char)tmp3);
 653 /* 1 */      y = vec_or(y, tmp3);
 654
 655              /* Start the calculation for g */
 656 /* 4 */      tmp2 = vec_mladd(cgu, u, (vector signed short)zero);
 657
 658              /* Finish calculating y */
 659 /* 1 */      y = vec_sub(y, c16);
 660 /* 4 */      y = vec_mladd(ymul, y, (vector signed short)zero);
 661
 662              /* Perform non-dependent multiplies first. */
 663 /* 4 */      tmp2 = vec_mladd(cgv, v, tmp2);
 664 /* 4 */      tmp1 = vec_mladd(crv, v, y);
 665 /* 4 */      tmp3 = vec_mladd(cbu, u, y);
 666
 667              /* Calculate rgb values */
 668 /* 1 */      r = vec_sra(tmp1, res);
 669
 670 /* 1 */      tmp2 = vec_sub(y, tmp2);
 671 /* 1 */      tmp2 = vec_add(tmp2, c32);
 672 /* 1 */      g = vec_sra(tmp2, res);
 673
 674 /* 1 */      tmp3 = vec_add(tmp3, c32);
 675 /* 1 */      b = vec_sra(tmp3, res);
 676
 677              /* Bound to 0 <= x <= 255 */
 678 /* 1 */      r = vec_min(r, maxchar);
 679 /* 1 */      g = vec_min(g, maxchar);
 680 /* 1 */      b = vec_min(b, maxchar);
 681 /* 1 */      r = vec_max(r, (vector signed short)zero);
 682 /* 1 */      g = vec_max(g, (vector signed short)zero);
 683 /* 1 */      b = vec_max(b, (vector signed short)zero);
 684
 685              /* Combine r, g and b. */
 686 /* 2 */      rgb1 = vec_perm((vector unsigned char)r, (vector unsigned char)g,
 687                             pickrg1);
 688 /* 2 */      rgb2 = vec_perm((vector unsigned char)r, (vector unsigned char)g,
 689                             pickrg2);
 690
 691 /* 2 */      rgb1 = vec_perm(rgb1, (vector unsigned char)b, pickrgb1);
 692 /* 2 */      rgb2 = vec_perm(rgb2, (vector unsigned char)b, pickrgb2);
 693
 694 /* 1 */      rgb1 = vec_or(alpha, rgb1);
 695 /* 1 */      rgb2 = vec_or(alpha, rgb2);
 696
 697 /* 3 */      vec_stl(rgb1, 0, dp2);
 698              dp2 += 16;
 699 /* 3 */      vec_stl(rgb2, 0, dp2);
 700
 701              /* Increment the YUV data pointers to the next set of pixels. */
 702              yp1 += 4;
 703              yp2 += 4;
 704              up += 4;
 705              vp += 4;
 706
 707              /* Move the destination pointers to the next set of pixels. */
 708              dp1 += 16;
 709              dp2 += 16;
 710           }
 711
 712         /* jump down one line since we are doing 2 at once */
 713         yp1 += w;
 714         yp2 += w;
 715         dp1 += (w * 4);
 716         dp2 += (w * 4);
 717      }
 718 #else
 719    _evas_yv12torgb_diz(yuv, rgb, w, h);
 720 #endif
 721 }
 722 #endif
 723
 724 static void
 725 _evas_yuv_init(void)
 726 {
 727 #ifdef BUILD_C
 728    int i;
 729
 730    for (i = 0; i < 256; i++)
 731      {
 732         _v1164[i] = (int)(((float)(i - 16 )) * 1.164);
 733
 734         _v1596[i] = (int)(((float)(i - 128)) * 1.596);
 735         _v813[i]  = (int)(((float)(i - 128)) * 0.813);
 736
 737         _v391[i]  = (int)(((float)(i - 128)) * 0.391);
 738         _v2018[i] = (int)(((float)(i - 128)) * 2.018);
 739      }
 740
 741    for (i = -384; i < 640; i++)
 742      {
 743         _clip_lut[i+384] = i < 0 ? 0 : (i > 255) ? 255 : i;
 744      }
 745 #endif
 746 }
 747
 748 #ifdef BUILD_ALTIVEC
 749 static void
 750 _evas_yv12torgb_diz(unsigned char **yuv, unsigned char *rgb, int w, int h)
 751 {
 752 #ifdef BUILD_C
 753    int xx, yy;
 754    int y, u, v, r, g, b;
 755    unsigned char *yp1, *yp2, *up, *vp;
 756    unsigned char *dp1, *dp2;
 757    int crv, cbu, cgu, cgv;
 758
 759    /* destination pointers */
 760    dp1 = rgb;
 761    dp2 = rgb + (w * 4);
 762
 763    crv = CRV;   /* 1.596 */
 764    cbu = CBU;   /* 2.018 */
 765    cgu = CGU;   /* 0.391 */
 766    cgv = CGV;   /* 0.813 */
 767
 768    for (yy = 0; yy < h; yy += 2)
 769      {
 770         /* plane pointers */
 771         yp1 = yuv[yy];
 772         yp2 = yuv[yy + 1];
 773         up = yuv[h + (yy / 2)];
 774         vp = yuv[h + (h / 2) + (yy / 2)];
 775         for (xx = 0; xx < w; xx += 2)
 776           {
 777              /* collect u & v for 2x2 pixel block */
 778              u = (*up++) - 128;
 779              v = (*vp++) - 128;
 780
 781              /* do the top 2 pixels of the 2x2 block which shared u & v */
 782              /* yuv to rgb */
 783              y = YMUL * ((*yp1++) - 16);
 784              r = LUT_CLIP((y + (crv * v)) >> 16);
 785              g = LUT_CLIP((y - (cgu * u) - (cgv * v) + OFF) >>16);
 786              b = LUT_CLIP((y + (cbu * u) + OFF) >> 16);
 787              *((DATA32 *) dp1) = 0xff000000 + RGB_JOIN(r,g,b);
 788
 789              dp1 += 4;
 790
 791              /* yuv to rgb */
 792              y = YMUL * ((*yp1++) - 16);
 793              r = LUT_CLIP((y + (crv * v)) >> 16);
 794              g = LUT_CLIP((y - (cgu * u) - (cgv * v) + OFF) >>16);
 795              b = LUT_CLIP((y + (cbu * u) + OFF) >> 16);
 796              *((DATA32 *) dp1) = 0xff000000 + RGB_JOIN(r,g,b);
 797
 798              dp1 += 4;
 799
 800              /* do the bottom 2 pixels */
 801              /* yuv to rgb */
 802              y = YMUL * ((*yp2++) - 16);
 803              r = LUT_CLIP((y + (crv * v)) >> 16);
 804              g = LUT_CLIP((y - (cgu * u) - (cgv * v) + OFF) >>16);
 805              b = LUT_CLIP((y + (cbu * u) + OFF) >> 16);
 806              *((DATA32 *) dp2) = 0xff000000 + RGB_JOIN(r,g,b);
 807
 808              dp2 += 4;
 809
 810              /* yuv to rgb */
 811              y = YMUL * ((*yp2++) - 16);
 812              r = LUT_CLIP((y + (crv * v)) >> 16);
 813              g = LUT_CLIP((y - (cgu * u) - (cgv * v) + OFF) >>16);
 814              b = LUT_CLIP((y + (cbu * u) + OFF) >> 16);
 815              *((DATA32 *) dp2) = 0xff000000 + RGB_JOIN(r,g,b);
 816
 817              dp2 += 4;
 818           }
 819         /* jump down one line since we are doing 2 at once */
 820         dp1 += (w * 4);
 821         dp2 += (w * 4);
 822      }
 823 #endif
 824 }
 825 #endif
 826
 827 static void
 828 _evas_yv12torgb_raster(unsigned char **yuv, unsigned char *rgb, int w, int h)
 829 {
 830 #ifdef BUILD_C
 831    int xx, yy;
 832    int y, u, v;
 833    unsigned char *yp1, *yp2, *up, *vp;
 834    unsigned char *dp1, *dp2;
 835
 836    /* destination pointers */
 837    dp1 = rgb;
 838    dp2 = rgb + (w * 4);
 839
 840    for (yy = 0; yy < h; yy += 2)
 841      {
 842         /* plane pointers */
 843         yp1 = yuv[yy];
 844         yp2 = yuv[yy + 1];
 845         up = yuv[h + (yy / 2)];
 846         vp = yuv[h + (h / 2) + (yy / 2)];
 847         for (xx = 0; xx < w; xx += 2)
 848           {
 849              int vmu;
 850
 851              /* collect u & v for 2x2 pixel block */
 852              u = *up++;
 853              v = *vp++;
 854
 855              /* save lookups */
 856              vmu = _v813[v] + _v391[u];
 857              u = _v2018[u];
 858              v = _v1596[v];
 859
 860              /* do the top 2 pixels of the 2x2 block which shared u & v */
 861              /* yuv to rgb */
 862              y = _v1164[*yp1++];
 863              *((DATA32 *) dp1) = 0xff000000 + RGB_JOIN(LUT_CLIP(y + v), LUT_CLIP(y - vmu), LUT_CLIP(y + u));
 864
 865              dp1 += 4;
 866
 867              /* yuv to rgb */
 868              y = _v1164[*yp1++];
 869              *((DATA32 *) dp1) = 0xff000000 + RGB_JOIN(LUT_CLIP(y + v), LUT_CLIP(y - vmu), LUT_CLIP(y + u));
 870
 871              dp1 += 4;
 872
 873              /* do the bottom 2 pixels */
 874              /* yuv to rgb */
 875              y = _v1164[*yp2++];
 876              *((DATA32 *) dp2) = 0xff000000 + RGB_JOIN(LUT_CLIP(y + v), LUT_CLIP(y - vmu), LUT_CLIP(y + u));
 877
 878              dp2 += 4;
 879
 880              /* yuv to rgb */
 881              y = _v1164[*yp2++];
 882              *((DATA32 *) dp2) = 0xff000000 + RGB_JOIN(LUT_CLIP(y + v), LUT_CLIP(y - vmu), LUT_CLIP(y + u));
 883
 884              dp2 += 4;
 885           }
 886         /* jump down one line since we are doing 2 at once */
 887         dp1 += (w * 4);
 888         dp2 += (w * 4);
 889      }
 890 #endif
 891 }
 892
 893 void
 894 evas_common_convert_yuv_422_601_rgba(DATA8 **src, DATA8 *dst, int w, int h)
 895 {
 896 #ifdef BUILD_C
 897    if (!initted) _evas_yuv_init();
 898    initted = 1;
 899    _evas_yuy2torgb_raster(src, dst, w, h);
 900 #endif
 901 }
 902
 903 void
 904 evas_common_convert_yuv_420_601_rgba(DATA8 **src, DATA8 *dst, int w, int h)
 905 {
 906 #ifdef BUILD_C
 907    if (!initted) _evas_yuv_init();
 908    initted = 1;
 909    _evas_nv12torgb_raster(src, dst, w, h);
 910 #endif
 911 }
 912
 913 void
 914 evas_common_convert_yuv_420T_601_rgba(DATA8 **src, DATA8 *dst, int w, int h)
 915 {
 916 #ifdef BUILD_C
 917    if (initted) _evas_yuv_init();
 918    initted = 1;
 919    _evas_nv12tiledtorgb_raster(src, dst, w, h);
 920 #endif
 921 }
 922
 923 static void
 924 _evas_yuy2torgb_raster(unsigned char **yuv, unsigned char *rgb, int w, int h)
 925 {
 926 #ifdef BUILD_C
 927    int xx, yy;
 928    int y, u, v;
 929    unsigned char *yp1, *yp2, *up, *vp;
 930    unsigned char *dp1;
 931
 932    dp1 = rgb;
 933
 934    /* destination pointers */
 935    for (yy = 0; yy < h; yy++)
 936      {
 937         /* plane pointers */
 938         unsigned char *line;
 939
 940         line = yuv[yy];
 941         yp1 = line + 0;
 942         up = line + 1;
 943         yp2 = line + 2;
 944         vp = line + 3;
 945
 946         for (xx = 0; xx < w; xx += 2)
 947           {
 948              int vmu;
 949
 950              /* collect u & v for 2 pixels block */
 951              u = *up;
 952              v = *vp;
 953
 954              /* save lookups */
 955              vmu = _v813[v] + _v391[u];
 956              u = _v2018[u];
 957              v = _v1596[v];
 958
 959              /* do the top 2 pixels of the 2x2 block which shared u & v */
 960              /* yuv to rgb */
 961              y = _v1164[*yp1];
 962              *((DATA32 *) dp1) = 0xff000000 + RGB_JOIN(LUT_CLIP(y + v), LUT_CLIP(y - vmu), LUT_CLIP(y + u));
 963
 964              dp1 += 4;
 965
 966              /* yuv to rgb */
 967              y = _v1164[*yp2];
 968              *((DATA32 *) dp1) = 0xff000000 + RGB_JOIN(LUT_CLIP(y + v), LUT_CLIP(y - vmu), LUT_CLIP(y + u));
 969
 970              dp1 += 4;
 971
 972              yp1 += 4; yp2 += 4; up += 4; vp += 4;
 973           }
 974      }
 975 #endif
 976 }
 977
 978 #ifdef BUILD_C
 979 static inline void
 980 _evas_yuv2rgb_420_raster(unsigned char *yp1, unsigned char *yp2, unsigned char *up, unsigned char *vp,
 981                          unsigned char *dp1, unsigned char *dp2)
 982 {
 983    int y, u, v;
 984    int vmu;
 985    int rgb;
 986
 987    /* collect u & v for 4 pixels block */
 988    u = *up;
 989    v = *vp;
 990
 991    /* save lookups */
 992 #ifdef MEM_BP
 993    vmu = _v813[v] + _v391[u];
 994    u = _v2018[u];
 995    v = _v1596[v];
 996 #else
 997    u -= 128;
 998    v -= 128;
 999    vmu = v * CGV + u * CGU;
1000    u = u * CBU;
1001    v = v * CRV;
1002 #endif
1003
1004    /* do the top 2 pixels of the 2x2 block which shared u & v */
1005    /* yuv to rgb */
1006 #ifdef MEM_BP
1007    y = _v1164[*yp1];
1008    rgb = RGB_JOIN(LUT_CLIP(y + v), LUT_CLIP(y - vmu), LUT_CLIP(y + u));
1009 #else
1010    y = (*yp1 - 16 ) * YMUL;
1011    rgb = RGB_JOIN(LUT_CLIP(((y + v) >> 16)),
1012                   LUT_CLIP(((y - vmu + OFF) >> 16)),
1013                   LUT_CLIP(((y + u + OFF) >> 16)));
1014 #endif
1015    *((DATA32 *) dp1) = 0xff000000 + rgb;
1016
1017    dp1 += 4; yp1++;
1018
1019    /* yuv to rgb */
1020 #ifdef MEM_BP
1021    y = _v1164[*yp1];
1022    rgb = RGB_JOIN(LUT_CLIP(y + v), LUT_CLIP(y - vmu), LUT_CLIP(y + u));
1023 #else
1024    y = (*yp1 - 16 ) * YMUL;
1025    rgb = RGB_JOIN(LUT_CLIP(((y + v) >> 16)),
1026                   LUT_CLIP(((y - vmu + OFF) >> 16)),
1027                   LUT_CLIP(((y + u + OFF) >> 16)));
1028 #endif
1029    *((DATA32 *) dp1) = 0xff000000 + rgb;
1030
1031    /* do the bottom 2 pixels of the 2x2 block which shared u & v */
1032    /* yuv to rgb */
1033 #ifdef MEM_BP
1034    y = _v1164[*yp2];
1035    rgb = RGB_JOIN(LUT_CLIP(y + v), LUT_CLIP(y - vmu), LUT_CLIP(y + u));
1036 #else
1037    y = (*yp2 - 16 ) * YMUL;
1038    rgb = RGB_JOIN(LUT_CLIP(((y + v) >> 16)),
1039                   LUT_CLIP(((y - vmu + OFF) >> 16)),
1040                   LUT_CLIP(((y + u + OFF) >> 16)));
1041 #endif
1042    *((DATA32 *) dp2) = 0xff000000 + rgb;
1043
1044    dp2 += 4; yp2++;
1045
1046    /* yuv to rgb */
1047 #ifdef MEM_BP
1048    y = _v1164[*yp2];
1049    rgb = RGB_JOIN(LUT_CLIP(y + v), LUT_CLIP(y - vmu), LUT_CLIP(y + u));
1050 #else
1051    y = (*yp2 - 16 ) * YMUL;
1052    rgb = RGB_JOIN(LUT_CLIP(((y + v) >> 16)),
1053                   LUT_CLIP(((y - vmu + OFF) >> 16)),
1054                   LUT_CLIP(((y + u + OFF) >> 16)));
1055 #endif
1056    *((DATA32 *) dp2) = 0xff000000 + rgb;
1057 }
1058 #endif
1059
1060 static void
1061 _evas_nv12tiledtorgb_raster(unsigned char **yuv, unsigned char *rgb, int w, int h)
1062 {
1063 #ifdef BUILD_C
1064
1065 #define HANDLE_MACROBLOCK(YP1, YP2, UP, VP, DP1, DP2)                   \
1066    {                                                                    \
1067      int i;                                                             \
1068      int j;                                                             \
1069                                                                         \
1070      for (i = 0; i < 32; i += 2)                                        \
1071        {                                                                \
1072           for (j = 0; j < 64; j += 2)                                   \
1073             {                                                           \
1074                _evas_yuv2rgb_420_raster(YP1, YP2, UP, VP, DP1, DP2);    \
1075                                                                         \
1076                /* the previous call just rendered 2 pixels per lines */ \
1077                DP1 += 8; DP2 += 8;                                      \
1078                                                                         \
1079                /* and took for that 2 lines with 2 Y, 1 U and 1 V. Don't forget U & V are in the same plane */ \
1080                YP1 += 2; YP2 += 2; UP += 2; VP += 2;                    \
1081             }                                                           \
1082                                                                         \
1083           DP1 += sizeof (int) * ((w << 1) - 64);                        \
1084           DP2 += sizeof (int) * ((w << 1) - 64);                        \
1085           YP1 += 64;                                                    \
1086           YP2 += 64;                                                    \
1087        }                                                                \
1088    }
1089
1090    /* One macro block is 32 lines of Y and 16 lines of UV */
1091    const int offset_value[2] = { 0, 64 * 16 };
1092    int mb_x, mb_y, mb_w, mb_h;
1093    int base_h;
1094    int uv_x, uv_y, uv_step;
1095    int stride;
1096
1097    /* Idea iterate over each macroblock and convert each of them using _evas_nv12torgb_raster */
1098
1099    /* The layout of the Y macroblock order in RGB non tiled space : */
1100    /* --------------------------------------------------- */
1101    /* | 0  | 1  | 6  | 7  | 8  | 9  | 14 | 15 | 16 | 17 | */
1102    /* --------------------------------------------------- */
1103    /* | 2  | 3  | 4  | 5  | 10 | 11 | 12 | 13 | 18 | 19 | */
1104    /* --------------------------------------------------- */
1105    /* | 20 | 21 | 26 | 27 | 28 | 29 | 34 | 35 | 36 | 37 | */
1106    /* --------------------------------------------------- */
1107    /* | 22 | 23 | 24 | 25 | 30 | 31 | 32 | 33 | 38 | 39 | */
1108    /* --------------------------------------------------- */
1109    /* | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | */
1110    /* --------------------------------------------------- */
1111    /* The layout of the UV macroblock order in the same RGB non tiled space : */
1112    /* --------------------------------------------------- */
1113    /* |    |    |    |    |    |    |    |    |    |    | */
1114    /* - 0  - 1  - 6  - 7  - 8  - 9  - 14 - 15 - 16 - 17 - */
1115    /* |    |    |    |    |    |    |    |    |    |    | */
1116    /* --------------------------------------------------- */
1117    /* |    |    |    |    |    |    |    |    |    |    | */
1118    /* - 2  - 3  - 4  - 5  - 10 - 11 - 12 - 13 - 18 - 19 - */
1119    /* |    |    |    |    |    |    |    |    |    |    | */
1120    /* --------------------------------------------------- */
1121    /* |    |    |    |    |    |    |    |    |    |    | */
1122    /* - 20 - 21 - 22 - 22 - 23 - 24 - 25 - 26 - 27 - 28 - */
1123
1124    /* the number of macroblock should be a multiple of 64x32 */
1125    mb_w = w / 64;
1126    mb_h = h / 32;
1127
1128    base_h = (mb_h >> 1) + (mb_h & 0x1);
1129    stride = w * sizeof (int);
1130
1131    uv_x = 0; uv_y = 0;
1132
1133    /* In this format we linearize macroblock on two line to form a Z and it's invert */
1134    for (mb_y = 0; mb_y < (mb_h >> 1); mb_y++)
1135      {
1136         int step = 2;
1137         int offset = 0;
1138         int x = 0;
1139         int rmb_x = 0;
1140         int ry[2];
1141
1142         ry[0] = mb_y * 2 * 32 * stride;
1143         ry[1] = ry[0] + 32 * stride;
1144
1145         uv_step = (mb_y & 0x1) == 0 ? 4 : 0;
1146         uv_x = (mb_y & 0x1) == 0 ? 0 : 2 * 64 * 32;
1147
1148         for (mb_x = 0; mb_x < mb_w * 2; mb_x++, rmb_x += 64 * 32)
1149           {
1150             unsigned char *yp1, *yp2, *up, *vp;
1151             unsigned char *dp1, *dp2;
1152
1153             dp1 = rgb + x + ry[offset];
1154             dp2 = dp1 + stride;
1155
1156             yp1 = yuv[mb_y] + rmb_x;
1157             yp2 = yp1 + 64;
1158
1159             /* UV plane is two time less bigger in pixel count, but it old two bytes each times */
1160             up = yuv[(mb_y >> 1) + base_h] + uv_x + offset_value[offset];
1161             vp = up + 1;
1162
1163             HANDLE_MACROBLOCK(yp1, yp2, up, vp, dp1, dp2);
1164
1165             step++;
1166             if ((step & 0x3) == 0)
1167               {
1168                 offset = 1 - offset;
1169                 x -= 64 * sizeof (int);
1170                 uv_x -= 64 * 32;
1171               }
1172             else
1173               {
1174                 x += 64 * sizeof (int);
1175                 uv_x += 64 * 32;
1176               }
1177
1178             uv_step++;
1179             if (uv_step == 8)
1180               {
1181                 uv_step = 0;
1182                 uv_x += 4 * 64 * 32;
1183               }
1184           }
1185      }
1186
1187    if (mb_h & 0x1)
1188      {
1189         int x = 0;
1190         int ry;
1191
1192         ry = mb_y << 1;
1193
1194         uv_step = 0;
1195         uv_x = 0;
1196
1197         for (mb_x = 0; mb_x < mb_w; mb_x++, x++, uv_x++)
1198           {
1199              unsigned char *yp1, *yp2, *up, *vp;
1200              unsigned char *dp1, *dp2;
1201
1202              dp1 = rgb + (x * 64 + (ry * 32 * w)) * sizeof (int);
1203              dp2 = dp1 + sizeof (int) * w;
1204
1205              yp1 = yuv[mb_y] + mb_x * 64 * 32;
1206              yp2 = yp1 + 64;
1207
1208              up = yuv[mb_y / 2 + base_h] + uv_x * 64 * 32;
1209              vp = up + 1;
1210
1211              HANDLE_MACROBLOCK(yp1, yp2, up, vp, dp1, dp2);
1212           }
1213      }
1214 #endif
1215 }
1216
1217 static void
1218 _evas_nv12torgb_raster(unsigned char **yuv, unsigned char *rgb, int w, int h)
1219 {
1220 #ifdef BUILD_C
1221    int xx, yy;
1222    unsigned char *yp1, *yp2, *up, *vp;
1223    unsigned char *dp1;
1224    unsigned char *dp2;
1225
1226    dp1 = rgb;
1227    dp2 = dp1 + sizeof (int) * w;
1228
1229    for (yy = 0; yy < h; yy++)
1230      {
1231         yp1 = yuv[yy++];
1232         yp2 = yuv[yy];
1233
1234         up = yuv[h + (yy >> 1)];
1235         vp = up + 1;
1236
1237         for (xx = 0; xx < w; xx += 2)
1238           {
1239              _evas_yuv2rgb_420_raster(yp1, yp2, up, vp, dp1, dp2);
1240
1241              /* the previous call just rendered 2 pixels per lines */
1242              dp1 += 8; dp2 += 8;
1243
1244              /* and took for that 2 lines with 2 Y, 1 U and 1 V. Don't forget U & V are in the same plane */
1245              yp1 += 2; yp2 += 2; up += 2; vp += 2;
1246           }
1247
1248         /* jump one line */
1249         dp1 += sizeof (int) * w;
1250         dp2 += sizeof (int) * w;
1251         yp1 += w;
1252         yp2 += w;
1253      }
1254 #endif
1255 }
1256
1257 #endif
1258