1 #include "evas_common.h"
2 #include "evas_convert_yuv.h"
4 #if defined BUILD_MMX || defined BUILD_SSE
8 #if defined HAVE_ALTIVEC_H
18 #ifdef BUILD_CONVERT_YUV
/* Forward declarations: one-time lookup-table init plus the per-layout
 * YUV->RGB converters (YV12 via SSE/MMX/AltiVec/"diz"/plain C raster,
 * packed YUY2, NV12, and tiled NV12). */
20 static void _evas_yuv_init (void);
21 static void _evas_yv12torgb_sse (unsigned char **yuv, unsigned char *rgb, int w, int h);
22 static void _evas_yv12torgb_mmx (unsigned char **yuv, unsigned char *rgb, int w, int h);
24 static void _evas_yv12torgb_altivec(unsigned char **yuv, unsigned char *rgb, int w, int h);
25 static void _evas_yv12torgb_diz (unsigned char **yuv, unsigned char *rgb, int w, int h);
27 static void _evas_yv12torgb_raster (unsigned char **yuv, unsigned char *rgb, int w, int h);
28 static void _evas_yuy2torgb_raster (unsigned char **yuv, unsigned char *rgb, int w, int h);
29 static void _evas_nv12torgb_raster (unsigned char **yuv, unsigned char *rgb, int w, int h);
30 static void _evas_nv12tiledtorgb_raster(unsigned char **yuv, unsigned char *rgb, int w, int h);
40 /* calculation float resolution in bits */
41 /* ie RES = 6 is 10.6 fixed point */
42 /* RES = 8 is 8.8 fixed point */
43 /* RES = 4 is 12.4 fixed point */
44 /* NB: going above 6 will lead to overflow... :( */
/* Fixed-point helpers.  RES is the working resolution in bits (see the
 * comment block above); BITRES is the resolution of the raw coefficients.
 * RZ() rescales a BITRES fixed-point constant down to RES bits; FOUR()
 * replicates one value into a 4-lane initializer for the 4x16-bit
 * MMX/SSE constant tables below.
 * Fix: macro parameters are now parenthesized so compound arguments
 * such as RZ(a + b) group correctly under '>>' precedence. */
#define RZ(i) ((i) >> (BITRES - RES))
#define FOUR(i) {(i), (i), (i), (i)}
/* MMX/SSE constant tables: each BT.601 fixed-point coefficient is
 * replicated across four 16-bit lanes, 8-byte aligned for movq loads.
 * NOTE(review): volatile presumably keeps the compiler from folding the
 * loads inside the asm-macro blocks -- confirm. */
50 #if defined BUILD_MMX || defined BUILD_SSE
51 __attribute__ ((aligned (8))) const volatile unsigned short _const_crvcrv[4] = FOUR(RZ(CRV));
52 __attribute__ ((aligned (8))) const volatile unsigned short _const_cbucbu[4] = FOUR(RZ(CBU));
53 __attribute__ ((aligned (8))) const volatile unsigned short _const_cgucgu[4] = FOUR(RZ(CGU));
54 __attribute__ ((aligned (8))) const volatile unsigned short _const_cgvcgv[4] = FOUR(RZ(CGV));
55 __attribute__ ((aligned (8))) const volatile unsigned short _const_ymul [4] = FOUR(RZ(YMUL));
56 __attribute__ ((aligned (8))) const volatile unsigned short _const_128 [4] = FOUR(128);
57 __attribute__ ((aligned (8))) const volatile unsigned short _const_32 [4] = FOUR(RZ(OFF));
58 __attribute__ ((aligned (8))) const volatile unsigned short _const_16 [4] = FOUR(16);
/* FOUR(-1): all-ones lanes, used to splat 0xff into the alpha bytes. */
59 __attribute__ ((aligned (8))) const volatile unsigned short _const_ff [4] = FOUR(-1);
/* Deref aliases so the movq_m2r() macros can take the tables as plain
 * memory operands. */
61 #define CONST_CRVCRV *_const_crvcrv
62 #define CONST_CBUCBU *_const_cbucbu
63 #define CONST_CGUCGU *_const_cgucgu
64 #define CONST_CGVCGV *_const_cgvcgv
65 #define CONST_YMUL *_const_ymul
66 #define CONST_128 *_const_128
67 #define CONST_32 *_const_32
68 #define CONST_16 *_const_16
69 #define CONST_FF *_const_ff
71 /* for C non aligned cleanup */
72 const int _crv = RZ(CRV); /* 1.596 */
73 const int _cbu = RZ(CBU); /* 2.018 */
74 const int _cgu = RZ(CGU); /* 0.391 */
75 const int _cgv = RZ(CGV); /* 0.813 */
/* AltiVec equivalents of the tables above: the same BT.601 coefficients
 * splatted across vector lanes, plus permute masks used to interleave
 * the computed R/G/B shorts into packed byte pixels. */
81 const vector unsigned short res = AVV(RES);
82 const vector signed short crv = AVV(RZ(CRV));
83 const vector signed short cbu = AVV(RZ(CBU));
84 const vector signed short cgu = AVV(RZ(CGU));
85 const vector signed short cgv = AVV(RZ(CGV));
86 const vector signed short ymul = AVV(RZ(YMUL));
87 const vector signed short c128 = AVV(128);
88 const vector signed short c32 = AVV(RZ(OFF));
89 const vector signed short c16 = AVV(16);
90 const vector unsigned char zero = AVV(0);
91 const vector signed short maxchar = AVV(255);
/* Permute selectors: pick red/green byte pairs, then merge in blue. */
92 const vector unsigned char pickrg1 = AVV(0, 0x1, 0x11, 0,
96 const vector unsigned char pickrg2 = AVV(0, 0x9, 0x19, 0,
100 const vector unsigned char pickrgb1 = AVV(0x3, 0x1, 0x2, 0x11,
103 0xf, 0xd, 0xe, 0x17);
104 const vector unsigned char pickrgb2 = AVV(0x3, 0x1, 0x2, 0x19,
107 0xf, 0xd, 0xe, 0x1f);
113 /* shortcut speedup lookup-tables */
/* _vNNNN[i] caches coefficient * (i - bias) as filled by _evas_yuv_init():
 * 1.164*(i-16) for luma and 1.596/0.813/0.391/2.018*(i-128) for chroma. */
114 static short _v1164[256];
115 static short _v1596[256];
116 static short _v813[256];
117 static short _v391[256];
118 static short _v2018[256];
/* Clamp table for intermediate values in [-384, 639]; LUT_CLIP biases
 * the index by +384 so negatives clamp to 0 and >255 clamps to 255. */
120 static unsigned char _clip_lut[1024];
121 #define LUT_CLIP(i) ((_clip_lut+384)[(i)])
/* Bit-trick clamp alternative to LUT_CLIP for values whose overflow sets
 * bit 8 (result is taken modulo the surrounding unsigned-char store).
 * Fix: parenthesized the parameter and removed the stray trailing ';' --
 * an expression macro must not carry its own statement terminator. */
#define CMP_CLIP(i) (((i) & 256) ? (~((i) >> 10)) : (i))
/* One-shot guard so the lookup tables above are built only once. */
125 static int initted = 0;
/* Public entry: planar YV12 (4:2:0, BT.601) to RGBA.  Dispatches to the
 * fastest converter the CPU supports: MMX2 (SSE path) -> MMX -> AltiVec
 * -> plain C raster.  src is an array of row pointers (Y rows first,
 * then U rows, then V rows); dst is the packed output. */
130 evas_common_convert_yuv_420p_601_rgba(DATA8 **src, DATA8 *dst, int w, int h)
134 #if defined BUILD_MMX || defined BUILD_SSE
135 evas_common_cpu_can_do(&mmx, &sse, &sse2);
144 if (evas_common_cpu_has_feature(CPU_FEATURE_MMX2))
145 _evas_yv12torgb_sse(src, dst, w, h);
146 else if (evas_common_cpu_has_feature(CPU_FEATURE_MMX))
147 _evas_yv12torgb_mmx(src, dst, w, h);
149 if (evas_common_cpu_has_feature(CPU_FEATURE_ALTIVEC))
150 _evas_yv12torgb_altivec(src, dst, w, h);
/* Pure C fallback path needs the lookup tables. */
155 if (!initted) _evas_yuv_init();
157 /* FIXME: diz may be faster sometimes */
158 _evas_yv12torgb_raster(src, dst, w, h);
163 /* Thanks to Diz for this code. i've munged it a little and turned it into */
164 /* inline macros. I tried beating it with a different algorithm using MMX */
165 /* but failed. So here we are. This is the fastest YUV->RGB i know of for */
166 /* x86. It has an issue that it doesn't convert colours accurately so the */
167 /* image looks a little "yellowy". This is a result of only 10.6 fixed point */
168 /* resolution as opposed to 16.16 in the C code. This could be fixed by */
169 /* processing half the number of pixels per cycle and going up to 32bits */
170 /* per element during compute, but it would all but negate the speedup */
171 /* from mmx I think :( It might be possible to use SSE and SSE2 here, but */
172 /* I haven't tried yet. Let's see. */
174 /* NB: XviD has almost the same code in its assembly YV12->RGB code. same */
175 /* algorithm, same constants, same all over actually, except it actually */
176 /* does a few extra memory accesses that this one doesn't, so in theory */
177 /* this code should be faster. In the end it's all just an mmx version of */
178 /* the reference implementation done with fixed point math */
/* YV12->ARGB using MMX2 (identical pipeline to the MMX version below but
 * with movntq non-temporal stores).  yuv[] is an array of row pointers:
 * rows 0..h-1 are Y, then h/2 U rows, then h/2 V rows (see up/vp below).
 * Converts 8 pixels per inner pass in RES-bit fixed point, then cleans
 * up any right-edge remainder in C. */
181 _evas_yv12torgb_sse(unsigned char **yuv, unsigned char *rgb, int w, int h)
185 register unsigned char *yp1, *up, *vp;
188 /* destination pointers */
191 for (yy = 0; yy < h; yy++)
/* U/V rows are shared by two Y rows (4:2:0 subsampling). */
195 up = yuv[h + (yy / 2)];
196 vp = yuv[h + (h / 2) + (yy / 2)];
197 for (xx = 0; xx < (w - 7); xx += 8)
/* Widen the 4 loaded U/V bytes to 16-bit lanes (mm7 = zero here). */
204 punpcklbw_r2r(mm7, mm2);
205 punpcklbw_r2r(mm7, mm3);
/* Bias: Y -= 16, U/V -= 128 per BT.601. */
212 movq_m2r(CONST_16, mm4);
213 psubsw_r2r(mm4, mm0);
214 psubsw_r2r(mm4, mm1);
216 movq_m2r(CONST_128, mm5);
217 psubsw_r2r(mm5, mm2);
218 psubsw_r2r(mm5, mm3);
/* Scale luma and compute the four chroma coefficient products. */
220 movq_m2r(CONST_YMUL, mm4);
221 pmullw_r2r(mm4, mm0);
222 pmullw_r2r(mm4, mm1);
224 movq_m2r(CONST_CRVCRV, mm7);
225 pmullw_r2r(mm3, mm7);
226 movq_m2r(CONST_CBUCBU, mm6);
227 pmullw_r2r(mm2, mm6);
228 movq_m2r(CONST_CGUCGU, mm5);
229 pmullw_r2r(mm2, mm5);
230 movq_m2r(CONST_CGVCGV, mm4);
231 pmullw_r2r(mm3, mm4);
/* Red channel: y + crv*v, saturate-packed to bytes. */
234 paddsw_r2r(mm7, mm2);
235 paddsw_r2r(mm1, mm7);
239 packuswb_r2r(mm7, mm2);
243 punpckhbw_r2r(mm7, mm2);
244 punpcklbw_r2r(mm3, mm7);
/* Green channel: y - cgu*u - cgv*v + OFF. */
248 psubsw_r2r(mm5, mm3);
249 psubsw_r2r(mm4, mm3);
250 paddsw_m2r(CONST_32, mm3);
253 psubsw_r2r(mm5, mm7);
254 psubsw_r2r(mm4, mm7);
255 paddsw_m2r(CONST_32, mm7);
259 packuswb_r2r(mm7, mm3);
263 punpckhbw_r2r(mm7, mm3);
264 punpcklbw_r2r(mm4, mm7);
/* Blue channel: y + cbu*u + OFF. */
267 movq_m2r(CONST_32, mm4);
268 paddsw_r2r(mm6, mm0);
269 paddsw_r2r(mm6, mm1);
270 paddsw_r2r(mm4, mm0);
271 paddsw_r2r(mm4, mm1);
274 packuswb_r2r(mm1, mm0);
278 punpckhbw_r2r(mm7, mm0);
279 punpcklbw_r2r(mm5, mm7);
/* Interleave B/G/R with the 0xff alpha lanes into 8 ARGB pixels. */
282 movq_m2r(CONST_FF, mm1);
286 punpckhbw_r2r(mm3, mm2);
287 punpcklbw_r2r(mm6, mm7);
288 punpckhbw_r2r(mm1, mm0);
289 punpcklbw_r2r(mm1, mm5);
292 punpckhwd_r2r(mm5, mm7);
293 punpcklwd_r2r(mm5, mm1);
296 punpckhwd_r2r(mm0, mm2);
297 punpcklwd_r2r(mm0, mm4);
/* Non-temporal stores: bypass the cache for the 32-byte output run. */
299 movntq_r2m(mm1, *(dp1));
300 movntq_r2m(mm7, *(dp1 + 8));
301 movntq_r2m(mm4, *(dp1 + 16));
302 movntq_r2m(mm2, *(dp1 + 24));
309 /* cleanup pixels that aren't a multiple of 8 pixels wide */
312 int y, u, v, r, g, b;
314 for (; xx < w; xx += 2)
/* Two horizontal pixels share one U/V sample. */
319 y = RZ(YMUL) * ((*yp1++) - 16);
320 r = LUT_CLIP((y + (_crv * v)) >> RES);
321 g = LUT_CLIP((y - (_cgu * u) - (_cgv * v) + RZ(OFF)) >> RES);
322 b = LUT_CLIP((y + (_cbu * u) + RZ(OFF)) >> RES);
323 *((DATA32 *) dp1) = 0xff000000 + RGB_JOIN(r,g,b);
327 y = RZ(YMUL) * ((*yp1++) - 16);
328 r = LUT_CLIP((y + (_crv * v)) >> RES);
329 g = LUT_CLIP((y - (_cgu * u) - (_cgv * v) + RZ(OFF)) >> RES);
330 b = LUT_CLIP((y + (_cbu * u) + RZ(OFF)) >> RES);
331 *((DATA32 *) dp1) = 0xff000000 + RGB_JOIN(r,g,b);
/* NOTE(review): presumably the non-SSE build's #else branch -- confirm
 * against the preprocessor lines elided from this chunk. */
339 _evas_yv12torgb_mmx(yuv, rgb, w, h);
/* YV12->ARGB using plain MMX.  Same fixed-point pipeline as the SSE
 * variant above, but stores with ordinary movq instead of movntq.
 * See _evas_yv12torgb_sse for the per-stage commentary. */
344 _evas_yv12torgb_mmx(unsigned char **yuv, unsigned char *rgb, int w, int h)
348 register unsigned char *yp1, *up, *vp;
351 /* destination pointers */
354 for (yy = 0; yy < h; yy++)
/* U/V rows are shared by two Y rows (4:2:0 subsampling). */
358 up = yuv[h + (yy / 2)];
359 vp = yuv[h + (h / 2) + (yy / 2)];
360 for (xx = 0; xx < (w - 7); xx += 8)
367 punpcklbw_r2r(mm7, mm2);
368 punpcklbw_r2r(mm7, mm3);
375 movq_m2r(CONST_16, mm4);
376 psubsw_r2r(mm4, mm0);
377 psubsw_r2r(mm4, mm1);
379 movq_m2r(CONST_128, mm5);
380 psubsw_r2r(mm5, mm2);
381 psubsw_r2r(mm5, mm3);
383 movq_m2r(CONST_YMUL, mm4);
384 pmullw_r2r(mm4, mm0);
385 pmullw_r2r(mm4, mm1);
387 movq_m2r(CONST_CRVCRV, mm7);
388 pmullw_r2r(mm3, mm7);
389 movq_m2r(CONST_CBUCBU, mm6);
390 pmullw_r2r(mm2, mm6);
391 movq_m2r(CONST_CGUCGU, mm5);
392 pmullw_r2r(mm2, mm5);
393 movq_m2r(CONST_CGVCGV, mm4);
394 pmullw_r2r(mm3, mm4);
397 paddsw_r2r(mm7, mm2);
398 paddsw_r2r(mm1, mm7);
402 packuswb_r2r(mm7, mm2);
406 punpckhbw_r2r(mm7, mm2);
407 punpcklbw_r2r(mm3, mm7);
411 psubsw_r2r(mm5, mm3);
412 psubsw_r2r(mm4, mm3);
413 paddsw_m2r(CONST_32, mm3);
416 psubsw_r2r(mm5, mm7);
417 psubsw_r2r(mm4, mm7);
418 paddsw_m2r(CONST_32, mm7);
422 packuswb_r2r(mm7, mm3);
426 punpckhbw_r2r(mm7, mm3);
427 punpcklbw_r2r(mm4, mm7);
430 movq_m2r(CONST_32, mm4);
431 paddsw_r2r(mm6, mm0);
432 paddsw_r2r(mm6, mm1);
433 paddsw_r2r(mm4, mm0);
434 paddsw_r2r(mm4, mm1);
437 packuswb_r2r(mm1, mm0);
441 punpckhbw_r2r(mm7, mm0);
442 punpcklbw_r2r(mm5, mm7);
445 movq_m2r(CONST_FF, mm1);
449 punpckhbw_r2r(mm3, mm2);
450 punpcklbw_r2r(mm6, mm7);
451 punpckhbw_r2r(mm1, mm0);
452 punpcklbw_r2r(mm1, mm5);
455 punpckhwd_r2r(mm5, mm7);
456 punpcklwd_r2r(mm5, mm1);
459 punpckhwd_r2r(mm0, mm2);
460 punpcklwd_r2r(mm0, mm4);
/* Ordinary (cached) 8-byte stores -- the only difference from SSE. */
462 movq_r2m(mm1, *(dp1));
463 movq_r2m(mm7, *(dp1 + 8));
464 movq_r2m(mm4, *(dp1 + 16));
465 movq_r2m(mm2, *(dp1 + 24));
472 /* cleanup pixels that aren't a multiple of 8 pixels wide */
475 int y, u, v, r, g, b;
477 for (; xx < w; xx += 2)
482 y = RZ(YMUL) * ((*yp1++) - 16);
483 r = LUT_CLIP((y + (_crv * v)) >> RES);
484 g = LUT_CLIP((y - (_cgu * u) - (_cgv * v) + RZ(OFF)) >> RES);
485 b = LUT_CLIP((y + (_cbu * u) + RZ(OFF)) >> RES);
486 *((DATA32 *) dp1) = 0xff000000 + RGB_JOIN(r,g,b);
490 y = RZ(YMUL) * ((*yp1++) - 16);
491 r = LUT_CLIP((y + (_crv * v)) >> RES);
492 g = LUT_CLIP((y - (_cgu * u) - (_cgv * v) + RZ(OFF)) >> RES);
493 b = LUT_CLIP((y + (_cbu * u) + RZ(OFF)) >> RES);
494 *((DATA32 *) dp1) = 0xff000000 + RGB_JOIN(r,g,b);
/* NOTE(review): presumably the non-MMX build's #else fallback to the C
 * raster converter -- confirm against the elided preprocessor lines. */
502 _evas_yv12torgb_raster(yuv, rgb, w, h);
/* YV12->ARGB using AltiVec.  Processes an 8x2 pixel block per inner
 * iteration: 8 Y samples on each of two rows sharing the same 4 U and
 * 4 V samples.  The leading "/ N /" comments appear to be per-op cycle
 * or latency annotations (NOTE(review): confirm -- inherited style). */
508 _evas_yv12torgb_altivec(unsigned char **yuv, unsigned char *rgb, int w, int h)
513 unsigned char *yp1, *yp2, *up, *vp;
514 unsigned char *dp1, *dp2;
515 vector signed short y, u, v;
516 vector signed short r, g, b;
517 vector signed short tmp1, tmp2, tmp3;
518 vector unsigned char yperm, uperm, vperm, rgb1, rgb2;
519 vector unsigned char alpha;
521 /* handy halved w & h */
529 /* destination pointers */
/* Build the 0xff alpha byte pattern once, outside the loops. */
533 alpha = vec_mergeh((vector unsigned char)AVV(255), zero);
534 alpha = (vector unsigned char)vec_mergeh((vector unsigned short)alpha,
535 (vector unsigned short)zero);
537 for (yy = 0; yy < h2; yy++)
539 for (xx = 0; xx < w2; xx += 4)
543 * Load 4 y and 4 u & v pixels for the 8x2 pixel block.
545 /* 3 */ tmp3 = (vector signed short)vec_lde(0, (unsigned int *)yp1);
546 /* 3 */ tmp1 = (vector signed short)vec_lde(0, (unsigned int *)up);
547 /* 3 */ tmp2 = (vector signed short)vec_lde(0, (unsigned int *)vp);
549 /* Prepare for aligning the data in their vectors */
550 /* 3 */ yperm = vec_lvsl(0, yp1);
551 /* 3 */ uperm = vec_lvsl(0, up);
552 /* 3 */ vperm = vec_lvsl(0, vp);
555 /* Save y and load the next 4 y pixels for a total of 8 */
556 /* 2 */ y = vec_perm(tmp3, tmp3, yperm);
557 /* 3 */ tmp3 = (vector signed short)vec_lde(0, (unsigned int *)yp1);
559 /* Setup and calculate the 4 u pixels */
560 /* 2 */ tmp1 = vec_perm(tmp1, tmp1, uperm);
561 /* 2 */ tmp2 = vec_perm(tmp2, tmp2, vperm);
563 /* Avoid dependency stalls on yperm and calculate the 4 u values */
564 /* 3 */ yperm = vec_lvsr(12, yp1);
565 /* 1 */ tmp1 = (vector signed short)vec_mergeh((vector unsigned char)tmp1,
566 (vector unsigned char)tmp1);
567 /* 1 */ u = (vector signed short)vec_mergeh(zero,
568 (vector unsigned char)tmp1);
570 /* 1 */ u = vec_sub(u, c128);
571 /* 2 */ tmp3 = vec_perm(tmp3, tmp3, yperm);
573 /* Setup and calculate the 4 v values */
574 /* 1 */ tmp2 = (vector signed short)vec_mergeh((vector unsigned char)tmp2,
575 (vector unsigned char)tmp2);
576 /* 1 */ v = (vector signed short)vec_mergeh(zero,
577 (vector unsigned char)tmp2);
578 /* 4 */ tmp2 = vec_mladd(cgu, u, (vector signed short)zero);
579 /* 1 */ v = vec_sub(v, c128);
581 /* Move the data into y and start loading the next 4 pixels */
582 /* 1 */ y = (vector signed short)vec_mergeh(zero,
583 (vector unsigned char)y);
584 /* 1 */ tmp3 = (vector signed short)vec_mergeh(zero,
585 (vector unsigned char)tmp3);
586 /* 1 */ y = vec_or(y, tmp3);
588 /* Finish calculating y */
589 /* 1 */ y = vec_sub(y, c16);
590 /* 4 */ y = vec_mladd(ymul, y, (vector signed short)zero);
592 /* Perform non-dependent multiplies first. */
593 /* 4 */ tmp1 = vec_mladd(crv, v, y);
594 /* 4 */ tmp2 = vec_mladd(cgv, v, tmp2);
595 /* 4 */ tmp3 = vec_mladd(cbu, u, y);
597 /* Calculate rgb values */
598 /* 1 */ r = vec_sra(tmp1, res);
600 /* 1 */ tmp2 = vec_sub(y, tmp2);
601 /* 1 */ tmp2 = vec_add(tmp2, c32);
602 /* 1 */ g = vec_sra(tmp2, res);
604 /* 1 */ tmp3 = vec_add(tmp3, c32);
605 /* 1 */ b = vec_sra(tmp3, res);
607 /* Bound to 0 <= x <= 255 */
608 /* 1 */ r = vec_min(r, maxchar);
609 /* 1 */ g = vec_min(g, maxchar);
610 /* 1 */ b = vec_min(b, maxchar);
611 /* 1 */ r = vec_max(r, (vector signed short)zero);
612 /* 1 */ g = vec_max(g, (vector signed short)zero);
613 /* 1 */ b = vec_max(b, (vector signed short)zero);
615 /* Combine r, g and b. */
616 /* 2 */ rgb1 = vec_perm((vector unsigned char)r, (vector unsigned char)g,
618 /* 2 */ rgb2 = vec_perm((vector unsigned char)r, (vector unsigned char)g,
621 /* 2 */ rgb1 = vec_perm(rgb1, (vector unsigned char)b, pickrgb1);
622 /* 2 */ rgb2 = vec_perm(rgb2, (vector unsigned char)b, pickrgb2);
624 /* 1 */ rgb1 = vec_or(alpha, rgb1);
625 /* 1 */ rgb2 = vec_or(alpha, rgb2);
627 /* 3 */ vec_stl(rgb1, 0, dp1);
629 /* 3 */ vec_stl(rgb2, 0, dp1);
632 * Begin the second row calculations
636 * Load 4 y pixels for the 8x2 pixel block.
638 /* 3 */ yperm = vec_lvsl(0, yp2);
639 /* 3 */ tmp3 = (vector signed short)vec_lde(0, (unsigned int *)yp2);
642 /* Save y and load the next 4 y pixels for a total of 8 */
643 /* 2 */ y = vec_perm(tmp3, tmp3, yperm);
644 /* 3 */ yperm = vec_lvsr(12, yp2);
645 /* 3 */ tmp3 = (vector signed short)vec_lde(0, (unsigned int *)yp2);
646 /* 1 */ y = (vector signed short)vec_mergeh(zero,
647 (vector unsigned char)y);
649 /* Avoid dependency stalls on yperm */
650 /* 2 */ tmp3 = vec_perm(tmp3, tmp3, yperm);
651 /* 1 */ tmp3 = (vector signed short)vec_mergeh(zero,
652 (vector unsigned char)tmp3);
653 /* 1 */ y = vec_or(y, tmp3);
655 /* Start the calculation for g */
656 /* 4 */ tmp2 = vec_mladd(cgu, u, (vector signed short)zero);
658 /* Finish calculating y */
659 /* 1 */ y = vec_sub(y, c16);
660 /* 4 */ y = vec_mladd(ymul, y, (vector signed short)zero);
662 /* Perform non-dependent multiplies first. */
663 /* 4 */ tmp2 = vec_mladd(cgv, v, tmp2);
664 /* 4 */ tmp1 = vec_mladd(crv, v, y);
665 /* 4 */ tmp3 = vec_mladd(cbu, u, y);
667 /* Calculate rgb values */
668 /* 1 */ r = vec_sra(tmp1, res);
670 /* 1 */ tmp2 = vec_sub(y, tmp2);
671 /* 1 */ tmp2 = vec_add(tmp2, c32);
672 /* 1 */ g = vec_sra(tmp2, res);
674 /* 1 */ tmp3 = vec_add(tmp3, c32);
675 /* 1 */ b = vec_sra(tmp3, res);
677 /* Bound to 0 <= x <= 255 */
678 /* 1 */ r = vec_min(r, maxchar);
679 /* 1 */ g = vec_min(g, maxchar);
680 /* 1 */ b = vec_min(b, maxchar);
681 /* 1 */ r = vec_max(r, (vector signed short)zero);
682 /* 1 */ g = vec_max(g, (vector signed short)zero);
683 /* 1 */ b = vec_max(b, (vector signed short)zero);
685 /* Combine r, g and b. */
686 /* 2 */ rgb1 = vec_perm((vector unsigned char)r, (vector unsigned char)g,
688 /* 2 */ rgb2 = vec_perm((vector unsigned char)r, (vector unsigned char)g,
691 /* 2 */ rgb1 = vec_perm(rgb1, (vector unsigned char)b, pickrgb1);
692 /* 2 */ rgb2 = vec_perm(rgb2, (vector unsigned char)b, pickrgb2);
694 /* 1 */ rgb1 = vec_or(alpha, rgb1);
695 /* 1 */ rgb2 = vec_or(alpha, rgb2);
697 /* 3 */ vec_stl(rgb1, 0, dp2);
699 /* 3 */ vec_stl(rgb2, 0, dp2);
701 /* Increment the YUV data pointers to the next set of pixels. */
707 /* Move the destination pointers to the next set of pixels. */
712 /* jump down one line since we are doing 2 at once */
/* NOTE(review): presumably the non-AltiVec build's #else fallback --
 * confirm against the elided preprocessor lines. */
719 _evas_yv12torgb_diz(yuv, rgb, w, h);
/* Fill the shortcut multiply tables (BT.601 coefficients applied to the
 * biased 8-bit input range) and the clamp table used by LUT_CLIP, which
 * covers intermediate values from -384 to 639 inclusive. */
730 for (i = 0; i < 256; i++)
732 _v1164[i] = (int)(((float)(i - 16 )) * 1.164);
734 _v1596[i] = (int)(((float)(i - 128)) * 1.596);
735 _v813[i] = (int)(((float)(i - 128)) * 0.813);
737 _v391[i] = (int)(((float)(i - 128)) * 0.391);
738 _v2018[i] = (int)(((float)(i - 128)) * 2.018);
741 for (i = -384; i < 640; i++)
743 _clip_lut[i+384] = i < 0 ? 0 : (i > 255) ? 255 : i;
/* Diz's reference fixed-point C converter: full 16.16 precision, working
 * on 2x2 pixel blocks that share one U/V sample (4:2:0).  Slower than
 * the LUT raster version but kept as the accurate fallback. */
750 _evas_yv12torgb_diz(unsigned char **yuv, unsigned char *rgb, int w, int h)
754 int y, u, v, r, g, b;
755 unsigned char *yp1, *yp2, *up, *vp;
756 unsigned char *dp1, *dp2;
757 int crv, cbu, cgu, cgv;
759 /* destination pointers */
/* Full-resolution (16.16) coefficients, unlike the RZ()-scaled C
 * cleanup path in the MMX/SSE converters. */
763 crv = CRV; /* 1.596 */
764 cbu = CBU; /* 2.018 */
765 cgu = CGU; /* 0.391 */
766 cgv = CGV; /* 0.813 */
768 for (yy = 0; yy < h; yy += 2)
773 up = yuv[h + (yy / 2)];
774 vp = yuv[h + (h / 2) + (yy / 2)];
775 for (xx = 0; xx < w; xx += 2)
777 /* collect u & v for 2x2 pixel block */
781 /* do the top 2 pixels of the 2x2 block which shared u & v */
783 y = YMUL * ((*yp1++) - 16);
784 r = LUT_CLIP((y + (crv * v)) >> 16);
785 g = LUT_CLIP((y - (cgu * u) - (cgv * v) + OFF) >>16);
786 b = LUT_CLIP((y + (cbu * u) + OFF) >> 16);
787 *((DATA32 *) dp1) = 0xff000000 + RGB_JOIN(r,g,b);
792 y = YMUL * ((*yp1++) - 16);
793 r = LUT_CLIP((y + (crv * v)) >> 16);
794 g = LUT_CLIP((y - (cgu * u) - (cgv * v) + OFF) >>16);
795 b = LUT_CLIP((y + (cbu * u) + OFF) >> 16);
796 *((DATA32 *) dp1) = 0xff000000 + RGB_JOIN(r,g,b);
800 /* do the bottom 2 pixels */
802 y = YMUL * ((*yp2++) - 16);
803 r = LUT_CLIP((y + (crv * v)) >> 16);
804 g = LUT_CLIP((y - (cgu * u) - (cgv * v) + OFF) >>16);
805 b = LUT_CLIP((y + (cbu * u) + OFF) >> 16);
806 *((DATA32 *) dp2) = 0xff000000 + RGB_JOIN(r,g,b);
811 y = YMUL * ((*yp2++) - 16);
812 r = LUT_CLIP((y + (crv * v)) >> 16);
813 g = LUT_CLIP((y - (cgu * u) - (cgv * v) + OFF) >>16);
814 b = LUT_CLIP((y + (cbu * u) + OFF) >> 16);
815 *((DATA32 *) dp2) = 0xff000000 + RGB_JOIN(r,g,b);
819 /* jump down one line since we are doing 2 at once */
/* LUT-driven C converter: for each 2x2 block the premultiplied chroma
 * terms (v = _v1596[v], u = _v2018[u], vmu = cgv+cgu mix -- filled in by
 * the elided lines) are fetched once and each channel is clamped through
 * LUT_CLIP.  The fastest pure-C YV12 path. */
828 _evas_yv12torgb_raster(unsigned char **yuv, unsigned char *rgb, int w, int h)
833 unsigned char *yp1, *yp2, *up, *vp;
834 unsigned char *dp1, *dp2;
836 /* destination pointers */
840 for (yy = 0; yy < h; yy += 2)
845 up = yuv[h + (yy / 2)];
846 vp = yuv[h + (h / 2) + (yy / 2)];
847 for (xx = 0; xx < w; xx += 2)
851 /* collect u & v for 2x2 pixel block */
/* Combined green-channel chroma contribution, shared by all 4 pixels. */
856 vmu = _v813[v] + _v391[u];
860 /* do the top 2 pixels of the 2x2 block which shared u & v */
863 *((DATA32 *) dp1) = 0xff000000 + RGB_JOIN(LUT_CLIP(y + v), LUT_CLIP(y - vmu), LUT_CLIP(y + u));
869 *((DATA32 *) dp1) = 0xff000000 + RGB_JOIN(LUT_CLIP(y + v), LUT_CLIP(y - vmu), LUT_CLIP(y + u));
873 /* do the bottom 2 pixels */
876 *((DATA32 *) dp2) = 0xff000000 + RGB_JOIN(LUT_CLIP(y + v), LUT_CLIP(y - vmu), LUT_CLIP(y + u));
882 *((DATA32 *) dp2) = 0xff000000 + RGB_JOIN(LUT_CLIP(y + v), LUT_CLIP(y - vmu), LUT_CLIP(y + u));
886 /* jump down one line since we are doing 2 at once */
/* Public entry: packed YUY2 (4:2:2, BT.601) to RGBA.  Lazily builds the
 * shared lookup tables on first use. */
894 evas_common_convert_yuv_422_601_rgba(DATA8 **src, DATA8 *dst, int w, int h)
897 if (!initted) _evas_yuv_init();
899 _evas_yuy2torgb_raster(src, dst, w, h);
/* Public entry: semi-planar NV12 (4:2:0, BT.601) to RGBA.  Lazily builds
 * the shared lookup tables on first use. */
904 evas_common_convert_yuv_420_601_rgba(DATA8 **src, DATA8 *dst, int w, int h)
907 if (!initted) _evas_yuv_init();
909 _evas_nv12torgb_raster(src, dst, w, h);
914 evas_common_convert_yuv_420T_601_rgba(DATA8 **src, DATA8 *dst, int w, int h)
917 if (initted) _evas_yuv_init();
919 _evas_nv12tiledtorgb_raster(src, dst, w, h);
/* Packed YUY2 converter: each 4-byte group holds Y0 U Y1 V, i.e. two
 * horizontal pixels share one U/V pair.  Uses the same LUT pipeline as
 * _evas_yv12torgb_raster. */
924 _evas_yuy2torgb_raster(unsigned char **yuv, unsigned char *rgb, int w, int h)
929 unsigned char *yp1, *yp2, *up, *vp;
934 /* destination pointers */
935 for (yy = 0; yy < h; yy++)
946 for (xx = 0; xx < w; xx += 2)
950 /* collect u & v for 2 pixels block */
955 vmu = _v813[v] + _v391[u];
959 /* do the top 2 pixels of the 2x2 block which shared u & v */
962 *((DATA32 *) dp1) = 0xff000000 + RGB_JOIN(LUT_CLIP(y + v), LUT_CLIP(y - vmu), LUT_CLIP(y + u));
968 *((DATA32 *) dp1) = 0xff000000 + RGB_JOIN(LUT_CLIP(y + v), LUT_CLIP(y - vmu), LUT_CLIP(y + u));
/* Advance 4 bytes = 2 packed pixels. */
972 yp1 += 4; yp2 += 4; up += 4; vp += 4;
/* Convert one 2x2 block of 4:2:0 data -- two Y rows (yp1/yp2) sharing a
 * single U and V sample -- into four ARGB pixels at dp1/dp2.  Two
 * variants are visible below: a LUT path and a full 16.16 fixed-point
 * path; presumably selected by a preprocessor conditional elided from
 * this chunk (NOTE(review): confirm). */
980 _evas_yuv2rgb_420_raster(unsigned char *yp1, unsigned char *yp2, unsigned char *up, unsigned char *vp,
981 unsigned char *dp1, unsigned char *dp2)
987 /* collect u & v for 4 pixels block */
993 vmu = _v813[v] + _v391[u];
999 vmu = v * CGV + u * CGU;
1004 /* do the top 2 pixels of the 2x2 block which shared u & v */
1008 rgb = RGB_JOIN(LUT_CLIP(y + v), LUT_CLIP(y - vmu), LUT_CLIP(y + u));
1010 y = (*yp1 - 16 ) * YMUL;
1011 rgb = RGB_JOIN(LUT_CLIP(((y + v) >> 16)),
1012 LUT_CLIP(((y - vmu + OFF) >> 16)),
1013 LUT_CLIP(((y + u + OFF) >> 16)));
1015 *((DATA32 *) dp1) = 0xff000000 + rgb;
1022 rgb = RGB_JOIN(LUT_CLIP(y + v), LUT_CLIP(y - vmu), LUT_CLIP(y + u));
1024 y = (*yp1 - 16 ) * YMUL;
1025 rgb = RGB_JOIN(LUT_CLIP(((y + v) >> 16)),
1026 LUT_CLIP(((y - vmu + OFF) >> 16)),
1027 LUT_CLIP(((y + u + OFF) >> 16)));
1029 *((DATA32 *) dp1) = 0xff000000 + rgb;
1031 /* do the bottom 2 pixels of the 2x2 block which shared u & v */
1035 rgb = RGB_JOIN(LUT_CLIP(y + v), LUT_CLIP(y - vmu), LUT_CLIP(y + u));
1037 y = (*yp2 - 16 ) * YMUL;
1038 rgb = RGB_JOIN(LUT_CLIP(((y + v) >> 16)),
1039 LUT_CLIP(((y - vmu + OFF) >> 16)),
1040 LUT_CLIP(((y + u + OFF) >> 16)));
1042 *((DATA32 *) dp2) = 0xff000000 + rgb;
1049 rgb = RGB_JOIN(LUT_CLIP(y + v), LUT_CLIP(y - vmu), LUT_CLIP(y + u));
1051 y = (*yp2 - 16 ) * YMUL;
1052 rgb = RGB_JOIN(LUT_CLIP(((y + v) >> 16)),
1053 LUT_CLIP(((y - vmu + OFF) >> 16)),
1054 LUT_CLIP(((y + u + OFF) >> 16)));
1056 *((DATA32 *) dp2) = 0xff000000 + rgb;
/* Convert tiled NV12 (64x32-byte macroblocks laid out in Z / inverse-Z
 * order, as drawn in the diagrams below) to linear RGBA.  Each
 * macroblock is decoded 2x2 pixels at a time via
 * _evas_yuv2rgb_420_raster(). */
1061 _evas_nv12tiledtorgb_raster(unsigned char **yuv, unsigned char *rgb, int w, int h)
1065 #define HANDLE_MACROBLOCK(YP1, YP2, UP, VP, DP1, DP2) \
1070 for (i = 0; i < 32; i += 2) \
1072 for (j = 0; j < 64; j += 2) \
1074 _evas_yuv2rgb_420_raster(YP1, YP2, UP, VP, DP1, DP2); \
1076 /* the previous call just rendered 2 pixels per lines */ \
1077 DP1 += 8; DP2 += 8; \
1079 /* and took for that 2 lines with 2 Y, 1 U and 1 V. Don't forget U & V are in the same plane */ \
1080 YP1 += 2; YP2 += 2; UP += 2; VP += 2; \
1083 DP1 += sizeof (int) * ((w << 1) - 64); \
1084 DP2 += sizeof (int) * ((w << 1) - 64); \
1090 /* One macro block is 32 lines of Y and 16 lines of UV */
1091 const int offset_value[2] = { 0, 64 * 16 };
1092 int mb_x, mb_y, mb_w, mb_h;
1094 int uv_x, uv_y, uv_step;
1097 /* Idea iterate over each macroblock and convert each of them using _evas_nv12torgb_raster */
1099 /* The layout of the Y macroblock order in RGB non tiled space : */
1100 /* --------------------------------------------------- */
1101 /* | 0 | 1 | 6 | 7 | 8 | 9 | 14 | 15 | 16 | 17 | */
1102 /* --------------------------------------------------- */
1103 /* | 2 | 3 | 4 | 5 | 10 | 11 | 12 | 13 | 18 | 19 | */
1104 /* --------------------------------------------------- */
1105 /* | 20 | 21 | 26 | 27 | 28 | 29 | 34 | 35 | 36 | 37 | */
1106 /* --------------------------------------------------- */
1107 /* | 22 | 23 | 24 | 25 | 30 | 31 | 32 | 33 | 38 | 39 | */
1108 /* --------------------------------------------------- */
1109 /* | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | */
1110 /* --------------------------------------------------- */
1111 /* The layout of the UV macroblock order in the same RGB non tiled space : */
1112 /* --------------------------------------------------- */
1113 /* | | | | | | | | | | | */
1114 /* - 0 - 1 - 6 - 7 - 8 - 9 - 14 - 15 - 16 - 17 - */
1115 /* | | | | | | | | | | | */
1116 /* --------------------------------------------------- */
1117 /* | | | | | | | | | | | */
1118 /* - 2 - 3 - 4 - 5 - 10 - 11 - 12 - 13 - 18 - 19 - */
1119 /* | | | | | | | | | | | */
1120 /* --------------------------------------------------- */
1121 /* | | | | | | | | | | | */
1122 /* - 20 - 21 - 22 - 22 - 23 - 24 - 25 - 26 - 27 - 28 - */
1124 /* the number of macroblock should be a multiple of 64x32 */
1128 base_h = (mb_h >> 1) + (mb_h & 0x1);
1129 stride = w * sizeof (int);
1133 /* In this format we linearize macroblock on two line to form a Z and it's invert */
1134 for (mb_y = 0; mb_y < (mb_h >> 1); mb_y++)
1142 ry[0] = mb_y * 2 * 32 * stride;
1143 ry[1] = ry[0] + 32 * stride;
1145 uv_step = (mb_y & 0x1) == 0 ? 4 : 0;
1146 uv_x = (mb_y & 0x1) == 0 ? 0 : 2 * 64 * 32;
1148 for (mb_x = 0; mb_x < mb_w * 2; mb_x++, rmb_x += 64 * 32)
1150 unsigned char *yp1, *yp2, *up, *vp;
1151 unsigned char *dp1, *dp2;
1153 dp1 = rgb + x + ry[offset];
1156 yp1 = yuv[mb_y] + rmb_x;
/* The UV plane holds half as many samples per direction, but each
 * sample is two interleaved bytes (U then V). */
1160 up = yuv[(mb_y >> 1) + base_h] + uv_x + offset_value[offset];
1163 HANDLE_MACROBLOCK(yp1, yp2, up, vp, dp1, dp2);
/* Follow the Z / inverse-Z walk: every 4th macroblock flips between
 * the top and bottom macroblock row and steps the x position back. */
1166 if ((step & 0x3) == 0)
1168 offset = 1 - offset;
1169 x -= 64 * sizeof (int);
1174 x += 64 * sizeof (int);
1182 uv_x += 4 * 64 * 32;
/* Trailing (odd) macroblock row: plain left-to-right order. */
1197 for (mb_x = 0; mb_x < mb_w; mb_x++, x++, uv_x++)
1199 unsigned char *yp1, *yp2, *up, *vp;
1200 unsigned char *dp1, *dp2;
1202 dp1 = rgb + (x * 64 + (ry * 32 * w)) * sizeof (int);
1203 dp2 = dp1 + sizeof (int) * w;
1205 yp1 = yuv[mb_y] + mb_x * 64 * 32;
1208 up = yuv[mb_y / 2 + base_h] + uv_x * 64 * 32;
1211 HANDLE_MACROBLOCK(yp1, yp2, up, vp, dp1, dp2);
/* Linear (non-tiled) NV12 converter: Y rows in yuv[0..h-1], interleaved
 * UV rows starting at yuv[h], one UV row per two Y rows.  Decodes 2x2
 * pixel blocks via _evas_yuv2rgb_420_raster(). */
1218 _evas_nv12torgb_raster(unsigned char **yuv, unsigned char *rgb, int w, int h)
1222 unsigned char *yp1, *yp2, *up, *vp;
1227 dp2 = dp1 + sizeof (int) * w;
1229 for (yy = 0; yy < h; yy++)
1234 up = yuv[h + (yy >> 1)];
1237 for (xx = 0; xx < w; xx += 2)
1239 _evas_yuv2rgb_420_raster(yp1, yp2, up, vp, dp1, dp2);
1241 /* the previous call just rendered 2 pixels per lines */
1244 /* and took for that 2 lines with 2 Y, 1 U and 1 V. Don't forget U & V are in the same plane */
1245 yp1 += 2; yp2 += 2; up += 2; vp += 2;
/* Skip the second of the two rows just written. */
1249 dp1 += sizeof (int) * w;
1250 dp2 += sizeof (int) * w;