2 * Copyright (C) <1999> Erik Walthinsen <omega@cse.ogi.edu>
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Library General Public
6 * License as published by the Free Software Foundation; either
7 * version 2 of the License, or (at your option) any later version.
9 * This library is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Library General Public License for more details.
14 * You should have received a copy of the GNU Library General Public
15 * License along with this library; if not, write to the
16 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
17 * Boston, MA 02111-1307, USA.
/*
 * Per-sample chroma contribution tables, indexed by a raw 8-bit chroma
 * value. gst_colorspace_table_init() fills each entry with a fixed
 * coefficient times (index - 128); the C converters add these entries to
 * the luma sample: V_r_tab for red, U_g_tab + V_g_tab for green, U_b_tab
 * for blue.
 */
35 static int V_r_tab[256];
36 static int V_g_tab[256];
37 static int U_g_tab[256];
38 static int U_b_tab[256];
/*
 * Base offsets derived from CB_BASE, CB_RANGE and CR_RANGE, which are
 * defined outside the lines visible here.
 */
41 #define CR_BASE (CB_BASE*CB_RANGE)
42 #define LUM_BASE (CR_BASE*CR_RANGE)
/*
 * Smaller / larger of two values. Each argument may be evaluated twice, so
 * do not pass expressions with side effects.
 */
44 #define Min(x,y) (((x) < (y)) ? (x) : (y))
45 #define Max(x,y) (((x) > (y)) ? (x) : (y))
/*
 * Gamma-correct an 8-bit value: normalise to 0..1, raise to 1/gammaCorrect
 * with pow(), scale back to 0..255 and truncate to int. Expects a variable
 * named gammaCorrect to be in scope at the point of use.
 */
47 #define GAMMA_CORRECTION(x) ((int)(pow((x) / 255.0, 1.0 / gammaCorrect) * 255.0))
/*
 * Chroma scaling by chromaCorrect (expected in scope at the point of use),
 * with the result limited to the sample range:
 *   CHROMA_CORRECTION256  - x centred on 128, result limited to 0..255, int math
 *   CHROMA_CORRECTION128  - x centred on 0, result limited to -128..127, int math
 *   CHROMA_CORRECTION256D / CHROMA_CORRECTION128D - same, without the int
 *                           truncation (floating-point result)
 */
48 #define CHROMA_CORRECTION256(x) ((x) >= 128 \
49 ? 128 + Min(127, (int)(((x) - 128.0) * chromaCorrect)) \
50 : 128 - Min(128, (int)((128.0 - (x)) * chromaCorrect)))
51 #define CHROMA_CORRECTION128(x) ((x) >= 0 \
52 ? Min(127, (int)(((x) * chromaCorrect))) \
53 : Max(-128, (int)(((x) * chromaCorrect))))
54 #define CHROMA_CORRECTION256D(x) ((x) >= 128 \
55 ? 128.0 + Min(127.0, (((x) - 128.0) * chromaCorrect)) \
56 : 128.0 - Min(128.0, (((128.0 - (x)) * chromaCorrect))))
57 #define CHROMA_CORRECTION128D(x) ((x) >= 0 \
58 ? Min(127.0, ((x) * chromaCorrect)) \
59 : Max(-128.0, ((x) * chromaCorrect)))
62 void gst_colorspace_I420_to_rgb16 (GstColorspace * space, unsigned char *src,
64 void gst_colorspace_I420_to_rgb24 (GstColorspace * space, unsigned char *src,
66 void gst_colorspace_I420_to_rgb32 (GstColorspace * space, unsigned char *src,
69 void gst_colorspace_I420_to_bgr16_mmx (GstColorspace * space,
70 unsigned char *src, unsigned char *dest);
71 void gst_colorspace_I420_to_bgr32_mmx (GstColorspace * space,
72 unsigned char *src, unsigned char *dest);
75 void gst_colorspace_YV12_to_rgb16 (GstColorspace * space, unsigned char *src,
77 void gst_colorspace_YV12_to_rgb24 (GstColorspace * space, unsigned char *src,
79 void gst_colorspace_YV12_to_rgb32 (GstColorspace * space, unsigned char *src,
82 void gst_colorspace_YV12_to_bgr16_mmx (GstColorspace * space,
83 unsigned char *src, unsigned char *dest);
84 void gst_colorspace_YV12_to_bgr32_mmx (GstColorspace * space,
85 unsigned char *src, unsigned char *dest);
89 gst_colorspace_yuv_to_rgb16 (GstColorspace * space,
92 unsigned char *cr, unsigned char *cb, int cols, int rows);
94 gst_colorspace_yuv_to_rgb24 (GstColorspace * space,
97 unsigned char *cr, unsigned char *cb, int cols, int rows);
99 gst_colorspace_yuv_to_rgb32 (GstColorspace * space,
102 unsigned char *cr, unsigned char *cb, int cols, int rows);
104 static void gst_colorspace_yuv_to_rgb16 (GstColorspaceYUVTables * tables,
107 unsigned char *cb, unsigned char *out, int cols, int rows);
108 static void gst_colorspace_yuv_to_rgb24 (GstColorspaceYUVTables * tables,
111 unsigned char *cb, unsigned char *out, int cols, int rows);
112 static void gst_colorspace_yuv_to_rgb32 (GstColorspaceYUVTables * tables,
115 unsigned char *cb, unsigned char *out, int cols, int rows);
117 void gst_colorspace_yuv_to_bgr32_mmx (GstColorspaceYUVTables * tables,
120 unsigned char *cb, unsigned char *out, int cols, int rows);
121 extern void gst_colorspace_yuv_to_bgr16_mmx (GstColorspaceYUVTables * tables,
124 unsigned char *cb, unsigned char *out, int cols, int rows);
/*
 * Round an integer up to the next multiple of 2, 4 or 8 by adding
 * (multiple - 1) and clearing the low bits. Used below to compute the row
 * strides and plane sizes of the planar YUV input.
 */
128 #define ROUND_UP_2(x) (((x)+1)&~1)
129 #define ROUND_UP_4(x) (((x)+3)&~3)
130 #define ROUND_UP_8(x) (((x)+7)&~7)
134 gst_colorspace_I420_to_rgb32 (GstColorspace * space, unsigned char *dest,
137 unsigned char *src_U;
138 unsigned char *src_V;
140 src_U = src + ROUND_UP_4 (space->width) * ROUND_UP_2 (space->height);
142 src_U + ROUND_UP_8 (space->width) / 2 * ROUND_UP_2 (space->height) / 2;
144 gst_colorspace_yuv_to_rgb32 (space,
145 dest, src, src_U, src_V, space->width, space->height);
149 gst_colorspace_I420_to_rgb24 (GstColorspace * space, unsigned char *dest,
152 unsigned char *src_U;
153 unsigned char *src_V;
155 src_U = src + ROUND_UP_4 (space->width) * ROUND_UP_2 (space->height);
157 src_U + ROUND_UP_8 (space->width) / 2 * ROUND_UP_2 (space->height) / 2;
159 gst_colorspace_yuv_to_rgb24 (space,
160 dest, src, src_U, src_V, space->width, space->height);
164 gst_colorspace_I420_to_rgb16 (GstColorspace * space, unsigned char *dest,
167 unsigned char *src_U;
168 unsigned char *src_V;
170 src_U = src + ROUND_UP_4 (space->width) * ROUND_UP_2 (space->height);
172 src_U + ROUND_UP_8 (space->width) / 2 * ROUND_UP_2 (space->height) / 2;
174 gst_colorspace_yuv_to_rgb16 (space,
175 dest, src, src_U, src_V, space->width, space->height);
179 gst_colorspace_YV12_to_rgb32 (GstColorspace * space, unsigned char *dest,
182 unsigned char *src_U;
183 unsigned char *src_V;
185 src_V = src + ROUND_UP_4 (space->width) * ROUND_UP_2 (space->height);
187 src_V + ROUND_UP_8 (space->width) / 2 * ROUND_UP_2 (space->height) / 2;
189 gst_colorspace_yuv_to_rgb32 (space,
190 dest, src, src_U, src_V, space->width, space->height);
194 gst_colorspace_YV12_to_rgb24 (GstColorspace * space, unsigned char *dest,
197 unsigned char *src_U;
198 unsigned char *src_V;
200 src_V = src + ROUND_UP_4 (space->width) * ROUND_UP_2 (space->height);
202 src_V + ROUND_UP_8 (space->width) / 2 * ROUND_UP_2 (space->height) / 2;
204 gst_colorspace_yuv_to_rgb24 (space,
205 dest, src, src_U, src_V, space->width, space->height);
209 gst_colorspace_YV12_to_rgb16 (GstColorspace * space, unsigned char *dest,
212 unsigned char *src_U;
213 unsigned char *src_V;
215 src_V = src + ROUND_UP_4 (space->width) * ROUND_UP_2 (space->height);
217 src_V + ROUND_UP_8 (space->width) / 2 * ROUND_UP_2 (space->height) / 2;
219 gst_colorspace_yuv_to_rgb16 (space,
220 dest, src, src_U, src_V, space->width, space->height);
225 gst_colorspace_I420_to_bgr32_mmx (GstColorspace * space, unsigned char *src,
230 GST_DEBUG ("gst_colorspace_I420_to_rgb32_mmx");
232 size = space->width * space->height;
234 gst_colorspace_yuv_to_bgr32_mmx (NULL, src, /* Y component */
235 src + size, /* cr component */
236 src + size + (size >> 2), /* cb component */
237 dest, space->height, space->width);
242 gst_colorspace_I420_to_bgr16_mmx (GstColorspace * space, unsigned char *src,
247 GST_DEBUG ("gst_colorspace_I420_to_bgr16_mmx ");
249 size = space->width * space->height;
251 gst_colorspace_yuv_to_bgr16_mmx (NULL, src, /* Y component */
252 src + size, /* cr component */
253 src + size + (size >> 2), /* cb component */
254 dest, space->height, space->width);
255 GST_DEBUG ("gst_colorspace_I420_to_bgr16_mmx done");
260 gst_colorspace_YV12_to_bgr32_mmx (GstColorspace * space, unsigned char *src,
265 GST_DEBUG ("gst_colorspace_YV12_to_rgb32_mmx");
267 size = space->width * space->height;
269 gst_colorspace_yuv_to_bgr32_mmx (NULL, src, /* Y component */
270 src + size + (size >> 2), /* cb component */
271 src + size, /* cr component */
272 dest, space->height, space->width);
277 gst_colorspace_YV12_to_bgr16_mmx (GstColorspace * space, unsigned char *src,
282 GST_DEBUG ("gst_colorspace_YV12_to_bgr16_mmx ");
284 size = space->width * space->height;
286 gst_colorspace_yuv_to_bgr16_mmx (NULL, src, /* Y component */
287 src + size + (size >> 2), /* cb component */
288 src + size, /* cr component */
289 dest, space->height, space->width);
290 GST_DEBUG ("gst_colorspace_YV12_to_bgr16_mmx done");
/*
 * How many 1 bits are there in the longword.
 * Low performance, do not call often.
 *
 * a: value to inspect.
 *
 * Returns the number of set bits in a (0 when a is 0).
 */
static int
number_of_bits_set (unsigned long a)
{
  int count = 0;

  /* Test the lowest bit, then shift it out, until nothing is left. */
  while (a != 0) {
    count += (int) (a & 1);
    a >>= 1;
  }
  return count;
}
/*
 * How many 0 bits are there at most significant end of longword.
 * Low performance, do not call often.
 *
 * a: value to inspect.
 *
 * Returns the number of zero bits above the highest set bit; the full bit
 * width of unsigned long when a is 0.
 */
static int
free_bits_at_top (unsigned long a)
{
  /* Only the most significant bit set; avoids testing the sign of a cast. */
  const unsigned long top_bit = ~(~0UL >> 1);
  int count = 0;

  /* assume char is 8 bits */
  if (!a)
    return sizeof (unsigned long) * 8;

  /* Shift left until the highest set bit reaches the top position. */
  while (!(a & top_bit)) {
    a <<= 1;
    count++;
  }
  return count;
}
/*
 * How many 0 bits are there at least significant end of longword.
 * Low performance, do not call often.
 *
 * a: value to inspect.
 *
 * Returns the number of zero bits below the lowest set bit; the full bit
 * width of unsigned long when a is 0.
 */
static int
free_bits_at_bottom (unsigned long a)
{
  int count = 0;

  /* assume char is 8 bits */
  if (!a)
    return sizeof (unsigned long) * 8;

  /* Shift right until the lowest set bit reaches bit 0. */
  while (!(a & 1UL)) {
    a >>= 1;
    count++;
  }
  return count;
}
344 *--------------------------------------------------------------
346 * InitColor16Dither --
348 * To get rid of the multiply and other conversions in color
349 * dither, we use a lookup table.
355 * The lookup tables are initialized.
357 *--------------------------------------------------------------
/*
 * Build the lookup tables used by the converters.
 *
 * From the lines visible here:
 *  - the four file-level chroma tables (V_r_tab, V_g_tab, U_g_tab, U_b_tab)
 *    are filled for every 8-bit chroma value, centred on 128;
 *  - five 256-entry int tables are allocated and stored in `space`, plus
 *    three 768-entry long tables for the r/g/b-to-pixel mapping;
 *  - the pixel tables get their real values in entries 256..511 and the two
 *    end values replicated into entries 0..255 and 512..767, and `space`
 *    receives pointers offset by 256 into them.
 *
 * NOTE(review): several declarations, conditions and (presumably)
 * preprocessor guards of this function are not visible in this excerpt, so
 * which of the parts after the first loop are actually compiled cannot be
 * confirmed from here.
 */
361 gst_colorspace_table_init (GstColorspace * space)
/* Chroma contributions used by the plain C converters. */
365 for (i = 0; i < 256; i++) {
366 V_r_tab[i] = (0.419 / 0.299) * (i - 128);
367 V_g_tab[i] = -(0.299 / 0.419) * (i - 128);
368 U_g_tab[i] = -(0.114 / 0.331) * (i - 128);
369 U_b_tab[i] = (0.587 / 0.331) * (i - 128);
373 int *L_tab, *Cr_r_tab, *Cr_g_tab, *Cb_g_tab, *Cb_b_tab;
/* Channel masks of the target pixel layout: 8 bits each, red highest. */
378 long red_mask = 0xff0000;
379 long green_mask = 0x00ff00;
380 long blue_mask = 0x0000ff;
382 L_tab = space->L_tab = (int *) malloc (256 * sizeof (int));
383 Cr_r_tab = space->Cr_r_tab = (int *) malloc (256 * sizeof (int));
384 Cr_g_tab = space->Cr_g_tab = (int *) malloc (256 * sizeof (int));
385 Cb_g_tab = space->Cb_g_tab = (int *) malloc (256 * sizeof (int));
386 Cb_b_tab = space->Cb_b_tab = (int *) malloc (256 * sizeof (int));
388 r_2_pix_alloc = (long *) malloc (768 * sizeof (long));
389 g_2_pix_alloc = (long *) malloc (768 * sizeof (long));
390 b_2_pix_alloc = (long *) malloc (768 * sizeof (long));
/* NOTE(review): what happens after the message is printed is not visible here. */
397 r_2_pix_alloc == NULL || g_2_pix_alloc == NULL || b_2_pix_alloc == NULL) {
398 fprintf (stderr, "Could not get enough memory in InitColorDither\n");
402 for (i = 0; i < 256; i++) {
405 if (gammaCorrectFlag) {
406 L_tab[i] = GAMMA_CORRECTION(i);
412 if (chromaCorrectFlag) {
414 CB = CHROMA_CORRECTION128(CB);
416 CR = CHROMA_CORRECTION128(CR);
424 Cr_r_tab[i] = (0.419 / 0.299) * CR;
425 Cr_g_tab[i] = -(0.299 / 0.419) * CR;
426 Cb_g_tab[i] = -(0.114 / 0.331) * CB;
427 Cb_b_tab[i] = (0.587 / 0.331) * CB;
432 * Set up entries 0-255 in rgb-to-pixel value tables.
/*
 * Each 8-bit component is reduced to the number of bits set in its mask and
 * then shifted up to the position of the mask's lowest set bit.
 */
434 for (i = 0; i < 256; i++) {
435 r_2_pix_alloc[i + 256] = i >> (8 - number_of_bits_set (red_mask));
436 r_2_pix_alloc[i + 256] <<= free_bits_at_bottom (red_mask);
437 g_2_pix_alloc[i + 256] = i >> (8 - number_of_bits_set (green_mask));
438 g_2_pix_alloc[i + 256] <<= free_bits_at_bottom (green_mask);
439 b_2_pix_alloc[i + 256] = i >> (8 - number_of_bits_set (blue_mask));
440 b_2_pix_alloc[i + 256] <<= free_bits_at_bottom (blue_mask);
442 * If we have 16-bit output depth, then we double the value
443 * in the top word. This means that we can write out both
444 * pixels in the pixel doubling mode with one op. It is
445 * harmless in the normal case as storing a 32-bit value
446 * through a short pointer will lose the top bits anyway.
447 * A similar optimisation for Alpha for 64 bit has been
448 * prepared for, but is not yet implemented.
450 if (!(depth == 32) && !(depth == 24)) {
452 r_2_pix_alloc[i + 256] |= (r_2_pix_alloc[i + 256]) << 16;
453 g_2_pix_alloc[i + 256] |= (g_2_pix_alloc[i + 256]) << 16;
454 b_2_pix_alloc[i + 256] |= (b_2_pix_alloc[i + 256]) << 16;
/*
 * NOTE(review): shifting a long by 32 is undefined where long is 32 bits
 * wide; presumably these three lines sit behind a 64-bit guard that is not
 * visible here - confirm.
 */
460 r_2_pix_alloc[i + 256] |= (r_2_pix_alloc[i + 256]) << 32;
461 g_2_pix_alloc[i + 256] |= (g_2_pix_alloc[i + 256]) << 32;
462 b_2_pix_alloc[i + 256] |= (b_2_pix_alloc[i + 256]) << 32;
469 * Spread out the values we have to the rest of the array so that
470 * we do not need to check for overflow.
/* Entries below 256 repeat the lowest value, entries from 512 the highest. */
472 for (i = 0; i < 256; i++) {
473 r_2_pix_alloc[i] = r_2_pix_alloc[256];
474 r_2_pix_alloc[i + 512] = r_2_pix_alloc[511];
475 g_2_pix_alloc[i] = g_2_pix_alloc[256];
476 g_2_pix_alloc[i + 512] = g_2_pix_alloc[511];
477 b_2_pix_alloc[i] = b_2_pix_alloc[256];
478 b_2_pix_alloc[i + 512] = b_2_pix_alloc[511];
/* Published pointers are offset so that indices -256..511 are all valid. */
481 space->r_2_pix = r_2_pix_alloc + 256;
482 space->g_2_pix = g_2_pix_alloc + 256;
483 space->b_2_pix = b_2_pix_alloc + 256;
488 gst_colorspace_yuv_to_rgb32 (GstColorspace * space,
490 unsigned char *Y, unsigned char *U, unsigned char *V, int width, int height)
496 src_rowstride = ROUND_UP_4 (space->width);
497 dest_rowstride = width * 4;
498 for (y = 0; y < height; y++) {
499 for (x = 0; x < width; x++) {
501 dest[x * 3 + 1] = CLAMP (Y[x] + V_r_tab[V[x / 2]], 0, 255);
503 CLAMP (Y[x] + U_g_tab[U[x / 2]] + V_g_tab[V[x / 2]], 0, 255);
504 dest[x * 3 + 3] = CLAMP (Y[x] + U_b_tab[U[x / 2]], 0, 255);
507 dest += dest_rowstride;
509 U += src_rowstride / 2;
510 V += src_rowstride / 2;
516 gst_colorspace_yuv_to_rgb24 (GstColorspace * space,
518 unsigned char *Y, unsigned char *U, unsigned char *V, int width, int height)
524 src_rowstride = ROUND_UP_4 (space->width);
525 dest_rowstride = ROUND_UP_4 (width * 3);
526 for (y = 0; y < height; y++) {
527 for (x = 0; x < width; x++) {
528 dest[x * 3 + 0] = CLAMP (Y[x] + V_r_tab[V[x / 2]], 0, 255);
530 CLAMP (Y[x] + U_g_tab[U[x / 2]] + V_g_tab[V[x / 2]], 0, 255);
531 dest[x * 3 + 2] = CLAMP (Y[x] + U_b_tab[U[x / 2]], 0, 255);
534 dest += dest_rowstride;
536 U += src_rowstride / 2;
537 V += src_rowstride / 2;
543 gst_colorspace_yuv_to_rgb16 (GstColorspace * space,
545 unsigned char *Y, unsigned char *U, unsigned char *V, int width, int height)
552 src_rowstride = ROUND_UP_4 (space->width);
553 dest_rowstride = ROUND_UP_4 (width * 2);
554 for (y = 0; y < height; y++) {
555 for (x = 0; x < width; x++) {
556 r = CLAMP (Y[x] + V_r_tab[V[x / 2]], 0, 255);
557 g = CLAMP (Y[x] + U_g_tab[U[x / 2]] + V_g_tab[V[x / 2]], 0, 255);
558 b = CLAMP (Y[x] + U_b_tab[U[x / 2]], 0, 255);
559 *(unsigned short *) (dest + x * 2) =
560 ((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3);
563 dest += dest_rowstride;
565 U += src_rowstride / 2;
566 V += src_rowstride / 2;
/* 128 in each of the four 16-bit lanes; subtracted from the unpacked chroma
 * words to centre them on zero. */
572 static mmx_t MMX_80w = (mmx_t) (long long) 0x0080008000800080LL; /* dd 000800080h, 000800080h */
/* Masks selecting the low byte (00FF) or the high byte (FF00) of every
 * 16-bit lane; used to split the luma bytes into even and odd samples. */
574 static mmx_t MMX_00FFw = (mmx_t) (long long) 0x00ff00ff00ff00ffLL; /* dd 000FF00FFh, 000FF00FFh */
575 static mmx_t MMX_FF00w = (mmx_t) (long long) 0xff00ff00ff00ff00LL; /* dd 0FF00FF00h, 0FF00FF00h */
/* Chroma coefficients in 1/64 units, one per 16-bit lane (products are
 * shifted right by 6 after the multiply): 0x0059 = 89, 0x0072 = 114,
 * 0xffea = -22, 0xffd2 = -46. */
577 static mmx_t MMX32_Vredcoeff = (mmx_t) (long long) 0x0059005900590059LL;
578 static mmx_t MMX32_Ubluecoeff = (mmx_t) (long long) 0x0072007200720072LL;
579 static mmx_t MMX32_Ugrncoeff = (mmx_t) (long long) 0xffeaffeaffeaffeaLL;
580 static mmx_t MMX32_Vgrncoeff = (mmx_t) (long long) 0xffd2ffd2ffd2ffd2LL;
583 gst_colorspace_yuv_to_bgr32_mmx (tables, lum, cr, cb, out, rows, cols)
584 GstColorspaceYUVTables *tables;
592 guint32 *row1 = (guint32 *) out; /* 32 bit target */
593 int cols4 = cols >> 2;
597 for (y = rows >> 1; y; y--) {
598 for (x = cols4; x; x--) {
600 /* create Cr (result in mm1) */
601 movd_m2r (*(mmx_t *) cb, mm1); /* 0 0 0 0 v3 v2 v1 v0 */
602 pxor_r2r (mm7, mm7); /* 00 00 00 00 00 00 00 00 */
603 movd_m2r (*(mmx_t *) lum, mm2); /* 0 0 0 0 l3 l2 l1 l0 */
604 punpcklbw_r2r (mm7, mm1); /* 0 v3 0 v2 00 v1 00 v0 */
605 punpckldq_r2r (mm1, mm1); /* 00 v1 00 v0 00 v1 00 v0 */
606 psubw_m2r (MMX_80w, mm1); /* mm1-128:r1 r1 r0 r0 r1 r1 r0 r0 */
608 /* create Cr_g (result in mm0) */
609 movq_r2r (mm1, mm0); /* r1 r1 r0 r0 r1 r1 r0 r0 */
610 pmullw_m2r (MMX32_Vgrncoeff, mm0); /* red*-46dec=0.7136*64 */
611 pmullw_m2r (MMX32_Vredcoeff, mm1); /* red*89dec=1.4013*64 */
612 psraw_i2r (6, mm0); /* red=red/64 */
613 psraw_i2r (6, mm1); /* red=red/64 */
615 /* create L1 L2 (result in mm2,mm4) */
617 movq_m2r (*(mmx_t *) (lum + cols), mm3); /* 0 0 0 0 L3 L2 L1 L0 */
618 punpckldq_r2r (mm3, mm2); /* L3 L2 L1 L0 l3 l2 l1 l0 */
619 movq_r2r (mm2, mm4); /* L3 L2 L1 L0 l3 l2 l1 l0 */
620 pand_m2r (MMX_FF00w, mm2); /* L3 0 L1 0 l3 0 l1 0 */
621 pand_m2r (MMX_00FFw, mm4); /* 0 L2 0 L0 0 l2 0 l0 */
622 psrlw_i2r (8, mm2); /* 0 L3 0 L1 0 l3 0 l1 */
624 /* create R (result in mm6) */
625 movq_r2r (mm2, mm5); /* 0 L3 0 L1 0 l3 0 l1 */
626 movq_r2r (mm4, mm6); /* 0 L2 0 L0 0 l2 0 l0 */
627 paddsw_r2r (mm1, mm5); /* lum1+red:x R3 x R1 x r3 x r1 */
628 paddsw_r2r (mm1, mm6); /* lum1+red:x R2 x R0 x r2 x r0 */
629 packuswb_r2r (mm5, mm5); /* R3 R1 r3 r1 R3 R1 r3 r1 */
630 packuswb_r2r (mm6, mm6); /* R2 R0 r2 r0 R2 R0 r2 r0 */
631 pxor_r2r (mm7, mm7); /* 00 00 00 00 00 00 00 00 */
632 punpcklbw_r2r (mm5, mm6); /* R3 R2 R1 R0 r3 r2 r1 r0 */
634 /* create Cb (result in mm1) */
635 movd_m2r (*(mmx_t *) cr, mm1); /* 0 0 0 0 u3 u2 u1 u0 */
636 punpcklbw_r2r (mm7, mm1); /* 0 u3 0 u2 00 u1 00 u0 */
637 punpckldq_r2r (mm1, mm1); /* 00 u1 00 u0 00 u1 00 u0 */
638 psubw_m2r (MMX_80w, mm1); /* mm1-128:u1 u1 u0 u0 u1 u1 u0 u0 */
639 /* create Cb_g (result in mm5) */
640 movq_r2r (mm1, mm5); /* u1 u1 u0 u0 u1 u1 u0 u0 */
641 pmullw_m2r (MMX32_Ugrncoeff, mm5); /* blue*-109dec=1.7129*64 */
642 pmullw_m2r (MMX32_Ubluecoeff, mm1); /* blue*114dec=1.78125*64 */
643 psraw_i2r (6, mm5); /* blue=red/64 */
644 psraw_i2r (6, mm1); /* blue=blue/64 */
646 /* create G (result in mm7) */
647 movq_r2r (mm2, mm3); /* 0 L3 0 L1 0 l3 0 l1 */
648 movq_r2r (mm4, mm7); /* 0 L2 0 L0 0 l2 0 l1 */
649 paddsw_r2r (mm5, mm3); /* lum1+Cb_g:x G3t x G1t x g3t x g1t */
650 paddsw_r2r (mm5, mm7); /* lum1+Cb_g:x G2t x G0t x g2t x g0t */
651 paddsw_r2r (mm0, mm3); /* lum1+Cr_g:x G3 x G1 x g3 x g1 */
652 paddsw_r2r (mm0, mm7); /* lum1+blue:x G2 x G0 x g2 x g0 */
653 packuswb_r2r (mm3, mm3); /* G3 G1 g3 g1 G3 G1 g3 g1 */
654 packuswb_r2r (mm7, mm7); /* G2 G0 g2 g0 G2 G0 g2 g0 */
655 punpcklbw_r2r (mm3, mm7); /* G3 G2 G1 G0 g3 g2 g1 g0 */
657 /* create B (result in mm5) */
658 movq_r2r (mm2, mm3); /* 0 L3 0 L1 0 l3 0 l1 */
659 movq_r2r (mm4, mm5); /* 0 L2 0 L0 0 l2 0 l1 */
660 paddsw_r2r (mm1, mm3); /* lum1+blue:x B3 x B1 x b3 x b1 */
661 paddsw_r2r (mm1, mm5); /* lum1+blue:x B2 x B0 x b2 x b0 */
662 packuswb_r2r (mm3, mm3); /* B3 B1 b3 b1 B3 B1 b3 b1 */
663 packuswb_r2r (mm5, mm5); /* B2 B0 b2 b0 B2 B0 b2 b0 */
664 punpcklbw_r2r (mm3, mm5); /* B3 B2 B1 B0 b3 b2 b1 b0 */
666 /* fill destination row1 (needed are mm6=Rr,mm7=Gg,mm5=Bb) */
668 pxor_r2r (mm2, mm2); /* 0 0 0 0 0 0 0 0 */
669 pxor_r2r (mm4, mm4); /* 0 0 0 0 0 0 0 0 */
670 movq_r2r (mm6, mm1); /* R3 R2 R1 R0 r3 r2 r1 r0 */
671 movq_r2r (mm5, mm3); /* B3 B2 B1 B0 b3 b2 b1 b0 */
672 /* process lower lum */
673 punpcklbw_r2r (mm4, mm1); /* 0 r3 0 r2 0 r1 0 r0 */
674 punpcklbw_r2r (mm4, mm3); /* 0 b3 0 b2 0 b1 0 b0 */
675 movq_r2r (mm1, mm2); /* 0 r3 0 r2 0 r1 0 r0 */
676 movq_r2r (mm3, mm0); /* 0 b3 0 b2 0 b1 0 b0 */
677 punpcklwd_r2r (mm1, mm3); /* 0 r1 0 b1 0 r0 0 b0 */
678 punpckhwd_r2r (mm2, mm0); /* 0 r3 0 b3 0 r2 0 b2 */
680 pxor_r2r (mm2, mm2); /* 0 0 0 0 0 0 0 0 */
681 movq_r2r (mm7, mm1); /* G3 G2 G1 G0 g3 g2 g1 g0 */
682 punpcklbw_r2r (mm1, mm2); /* g3 0 g2 0 g1 0 g0 0 */
683 punpcklwd_r2r (mm4, mm2); /* 0 0 g1 0 0 0 g0 0 */
684 por_r2r (mm3, mm2); /* 0 r1 g1 b1 0 r0 g0 b0 */
685 movq_r2m (mm2, *(mmx_t *) row1); /* wrote out ! row1 */
687 pxor_r2r (mm2, mm2); /* 0 0 0 0 0 0 0 0 */
688 punpcklbw_r2r (mm1, mm4); /* g3 0 g2 0 g1 0 g0 0 */
689 punpckhwd_r2r (mm2, mm4); /* 0 0 g3 0 0 0 g2 0 */
690 por_r2r (mm0, mm4); /* 0 r3 g3 b3 0 r2 g2 b2 */
691 movq_r2m (mm4, *(mmx_t *) (row1 + 2)); /* wrote out ! row1 */
693 /* fill destination row2 (needed are mm6=Rr,mm7=Gg,mm5=Bb) */
694 /* this can be done "destructive" */
695 pxor_r2r (mm2, mm2); /* 0 0 0 0 0 0 0 0 */
696 punpckhbw_r2r (mm2, mm6); /* 0 R3 0 R2 0 R1 0 R0 */
697 punpckhbw_r2r (mm1, mm5); /* G3 B3 G2 B2 G1 B1 G0 B0 */
698 movq_r2r (mm5, mm1); /* G3 B3 G2 B2 G1 B1 G0 B0 */
699 punpcklwd_r2r (mm6, mm1); /* 0 R1 G1 B1 0 R0 G0 B0 */
700 movq_r2m (mm1, *(mmx_t *) (row1 + cols)); /* wrote out ! row2 */
701 punpckhwd_r2r (mm6, mm5); /* 0 R3 G3 B3 0 R2 G2 B2 */
702 movq_r2m (mm5, *(mmx_t *) (row1 + cols + 2)); /* wrote out ! row2 */