From c61d6ae39e5039dcb27bf95334a86520b562bbc5 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Jos=C3=A9=20Fonseca?= <jrfonseca@tungstengraphics.com>
Date: Fri, 7 Sep 2007 10:52:01 +0100
Subject: [PATCH] MMX code path for YV12 copy, taken from the xserver glucose-2
 branch.

---
 pixman/pixman-mmx.c  | 1004 ++++++++++++++++++++++++++++++++++++++++++++++++++
 pixman/pixman-mmx.h  |   28 ++
 pixman/pixman-pict.c |   30 +-
 3 files changed, 1061 insertions(+), 1 deletion(-)

diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index 8c7be6d..4fb2579 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -30,6 +30,9 @@
  */
 #include <config.h>
 
+#include <stdlib.h>
+#include <limits.h>
+
 #ifdef USE_MMX
 
 #if defined(__amd64__) || defined(__x86_64__)
@@ -917,6 +920,188 @@ void fbComposeSetupMMX(void)
     } 
 }
 
+/* Weighted mix of four 8-bit samples.  The callers in this file pass
+   distx + idistx == 256 and disty + idisty == 256. */
+static __inline__ uint8_t
+interpolate_bilinear (int distx, int idistx, int disty, int idisty,
+		      uint8_t tl, uint8_t tr, uint8_t bl, uint8_t br)
+{
+    /* Horizontal pass: mix the left and right sample of each row. */
+    int top    = tl * idistx + tr * distx;
+    int bottom = bl * idistx + br * distx;
+
+    /* Vertical pass; >> 16 removes the scale of the two 8-bit weights. */
+    return (top * idisty + bottom * disty) >> 16;
+}
+
+/* Bilinearly blend four packed 32-bit pixels, one byte channel at a time.
+   l00/l01 are the rows holding the top-left/top-right taps and l10/l11
+   those holding the bottom-left/bottom-right taps; x00..x11 are the
+   byte offsets of the taps within their rows.  The result keeps the
+   channel byte order of the input pixels. */
+static __inline__ uint32_t
+interpolate_bilinear_8888 (int   distx,
+			   int   idistx,
+			   int   disty,
+			   int   idisty,
+			   uint8_t *l00,
+			   uint8_t *l01,
+			   uint8_t *l10,
+			   uint8_t *l11,
+			   int   x00,
+			   int   x01,
+			   int   x10,
+			   int   x11)
+{
+    /* A union lets the four result bytes be read back as one word without
+       casting a uint8_t array to uint32_t *, which would violate strict
+       aliasing and assume the array is 4-byte aligned. */
+    union { uint8_t c[4]; uint32_t pixel; } out;
+    int k;
+
+    for (k = 0; k < 4; k++)
+    {
+	out.c[k] = interpolate_bilinear (distx, idistx, disty, idisty,
+					 l00[x00 + k], l01[x01 + k],
+					 l10[x10 + k], l11[x11 + k]);
+    }
+
+    return out.pixel;
+}
+
+/* Blend a 2x2 block of 32-bit pixels taken from two scanlines.  The
+   pixel at byte offset x0 of l0 and its right-hand neighbour form the
+   top row; the pixel at byte offset x1 of l1 and its neighbour form the
+   bottom row. */
+static __inline__ uint32_t
+fetch_bilinear2_8888 (int   distx,
+		      int   idistx,
+		      int   disty,
+		      int   idisty,
+		      uint8_t *l0,
+		      uint8_t *l1,
+		      int   x0,
+		      int   x1)
+{
+    /* The right-hand neighbours sit one 4-byte pixel further along
+       their scanline. */
+    int x0Right = x0 + 4;
+    int x1Right = x1 + 4;
+
+    return interpolate_bilinear_8888 (distx, idistx, disty, idisty,
+				      l0, l0, l1, l1,
+				      x0, x0Right, x1, x1Right);
+}
+
+/* fetch_bilinear2_8888 with the same byte offset x in both scanlines. */
+static __inline__ uint32_t
+fetch_bilinear_8888 (int distx, int idistx, int disty, int idisty,
+		     uint8_t *l0, uint8_t *l1, int x)
+{
+    uint32_t pixel;
+
+    pixel = fetch_bilinear2_8888 (distx, idistx, disty, idisty,
+				  l0, l1, x, x);
+    return pixel;
+}
+
+static uint32_t _zero32x2[2] = { 0x0, 0x0 }; /* eight zero bytes with uint32_t alignment */
+static uint8_t  *_zero8x8 = (uint8_t *) _zero32x2; /* byte view; set_scale_steps hands it out as the row just outside the source */
+
+/* Select the two source rows that are blended to produce scaled row
+   `line`, together with the x start position and step used to walk
+   along each of them.
+
+   src, srcStride  first source pixel and row stride, in uint32_t units
+   xStart, xStep   start position and step handed back for real rows
+   width           value returned when there is something to draw
+   line            source row of the upper neighbour; may be outside
+   lastLine        last source row (rows at or past it have no row below)
+   repeatType      PIXMAN_REPEAT_PAD replicates the edge rows; any other
+                   value substitutes a shared row of zeros just outside
+                   the image
+   s0, s1          out: upper and lower row
+   x0, x0Step      out: start and step for *s0 (both 0 for the zero row)
+   x1, x1Step      out: start and step for *s1 (both 0 for the zero row)
+
+   Returns width, or 0 with the outputs untouched when padding is not
+   requested and line < -1 or line > lastLine. */
+static __inline__ int
+set_scale_steps (uint32_t   *src,
+		 int srcStride,
+		 int	  xStart,
+		 int	  xStep,
+		 int	  width,
+		 int	  line,
+		 int	  lastLine,
+		 pixman_repeat_t repeatType,
+		 uint8_t	  **s0,
+		 uint8_t	  **s1,
+		 int	  *x0,
+		 int	  *x0Step,
+		 int	  *x1,
+		 int	  *x1Step)
+{
+    uint8_t *upper;
+    uint8_t *lower;
+    int     upperStart = xStart;
+    int     upperStep  = xStep;
+    int     lowerStart = xStart;
+    int     lowerStep  = xStep;
+
+    if (line >= 0 && line < lastLine)
+    {
+	/* Inside the image: the row itself and the one below it. */
+	upper = (uint8_t *) (src + srcStride * line);
+	lower = (uint8_t *) (src + srcStride * (line + 1));
+    }
+    else if (repeatType == PIXMAN_REPEAT_PAD)
+    {
+	/* Outside with padding: both rows are the nearest edge row. */
+	if (line < 0)
+	    upper = (uint8_t *) src;
+	else
+	    upper = (uint8_t *) (src + srcStride * lastLine);
+
+	lower = upper;
+    }
+    else if (line == -1)
+    {
+	/* One row above the image: the zero row on top of the first
+	   source row. */
+	upper      = _zero8x8;
+	upperStart = 0;
+	upperStep  = 0;
+
+	lower = (uint8_t *) src;
+    }
+    else if (line >= 0 && line == lastLine)
+    {
+	/* Last source row: it has no row below, so the zero row takes
+	   that place. */
+	upper = (uint8_t *) (src + srcStride * line);
+
+	lower      = _zero8x8;
+	lowerStart = 0;
+	lowerStep  = 0;
+    }
+    else
+    {
+	/* Further outside without padding: nothing to sample. */
+	return 0;
+    }
+
+    *s0 = upper;
+    *s1 = lower;
+
+    *x0     = upperStart;
+    *x0Step = upperStep;
+    *x1     = lowerStart;
+    *x1Step = lowerStep;
+
+    return width;
+}
+
 
 /* ------------------ MMX code paths called from fbpict.c ----------------------- */
 
@@ -2970,6 +3155,825 @@ fbCompositeOver_x888x8x8888mmx (pixman_op_t      op,
     _mm_empty();
 }
 
+typedef struct _ScanlineBuf {
+    pixman_bool_t lock[2]; /* slot is claimed until release_scanlines clears it */
+    int    y[2]; /* source row held by each slot; SHRT_MAX after init */
+    uint8_t *line[2]; /* pixel storage of each slot */
+    int   height; /* row count used to clamp requested rows */
+    uint8_t *heap; /* malloc'ed storage freed by fini_scanline_buffer, or NULL */
+} ScanlineBuf;
+
+/* Set up a cache of two scanlines of `length` bytes each.  The caller's
+   buffer (of `size` bytes) is used when both fit; otherwise the storage
+   comes from malloc and is released by fini_scanline_buffer.  Returns
+   FALSE, leaving the slots uninitialized, if that allocation fails. */
+static pixman_bool_t
+init_scanline_buffer (ScanlineBuf *slb,
+		      uint8_t	  *buffer,
+		      int	  size,
+		      int	  length,
+		      int	  height)
+{
+    int needed = length << 1; /* room for two scanlines */
+    int slot;
+
+    slb->heap = NULL;
+
+    if (size < needed)
+    {
+	/* The caller's buffer is too small: fall back to the heap. */
+	slb->heap = malloc (needed);
+	if (!slb->heap)
+	    return FALSE;
+
+	buffer = slb->heap;
+    }
+
+    for (slot = 0; slot < 2; slot++)
+    {
+	slb->lock[slot] = FALSE;
+	slb->y[slot]    = SHRT_MAX; /* marks the slot as holding no row yet */
+	slb->line[slot] = buffer + slot * length;
+    }
+
+    slb->height = height;
+
+    return TRUE;
+}
+
+/* Release the heap storage, if any, taken by init_scanline_buffer. */
+static void
+fini_scanline_buffer (ScanlineBuf *slb)
+{
+    free (slb->heap); /* free (NULL) is a no-op, so no guard is needed */
+}
+
+/* Make both cache slots claimable again for the next output row. */
+static __inline__ void
+release_scanlines (ScanlineBuf *slb)
+{
+    /* Only the lock flags change; y[] and the pixel data stay as is. */
+    slb->lock[0] = FALSE;
+    slb->lock[1] = FALSE;
+}
+
+/* Clamp row y to the range 0 .. slb->height - 1. */
+static __inline__ int
+_y_to_scanline (ScanlineBuf *slb, int y)
+{
+    return (y < 0) ? 0 : (y < slb->height) ? y : slb->height - 1;
+}
+
+/* Look up source row y (clamped to the image) in the two-slot cache.
+   A hit claims the slot and returns its pixels; a miss returns NULL and
+   leaves the cache unchanged. */
+static __inline__ uint8_t *
+get_scanline (ScanlineBuf *slb,
+	      int	  y)
+{
+    int slot = 0;
+
+    y = _y_to_scanline (slb, y);
+    while (slot < 2 && slb->y[slot] != y)
+	slot++;
+
+    if (slot == 2)
+	return NULL;
+
+    slb->lock[slot] = TRUE;
+    return slb->line[slot];
+}
+
+typedef struct {
+    ullong subYw; /* subtracted from the eight Y bytes (psubusb) */
+    ullong U_green; /* U multiplier of the green chroma term */
+    ullong U_blue; /* U multiplier of the blue chroma term */
+    ullong V_red; /* V multiplier of the red chroma term */
+    ullong V_green; /* V multiplier of the green chroma term */
+    ullong Y_coeff; /* luma multiplier shared by R, G and B */
+    ullong mmx0080; /* subtracted from the widened U and V words */
+    ullong mmx00ff; /* mask keeping the even-numbered Y bytes */
+} YUVData;
+
+static const YUVData yuv = { /* word multipliers act on samples shifted left by 3 and keep the high 16 bits (pmulhw), i.e. value / 8192 */
+    .subYw   = 0x1010101010101010ULL, /* 16 in every byte */
+    .U_green = 0xf377f377f377f377ULL, /* -3209 / 8192, about -0.391 */
+    .U_blue  = 0x408d408d408d408dULL, /* 16525 / 8192, about 2.018 */
+    .V_red   = 0x3313331333133313ULL, /* 13075 / 8192, about 1.596 */
+    .V_green = 0xe5fce5fce5fce5fcULL, /* -6660 / 8192, about -0.813 */
+    .Y_coeff = 0x2543254325432543ULL, /* 9539 / 8192, about 1.164 */
+    .mmx0080 = 0x0080008000800080ULL, /* 128 in every word */
+    .mmx00ff = 0x00ff00ff00ff00ffULL /* low byte of every word */
+};
+
+static __inline__ void /* converts 8 Y, 4 U and 4 V samples; leaves B in mm0, R in mm1 and G in mm2 */
+mmx_loadyv12 (uint8_t *py,
+	      uint8_t *pu,
+	      uint8_t *pv)
+{
+    __asm__ __volatile__ (
+	"movq      %0,    %%mm6\n" /* mm6 = Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
+	"pxor      %%mm4, %%mm4\n" /* mm4 = 0                       */
+	"psubusb   %1,    %%mm6\n" /* Y -= 16                       */
+	"movd      %2,    %%mm0\n" /* mm0 = 00 00 00 00 U3 U2 U1 U0 */
+	"movq      %%mm6, %%mm7\n" /* mm7 = Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
+	"pand      %3,    %%mm6\n" /* mm6 =    Y6    Y4    Y2    Y0 */
+	"psrlw     %4,    %%mm7\n" /* mm7 =    Y7    Y5    Y3    Y1 */
+	"movd      %5,    %%mm1\n" /* mm1 = 00 00 00 00 V3 V2 V1 V0 */
+	"psllw     %6,    %%mm6\n" /* promote precision             */
+	"pmulhw    %7,    %%mm6\n" /* mm6 = luma_rgb even           */
+	"psllw     %8,    %%mm7\n" /* promote precision             */
+	"punpcklbw %%mm4, %%mm0\n" /* mm0 = U3 U2 U1 U0             */
+	"psubsw    %9,    %%mm0\n" /* U -= 128                      */
+	"punpcklbw %%mm4, %%mm1\n" /* mm1 = V3 V2 V1 V0             */
+	"pmulhw    %10,   %%mm7\n" /* mm7 = luma_rgb odd            */
+	"psllw     %11,   %%mm0\n" /* promote precision             */
+	"psubsw    %12,   %%mm1\n" /* V -= 128                      */
+	"movq      %%mm0, %%mm2\n" /* mm2 = U3 U2 U1 U0             */
+	"psllw     %13,   %%mm1\n" /* promote precision             */
+	"movq      %%mm1, %%mm4\n" /* mm4 = V3 V2 V1 V0             */
+	"pmulhw    %14,   %%mm0\n" /* mm0 = chroma_b                */
+	"pmulhw    %15,   %%mm1\n" /* mm1 = chroma_r                */
+	"movq      %%mm0, %%mm3\n" /* mm3 = chroma_b                */
+	"paddsw    %%mm6, %%mm0\n" /* mm0 = B6 B4 B2 B0             */
+	"paddsw    %%mm7, %%mm3\n" /* mm3 = B7 B5 B3 B1             */
+	"packuswb  %%mm0, %%mm0\n" /* saturate to 0-255             */
+	"pmulhw    %16,   %%mm2\n" /* mm2 = U * U_green             */
+	"packuswb  %%mm3, %%mm3\n" /* saturate to 0-255             */
+	"punpcklbw %%mm3, %%mm0\n" /* mm0 = B7 B6 B5 B4 B3 B2 B1 B0 */
+	"pmulhw    %17,   %%mm4\n" /* mm4 = V * V_green             */
+	"paddsw    %%mm4, %%mm2\n" /* mm2 = chroma_g                */
+	"movq      %%mm2, %%mm5\n" /* mm5 = chroma_g                */
+	"movq      %%mm1, %%mm4\n" /* mm4 = chroma_r                */
+	"paddsw    %%mm6, %%mm2\n" /* mm2 = G6 G4 G2 G0             */
+	"packuswb  %%mm2, %%mm2\n" /* saturate to 0-255             */
+	"paddsw    %%mm6, %%mm1\n" /* mm1 = R6 R4 R2 R0             */
+	"packuswb  %%mm1, %%mm1\n" /* saturate to 0-255             */
+	"paddsw    %%mm7, %%mm4\n" /* mm4 = R7 R5 R3 R1             */
+	"packuswb  %%mm4, %%mm4\n" /* saturate to 0-255             */
+	"paddsw    %%mm7, %%mm5\n" /* mm5 = G7 G5 G3 G1             */
+	"packuswb  %%mm5, %%mm5\n" /* saturate to 0-255             */
+	"punpcklbw %%mm4, %%mm1\n" /* mm1 = R7 R6 R5 R4 R3 R2 R1 R0 */
+	"punpcklbw %%mm5, %%mm2\n" /* mm2 = G7 G6 G5 G4 G3 G2 G1 G0 */
+	: /* no outputs: the result stays in MMX registers for mmx_pack8888 */
+	: "m" (*py), "m" (yuv.subYw), "m" (*pu), "m" (yuv.mmx00ff), /* %0 - %3 */
+	  "i" (8), "m" (*pv), "i" (3), "m" (yuv.Y_coeff), /* %4 - %7 */
+	  "i" (3), "m" (yuv.mmx0080), "m" (yuv.Y_coeff), "i" (3), /* %8 - %11 */
+	  "m" (yuv.mmx0080), "i" (3), "m" (yuv.U_blue), "m" (yuv.V_red), /* %12 - %15 */
+	  "m" (yuv.U_green), "m" (yuv.V_green)); /* %16, %17.  NOTE(review): mm0-mm7 are written but no clobbers are listed */
+}
+
+/* Store the 8 pixels held in mm0 (B), mm1 (R), mm2 (G) as opaque 8888. */
+static __inline__ void
+mmx_pack8888 (uint8_t *image)
+{
+    __asm__ __volatile__ (
+	"pcmpeqb   %%mm3, %%mm3\n" /* mm3 = ff x 8: alpha, as in loadyuv */
+	"movq      %%mm0, %%mm6\n" /* mm6 = B7 .. B0                     */
+	"punpcklbw %%mm2, %%mm6\n" /* mm6 = G3 B3 G2 B2 G1 B1 G0 B0      */
+	"movq      %%mm1, %%mm7\n" /* mm7 = R7 .. R0                     */
+	"punpcklbw %%mm3, %%mm7\n" /* mm7 = A3 R3 A2 R2 A1 R1 A0 R0      */
+	"movq      %%mm0, %%mm4\n" /* mm4 = B7 .. B0                     */
+	"punpcklwd %%mm7, %%mm6\n" /* mm6 = A1 R1 G1 B1 A0 R0 G0 B0      */
+	"movq      %%mm1, %%mm5\n" /* mm5 = R7 .. R0                     */
+	"movq      %%mm6, (%0)\n" /* pixels 0 and 1                     */
+	"movq      %%mm0, %%mm6\n" /* mm6 = B7 .. B0                     */
+	"punpcklbw %%mm2, %%mm6\n" /* mm6 = G3 B3 G2 B2 G1 B1 G0 B0      */
+	"punpckhwd %%mm7, %%mm6\n" /* mm6 = A3 R3 G3 B3 A2 R2 G2 B2      */
+	"movq      %%mm6, 8(%0)\n" /* pixels 2 and 3                    */
+	"punpckhbw %%mm2, %%mm4\n" /* mm4 = G7 B7 G6 B6 G5 B5 G4 B4      */
+	"punpckhbw %%mm3, %%mm5\n" /* mm5 = A7 R7 A6 R6 A5 R5 A4 R4      */
+	"punpcklwd %%mm5, %%mm4\n" /* mm4 = A5 R5 G5 B5 A4 R4 G4 B4      */
+	"movq      %%mm4, 16(%0)\n" /* pixels 4 and 5                   */
+	"movq      %%mm0, %%mm4\n" /* mm4 = B7 .. B0                     */
+	"punpckhbw %%mm2, %%mm4\n" /* mm4 = G7 B7 G6 B6 G5 B5 G4 B4      */
+	"punpckhwd %%mm5, %%mm4\n" /* mm4 = A7 R7 G7 B7 A6 R6 G6 B6      */
+	"movq      %%mm4, 24(%0)\n" /* pixels 6 and 7                   */
+	: /* no outputs */ : "r" (image) : "memory" /* 32 bytes stored via %0 */);
+}
+
+/* One Y, U, V sample to 0xffRRGGBB; 16.16 fixed point, saturating. */
+static __inline__ uint32_t
+loadyuv (uint8_t *py,
+	 uint8_t *pu,
+	 uint8_t *pv)
+{
+    int16_t  luma = *py - 16;
+    int16_t  cb   = *pu - 128;
+    int16_t  cr   = *pv - 128;
+    int32_t  scaledY = 0x012b27 * luma;	/* 1.164(Y - 16), shared term */
+    int32_t  chan[3];
+    uint32_t pixel = 0xff000000;
+    int      k;
+
+    chan[0] = scaledY + 0x019a2e * cr;			/* R: + 1.596(V - 128) */
+    chan[1] = scaledY - 0x00d0f2 * cr - 0x00647e * cb;	/* G: - 0.813 V' - 0.391 U' */
+    chan[2] = scaledY + 0x0206a2 * cb;			/* B: + 2.018(U - 128) */
+    for (k = 0; k < 3; k++)
+    {
+	int32_t c = chan[k] < 0 ? 0 : chan[k] > 0xffffff ? 0xffffff : chan[k];
+	pixel |= (c >> (8 * k)) & (0xff0000 >> (8 * k));
+    }
+    return pixel;
+}
+
+/* Convert source row y (clamped to the image) of a YV12 image to 8888
+   pixels in a free slot of slb and return that slot's buffer.  srcY,
+   srcU and srcV point at row 0 of each plane; a chroma sample covers
+   two columns and two rows.  x is unused: the row is always converted
+   from column 0 for `width` pixels. */
+static __inline__ uint8_t *
+loadyv12_scanline (ScanlineBuf *slb,
+		   int	       y,
+		   uint8_t       *srcY,
+		   int	       yStride,
+		   uint8_t       *srcU,
+		   uint8_t       *srcV,
+		   int	       uvStride,
+		   int	       x,
+		   int	       width)
+{
+    uint8_t *py, *pu, *pv, *pd;
+    int   i, w, col = 0;
+
+    y = _y_to_scanline (slb, y);
+
+    /* Take the first free slot without ever indexing past slot 1. */
+    i = slb->lock[0] ? 1 : 0;
+    slb->y[i]    = y;
+    slb->lock[i] = TRUE;
+
+    py = srcY + yStride  * (y >> 0);
+    pu = srcU + uvStride * (y >> 1);
+    pv = srcV + uvStride * (y >> 1);
+    pd = slb->line[i];
+    w  = width;
+
+    /* Scalar pixels until py is 8-byte aligned on an even column, so
+       that the MMX loop starts on a whole chroma pair. */
+    while (w && (((unsigned long) py & 7) || (col & 1)))
+    {
+	*((uint32_t *) pd) = loadyuv (py, pu, pv);
+
+	pd += 4;
+	py += 1;
+
+	/* One chroma sample spans two columns: step after the odd one. */
+	pu += col & 1;
+	pv += col & 1;
+	col++;
+	w--;
+    }
+
+    while (w >= 8)
+    {
+	mmx_loadyv12 (py, pu, pv);
+	mmx_pack8888 (pd);
+
+	py += 8;
+	pu += 4;
+	pv += 4;
+	pd += 32;
+	col += 8;
+	w -= 8;
+    }
+
+    while (w)
+    {
+	*((uint32_t *) pd) = loadyuv (py, pu, pv);
+
+	pd += 4;
+	py += 1;
+
+	pu += col & 1;
+	pv += col & 1;
+	col++;
+	w--;
+    }
+
+    return slb->line[i];
+}
+
+/* Convert source row y (clamped to the image) of a packed YUY2 image
+   to 8888 pixels in a free slot of slb and return that slot's buffer.
+   src points at row 0.  x is unused: the row is always converted from
+   column 0 for `width` pixels. */
+static __inline__ uint8_t *
+loadyuy2_scanline (ScanlineBuf *slb,
+		   int	       y,
+		   uint8_t       *src,
+		   int	       stride,
+		   int	       x,
+		   int	       width)
+{
+    uint8_t *py, *pu, *pv, *pd;
+    int   i, w, col = 0;
+
+    y = _y_to_scanline (slb, y);
+
+    /* Take the first free slot without ever indexing past slot 1. */
+    i = slb->lock[0] ? 1 : 0;
+    slb->y[i]    = y;
+    slb->lock[i] = TRUE;
+
+    py = src + stride * (y >> 0);
+    pu = py + 1;
+    pv = py + 3;
+    pd = slb->line[i];
+    w  = width;
+
+    while (w)
+    {
+	*((uint32_t *) pd) = loadyuv (py, pu, pv);
+
+	pd += 4;
+	py += 2;
+
+	/* Y0 U Y1 V: U and V move on only after the odd pixel of a pair. */
+	pu += (col & 1) * 4;
+	pv += (col & 1) * 4;
+	col++;
+	w--;
+    }
+
+    return slb->line[i];
+}
+
+/* Composite a YV12 source onto an 8888 destination.  op and the mask
+   arguments are unused.  Supports no transform, a vertical flip and
+   axis-aligned scaling; outside the source the edge pixels repeat.
+   TODO: MMX code for bilinear interpolation */
+void
+fbCompositeSrc_yv12x8888mmx (pixman_op_t      op,
+			     pixman_image_t * pSrc,
+			     pixman_image_t * pMask,
+			     pixman_image_t * pDst,
+			     int16_t      xSrc,
+			     int16_t      ySrc,
+			     int16_t      xMask,
+			     int16_t      yMask,
+			     int16_t      xDst,
+			     int16_t      yDst,
+			     uint16_t     width,
+			     uint16_t     height)
+{
+    pixman_transform_t *transform = pSrc->common.transform;
+    uint8_t	  *dst, *srcY, *srcU, *srcV;
+    uint32_t	  *srcBits = pSrc->bits.bits;
+    int		  srcStride, uvStride;
+    uint32_t	  *dstBits = pDst->bits.bits;
+    int		  dstStride;
+    int		  offset, w, col, row;
+    uint8_t	  *pd;
+
+    dst = (uint8_t *) dstBits;
+    dstStride = pDst->bits.rowstride * sizeof (uint32_t);
+    srcY = (uint8_t *) srcBits;
+    srcStride = pSrc->bits.rowstride;
+
+    if (srcStride < 0)
+    {
+	offset = ((-srcStride) >> 1) * ((pSrc->bits.height - 1) >> 1) -
+	    srcStride;
+	srcV = (uint8_t *) (srcBits + offset);
+	offset += ((-srcStride) >> 1) * ((pSrc->bits.height) >> 1);
+	srcU = (uint8_t *) (srcBits + offset);
+    }
+    else
+    {
+	offset = srcStride * pSrc->bits.height;
+
+	srcV = (uint8_t *) (srcBits + offset);
+	srcU = (uint8_t *) (srcBits + offset + (offset >> 2));
+    }
+
+    srcStride *= sizeof (uint32_t);
+    uvStride = srcStride >> 1;
+
+    if (transform)
+    {
+	/* transformation is a Y coordinate flip, this is achieved by
+	   moving start offsets for each plane and changing sign of stride */
+	if (transform->matrix[0][0] == (1 << 16)  &&
+	    transform->matrix[1][1] == -(1 << 16) &&
+	    transform->matrix[0][2] == 0          &&
+	    transform->matrix[1][2] == (pSrc->bits.height << 16))
+	{
+	    srcY = srcY + ((pSrc->bits.height >> 0) - 1) * srcStride;
+	    srcU = srcU + ((pSrc->bits.height >> 1) - 1) * uvStride;
+	    srcV = srcV + ((pSrc->bits.height >> 1) - 1) * uvStride;
+
+	    srcStride = -srcStride;
+	    uvStride  = -uvStride;
+
+	    transform = 0;
+	}
+    }
+
+    dst += dstStride * yDst + (xDst << 2);
+
+    if (transform)
+    {
+	ScanlineBuf slb;
+	uint8_t	    _scanline_buf[8192];
+	uint8_t	    *ps0, *ps1;
+	int	    x, x0, y, line, xStep, yStep;
+	int         distx, idistx, disty, idisty;
+	int	    srcEnd = pSrc->bits.width << 16;
+	int	    srcEndIndex = (pSrc->bits.width - 1) << 16;
+
+	xStep = transform->matrix[0][0];
+	yStep = transform->matrix[1][1];
+
+	x0 = transform->matrix[0][2] + xStep * xSrc;
+	y  = transform->matrix[1][2] + yStep * ySrc;
+
+	if (!init_scanline_buffer (&slb,
+				   _scanline_buf, sizeof (_scanline_buf),
+				   pSrc->bits.width << 2, pSrc->bits.height))
+	    return; /* out of memory: nothing drawn, no MMX state touched */
+
+	while (height--)
+	{
+	    disty  = (y >> 8) & 0xff;
+	    idisty = 256 - disty;
+	    line   = y >> 16;
+
+	    ps0 = get_scanline (&slb, line);
+	    ps1 = get_scanline (&slb, line + 1);
+
+	    if (!ps0)
+		ps0 = loadyv12_scanline (&slb, line,
+					 srcY, srcStride, srcU, srcV, uvStride,
+					 0, pSrc->bits.width);
+
+	    if (!ps1)
+		ps1 = loadyv12_scanline (&slb, line + 1,
+					 srcY, srcStride, srcU, srcV, uvStride,
+					 0, pSrc->bits.width);
+
+	    pd = dst;
+
+	    x = x0;
+	    w = width;
+
+	    if (pSrc->common.filter == PIXMAN_FILTER_BILINEAR)
+	    {
+		while (w && x < 0)
+		{
+		    *(uint32_t *) pd = fetch_bilinear_8888 (0, 256, disty, idisty,
+							  ps0, ps1, 0);
+
+		    x  += xStep;
+		    pd += 4;
+		    w  -= 1;
+		}
+
+		while (w && x < srcEndIndex)
+		{
+		    distx  = (x >> 8) & 0xff;
+		    idistx = 256 - distx;
+
+		    *(uint32_t *) pd = fetch_bilinear_8888 (distx, idistx,
+							  disty, idisty,
+							  ps0, ps1,
+							  (x >> 14) & ~3);
+
+		    x  += xStep;
+		    pd += 4;
+		    w  -= 1;
+		}
+
+		while (w)
+		{
+		    /* Right of the image: every tap is the last column. */
+		    *(uint32_t *) pd = interpolate_bilinear_8888 (
+			0, 256, disty, idisty, ps0, ps0, ps1, ps1,
+			srcEndIndex >> 14, srcEndIndex >> 14,
+			srcEndIndex >> 14, srcEndIndex >> 14);
+		    pd += 4;
+		    w  -= 1;
+		}
+	    }
+	    else
+	    {
+		while (w && x < 0)
+		{
+		    *(uint32_t *) pd = *(uint32_t *) ps0;
+
+		    x  += xStep;
+		    pd += 4;
+		    w  -= 1;
+		}
+
+		while (w && x < srcEnd)
+		{
+		    *(uint32_t *) pd = ((uint32_t *) ps0)[x >> 16];
+
+		    x  += xStep;
+		    pd += 4;
+		    w  -= 1;
+		}
+
+		while (w)
+		{
+		    *(uint32_t *) pd = ((uint32_t *) ps0)[srcEndIndex >> 16];
+
+		    pd += 4;
+		    w  -= 1;
+		}
+	    }
+
+	    y   += yStep;
+	    dst += dstStride;
+
+	    release_scanlines (&slb);
+	}
+
+	fini_scanline_buffer (&slb);
+    }
+    else
+    {
+	uint8_t *py, *pu, *pv;
+
+	srcY += srcStride * (ySrc >> 0) + (xSrc >> 0);
+	srcU += uvStride  * (ySrc >> 1) + (xSrc >> 1);
+	srcV += uvStride  * (ySrc >> 1) + (xSrc >> 1);
+	row   = ySrc;
+
+	while (height)
+	{
+	    py = srcY;
+	    pu = srcU;
+	    pv = srcV;
+	    pd = dst;
+	    col = xSrc;
+	    w = width;
+
+	    /* Scalar pixels until py is 8-byte aligned on an even source
+	       column, so that the MMX loop starts on a whole chroma pair. */
+	    while (w && (((unsigned long) py & 7) || (col & 1)))
+	    {
+		*((uint32_t *) pd) = loadyuv (py, pu, pv);
+
+		pd += 4;
+		py += 1;
+
+		/* A chroma sample spans two columns: step after the odd one. */
+		pu += col & 1;
+		pv += col & 1;
+		col++;
+		w--;
+	    }
+
+	    while (w >= 8)
+	    {
+		mmx_loadyv12 (py, pu, pv);
+		mmx_pack8888 (pd);
+
+		py += 8;
+		pu += 4;
+		pv += 4;
+		pd += 32;
+		col += 8;
+		w -= 8;
+	    }
+
+	    while (w)
+	    {
+		*((uint32_t *) pd) = loadyuv (py, pu, pv);
+
+		pd += 4;
+		py += 1;
+
+		pu += col & 1;
+		pv += col & 1;
+		col++;
+		w--;
+	    }
+
+	    dst  += dstStride;
+	    srcY += srcStride;
+	    /* A chroma row spans two luma rows: step after the odd one. */
+	    if (row++ & 1)
+	    {
+		srcU += uvStride;
+		srcV += uvStride;
+	    }
+
+	    height--;
+	}
+    }
+
+    _mm_empty ();
+}
+
+/* Composite a packed YUY2 source onto an 8888 destination.  op and the
+   mask arguments are unused.  Supports no transform, a vertical flip and
+   axis-aligned scaling; outside the source the edge pixels repeat.
+   TODO: MMX code for yuy2 */
+void
+fbCompositeSrc_yuy2x8888mmx (pixman_op_t      op,
+			     pixman_image_t * pSrc,
+			     pixman_image_t * pMask,
+			     pixman_image_t * pDst,
+			     int16_t      xSrc,
+			     int16_t      ySrc,
+			     int16_t      xMask,
+			     int16_t      yMask,
+			     int16_t      xDst,
+			     int16_t      yDst,
+			     uint16_t     width,
+			     uint16_t     height)
+{
+    pixman_transform_t *transform = pSrc->common.transform;
+    uint8_t	  *dst, *src;
+    uint32_t	  *srcBits = pSrc->bits.bits;
+    int		  srcStride;
+    uint32_t	  *dstBits = pDst->bits.bits;
+    int		  dstStride;
+    int		  w, col;
+    uint8_t	  *pd;
+
+    dst = (uint8_t *) dstBits;
+    dstStride = pDst->bits.rowstride * sizeof (uint32_t);
+    src = (uint8_t *) srcBits;
+    srcStride = pSrc->bits.rowstride * sizeof (uint32_t);
+
+    if (transform)
+    {
+	/* transformation is a Y coordinate flip, this is achieved by
+	   moving start offsets for each plane and changing sign of stride */
+	if (transform->matrix[0][0] == (1 << 16)  &&
+	    transform->matrix[1][1] == -(1 << 16) &&
+	    transform->matrix[0][2] == 0          &&
+	    transform->matrix[1][2] == (pSrc->bits.height << 16))
+	{
+	    src = src + (pSrc->bits.height - 1) * srcStride;
+
+	    srcStride = -srcStride;
+
+	    transform = 0;
+	}
+    }
+
+    dst += dstStride * yDst + (xDst << 2);
+
+    if (transform)
+    {
+	ScanlineBuf slb;
+	uint8_t	    _scanline_buf[8192];
+	uint8_t	    *ps0, *ps1;
+	int	    x, x0, y, line, xStep, yStep;
+	int         distx, idistx, disty, idisty;
+	int	    srcEnd = pSrc->bits.width << 16;
+	int	    srcEndIndex = (pSrc->bits.width - 1) << 16;
+
+	xStep = transform->matrix[0][0];
+	yStep = transform->matrix[1][1];
+	x0 = transform->matrix[0][2] + xStep * xSrc;
+	y  = transform->matrix[1][2] + yStep * ySrc;
+
+	if (!init_scanline_buffer (&slb,
+				   _scanline_buf, sizeof (_scanline_buf),
+				   pSrc->bits.width << 2, pSrc->bits.height))
+	    return; /* out of memory: leave the destination untouched */
+
+	while (height--)
+	{
+	    disty  = (y >> 8) & 0xff;
+	    idisty = 256 - disty;
+	    line   = y >> 16;
+
+	    ps0 = get_scanline (&slb, line);
+	    ps1 = get_scanline (&slb, line + 1);
+
+	    if (!ps0)
+		ps0 = loadyuy2_scanline (&slb, line,
+					 src, srcStride,
+					 0, pSrc->bits.width);
+
+	    if (!ps1)
+		ps1 = loadyuy2_scanline (&slb, line + 1,
+					 src, srcStride,
+					 0, pSrc->bits.width);
+
+	    pd = dst;
+
+	    x = x0;
+	    w = width;
+
+	    if (pSrc->common.filter == PIXMAN_FILTER_BILINEAR)
+	    {
+		while (w && x < 0)
+		{
+		    *(uint32_t *) pd = fetch_bilinear_8888 (0, 256, disty, idisty,
+							  ps0, ps1, 0);
+
+		    x  += xStep;
+		    pd += 4;
+		    w  -= 1;
+		}
+
+		while (w && x < srcEndIndex)
+		{
+		    distx  = (x >> 8) & 0xff;
+		    idistx = 256 - distx;
+
+		    *(uint32_t *) pd = fetch_bilinear_8888 (distx, idistx,
+							  disty, idisty,
+							  ps0, ps1,
+							  (x >> 14) & ~3);
+
+		    x  += xStep;
+		    pd += 4;
+		    w  -= 1;
+		}
+
+		while (w)
+		{
+		    /* Right of the image: every tap is the last column. */
+		    *(uint32_t *) pd = interpolate_bilinear_8888 (
+			0, 256, disty, idisty, ps0, ps0, ps1, ps1,
+			srcEndIndex >> 14, srcEndIndex >> 14,
+			srcEndIndex >> 14, srcEndIndex >> 14);
+		    pd += 4;
+		    w  -= 1;
+		}
+	    }
+	    else
+	    {
+		while (w && x < 0)
+		{
+		    *(uint32_t *) pd = *(uint32_t *) ps0;
+
+		    x  += xStep;
+		    pd += 4;
+		    w  -= 1;
+		}
+
+		while (w && x < srcEnd)
+		{
+		    *(uint32_t *) pd = ((uint32_t *) ps0)[x >> 16];
+
+		    x  += xStep;
+		    pd += 4;
+		    w  -= 1;
+		}
+
+		while (w)
+		{
+		    *(uint32_t *) pd = ((uint32_t *) ps0)[srcEndIndex >> 16];
+
+		    pd += 4;
+		    w  -= 1;
+		}
+	    }
+
+	    y   += yStep;
+	    dst += dstStride;
+
+	    release_scanlines (&slb);
+	}
+
+	fini_scanline_buffer (&slb);
+    }
+    else
+    {
+	uint8_t *py, *pu, *pv;
+
+	/* Two bytes per pixel: start at the Y0 U Y1 V group holding xSrc. */
+	src += srcStride * (ySrc >> 0) + (xSrc >> 1) * 4;
+
+	while (height)
+	{
+	    py = src + (xSrc & 1) * 2;
+	    pu = src + 1;
+	    pv = src + 3;
+	    pd = dst;
+	    col = xSrc;
+	    w = width;
+
+	    while (w)
+	    {
+		*((uint32_t *) pd) = loadyuv (py, pu, pv);
+
+		pd += 4;
+		py += 2;
+
+		pu += (col & 1) * 4; /* U and V are shared by a pixel pair */
+		pv += (col & 1) * 4;
+		col++;
+		w--;
+	    }
+
+	    dst += dstStride;
+	    src += srcStride;
+
+	    height--;
+	}
+    }
+}
 
 
 #endif /* USE_MMX */
diff --git a/pixman/pixman-mmx.h b/pixman/pixman-mmx.h
index a74d4ba..a8e27e4 100644
--- a/pixman/pixman-mmx.h
+++ b/pixman/pixman-mmx.h
@@ -312,4 +312,32 @@ fbCompositeOver_x888x8x8888mmx (pixman_op_t      op,
 				uint16_t     width,
 				uint16_t     height);
 
+void
+fbCompositeSrc_yv12x8888mmx (pixman_op_t      op,
+			     pixman_image_t * pSrc,
+			     pixman_image_t * pMask,
+			     pixman_image_t * pDst,
+			     int16_t    xSrc,
+			     int16_t    ySrc,
+			     int16_t    xMask,
+			     int16_t    yMask,
+			     int16_t    xDst,
+			     int16_t    yDst,
+			     uint16_t   width,
+			     uint16_t   height);
+
+void
+fbCompositeSrc_yuy2x8888mmx (pixman_op_t      op,
+			     pixman_image_t * pSrc,
+			     pixman_image_t * pMask,
+			     pixman_image_t * pDst,
+			     int16_t    xSrc,
+			     int16_t    ySrc,
+			     int16_t    xMask,
+			     int16_t    yMask,
+			     int16_t    xDst,
+			     int16_t    yDst,
+			     uint16_t   width,
+			     uint16_t   height);
+
 #endif /* USE_MMX */
diff --git a/pixman/pixman-pict.c b/pixman/pixman-pict.c
index c7d73fc..d2bc3d3 100644
--- a/pixman/pixman-pict.c
+++ b/pixman/pixman-pict.c
@@ -1437,7 +1437,35 @@ pixman_image_composite (pixman_op_t      op,
 	    maskTransform = FALSE;
     }
 
-    if ((pSrc->type == BITS || can_get_solid (pSrc)) && (!pMask || pMask->type == BITS)
+    /* YUV is only used internally for XVideo */
+    if (pSrc->bits.format == PIXMAN_yv12 || pSrc->bits.format == PIXMAN_yuy2)
+    {
+#ifdef USE_MMX
+	/* non rotating transformation */
+	if (!pSrc->common.transform ||
+	    (pSrc->common.transform->matrix[0][1] == 0 &&
+	     pSrc->common.transform->matrix[1][0] == 0 &&
+	     pSrc->common.transform->matrix[2][0] == 0 &&
+	     pSrc->common.transform->matrix[2][1] == 0 &&
+	     pSrc->common.transform->matrix[2][2] == 1 << 16))
+	{
+	    switch (pDst->bits.format) {
+	    case PIXMAN_a8r8g8b8:
+	    case PIXMAN_x8r8g8b8:
+		if (pixman_have_mmx())
+		{
+		    if (pSrc->bits.format == PIXMAN_yv12)
+			func = fbCompositeSrc_yv12x8888mmx;
+		    else
+			func = fbCompositeSrc_yuy2x8888mmx;
+		}
+	    default:
+		break;
+	    }
+	}
+#endif
+    }
+    else if ((pSrc->type == BITS || can_get_solid (pSrc)) && (!pMask || pMask->type == BITS)
         && !srcTransform && !maskTransform
         && !maskAlphaMap && !srcAlphaMap && !dstAlphaMap
         && (pSrc->common.filter != PIXMAN_FILTER_CONVOLUTION)
-- 
2.7.4