MMX code path for YV12 copy, taken from the xserver glucose-2 branch.
author José Fonseca <jrfonseca@tungstengraphics.com>
Fri, 7 Sep 2007 09:52:01 +0000 (10:52 +0100)
committer José Fonseca <jrfonseca@tungstengraphics.com>
Fri, 7 Sep 2007 09:52:01 +0000 (10:52 +0100)
pixman/pixman-mmx.c
pixman/pixman-mmx.h
pixman/pixman-pict.c

index 8c7be6d..4fb2579 100644 (file)
@@ -30,6 +30,9 @@
  */
 #include <config.h>
 
+#include <stdlib.h>
+#include <limits.h>
+
 #ifdef USE_MMX
 
 #if defined(__amd64__) || defined(__x86_64__)
@@ -917,6 +920,188 @@ void fbComposeSetupMMX(void)
     } 
 }
 
+/*
+ * Blend one 8-bit channel from a 2x2 neighbourhood.
+ *
+ * distx/idistx weight the right/left column and disty/idisty the
+ * bottom/top row.  The callers in this file pass weights that sum to 256
+ * per axis, so the 16-bit right shift brings the result back to 8 bits.
+ */
+static __inline__ uint8_t
+interpolate_bilinear (int   distx,
+                     int   idistx,
+                     int   disty,
+                     int   idisty,
+                     uint8_t tl,
+                     uint8_t tr,
+                     uint8_t bl,
+                     uint8_t br)
+{
+    int top    = tl * idistx + tr * distx;   /* horizontal blend, upper row */
+    int bottom = bl * idistx + br * distx;   /* horizontal blend, lower row */
+
+    return (top * idisty + bottom * disty) >> 16;
+}
+
+/*
+ * Bilinearly interpolate one 4-byte pixel, one byte channel at a time.
+ *
+ * l00/l01 are the rows holding the top-left/top-right samples and l10/l11
+ * the rows holding the bottom-left/bottom-right samples; x00..x11 are BYTE
+ * offsets of the respective pixel inside those rows.  Weights are as for
+ * interpolate_bilinear().
+ *
+ * The four result bytes are returned in memory order, i.e. byte 0 of the
+ * source pixels ends up in byte 0 of the returned word.
+ */
+static __inline__ uint32_t
+interpolate_bilinear_8888 (int   distx,
+                          int   idistx,
+                          int   disty,
+                          int   idisty,
+                          uint8_t *l00,
+                          uint8_t *l01,
+                          uint8_t *l10,
+                          uint8_t *l11,
+                          int   x00,
+                          int   x01,
+                          int   x10,
+                          int   x11)
+{
+    /* A union provides both the byte view and the word view of the same
+     * storage.  Casting a uint8_t array to uint32_t * and dereferencing it
+     * violates the aliasing rules and does not guarantee alignment. */
+    union {
+       uint8_t  channel[4];
+       uint32_t pixel;
+    } out;
+    int c;
+
+    for (c = 0; c < 4; c++)
+    {
+       out.channel[c] = interpolate_bilinear (distx, idistx, disty, idisty,
+                                              l00[x00 + c], l01[x01 + c],
+                                              l10[x10 + c], l11[x11 + c]);
+    }
+
+    return out.pixel;
+}
+
+/*
+ * Interpolate a pixel from two rows whose horizontal byte offsets may
+ * differ: l0 is sampled at x0 and l1 at x1.  In each row the right-hand
+ * neighbour is the next 4-byte pixel.
+ */
+static __inline__ uint32_t
+fetch_bilinear2_8888 (int   distx,
+                     int   idistx,
+                     int   disty,
+                     int   idisty,
+                     uint8_t *l0,
+                     uint8_t *l1,
+                     int   x0,
+                     int   x1)
+{
+    int right0 = x0 + 4;   /* neighbour pixel in the upper row */
+    int right1 = x1 + 4;   /* neighbour pixel in the lower row */
+
+    return interpolate_bilinear_8888 (distx, idistx, disty, idisty,
+                                     l0, l0, l1, l1,
+                                     x0, right0, x1, right1);
+}
+
+/*
+ * Interpolate the pixel at byte offset x from rows l0 (upper) and l1
+ * (lower); both rows are sampled at the same offset and at the pixel
+ * 4 bytes to its right.
+ */
+static __inline__ uint32_t
+fetch_bilinear_8888 (int   distx,
+                    int   idistx,
+                    int   disty,
+                    int   idisty,
+                    uint8_t *l0,
+                    uint8_t *l1,
+                    int   x)
+{
+    /* Same result as fetch_bilinear2_8888 (..., x, x), written out directly. */
+    return interpolate_bilinear_8888 (distx, idistx, disty, idisty,
+                                     l0, l0, l1, l1,
+                                     x, x + 4, x, x + 4);
+}
+
+/* Eight zero bytes (two all-zero 32-bit pixels).  set_scale_steps() hands
+ * out _zero8x8 as a stand-in source row, with offset and step 0, for the
+ * row just outside the image when the repeat type is not PAD. */
+static uint32_t _zero32x2[2] = { 0x0, 0x0 };
+static uint8_t  *_zero8x8 = (uint8_t *) _zero32x2;
+
+/*
+ * Choose the two source rows (*s0 above, *s1 below) and their horizontal
+ * start/step values for vertical position `line`.
+ *
+ *  - 0 <= line < lastLine: rows line and line + 1.
+ *  - outside that range with PIXMAN_REPEAT_PAD: the first row (line < 0) or
+ *    row lastLine is used for both outputs.
+ *  - otherwise only the rows touching the image edge are produced
+ *    (line == -1 or line == lastLine); the missing neighbour is the zero
+ *    row _zero8x8 with start and step 0.
+ *
+ * srcStride is applied to the uint32_t pointer src, i.e. it counts 32-bit
+ * words.  Returns width when the outputs were written, or 0 (outputs
+ * untouched) when the line lies completely outside the image.
+ */
+static __inline__ int
+set_scale_steps (uint32_t   *src,
+                int srcStride,
+                int      xStart,
+                int      xStep,
+                int      width,
+                int      line,
+                int      lastLine,
+                pixman_repeat_t repeatType,
+                uint8_t          **s0,
+                uint8_t          **s1,
+                int      *x0,
+                int      *x0Step,
+                int      *x1,
+                int      *x1Step)
+{
+    uint8_t *upper, *lower;
+    int      upperX = xStart, upperStep = xStep;
+    int      lowerX = xStart, lowerStep = xStep;
+
+    if (line >= 0 && line < lastLine)
+    {
+       /* Fully inside: two consecutive rows. */
+       upper = (uint8_t *) (src + srcStride * line);
+       lower = (uint8_t *) (src + srcStride * (line + 1));
+    }
+    else if (repeatType == PIXMAN_REPEAT_PAD)
+    {
+       /* Replicate the nearest edge row. */
+       if (line < 0)
+           upper = (uint8_t *) src;
+       else
+           upper = (uint8_t *) (src + srcStride * lastLine);
+
+       lower = upper;
+    }
+    else if (line < 0)
+    {
+       if (line != -1)
+           return 0;
+
+       /* Zero row above the first image row. */
+       upper     = _zero8x8;
+       upperX    = 0;
+       upperStep = 0;
+
+       lower = (uint8_t *) src;
+    }
+    else
+    {
+       if (line != lastLine)
+           return 0;
+
+       /* Last image row above a zero row. */
+       upper = (uint8_t *) (src + srcStride * line);
+
+       lower     = _zero8x8;
+       lowerX    = 0;
+       lowerStep = 0;
+    }
+
+    *s0     = upper;
+    *s1     = lower;
+    *x0     = upperX;
+    *x0Step = upperStep;
+    *x1     = lowerX;
+    *x1Step = lowerStep;
+
+    return width;
+}
+
 
 /* ------------------ MMX code paths called from fbpict.c ----------------------- */
 
@@ -2970,6 +3155,825 @@ fbCompositeOver_x888x8x8888mmx (pixman_op_t      op,
     _mm_empty();
 }
 
+/* Two-slot cache of converted source rows used by the transformed paths. */
+typedef struct _ScanlineBuf {
+    pixman_bool_t lock[2];  /* slot is in use for the current output row   */
+    int    y[2];            /* source row held by each slot (SHRT_MAX after init) */
+    uint8_t *line[2];       /* start of each slot's pixel storage          */
+    int   height;           /* source height, used to clamp row numbers    */
+    uint8_t *heap;          /* malloc'ed storage, or NULL if caller's buffer is used */
+} ScanlineBuf;
+
+/*
+ * Prepare a two-row scanline cache.
+ *
+ * Each slot is `length` bytes.  The caller-supplied `buffer` of `size`
+ * bytes is used when it can hold both slots; otherwise 2 * length bytes
+ * are obtained from malloc() and remembered in slb->heap so that
+ * fini_scanline_buffer() can release them.
+ *
+ * Returns FALSE if the allocation fails (only slb->heap has been written
+ * in that case), TRUE otherwise.
+ */
+static pixman_bool_t
+init_scanline_buffer (ScanlineBuf *slb,
+                     uint8_t     *buffer,
+                     int         size,
+                     int         length,
+                     int         height)
+{
+    int      needed  = length << 1;
+    uint8_t *storage = buffer;
+
+    if (size >= needed)
+    {
+       slb->heap = NULL;
+    }
+    else
+    {
+       slb->heap = malloc (needed);
+       if (!slb->heap)
+           return FALSE;
+
+       storage = slb->heap;
+    }
+
+    /* Slot 0 */
+    slb->lock[0] = FALSE;
+    slb->y[0]    = SHRT_MAX;   /* sentinel: no row cached yet */
+    slb->line[0] = storage;
+
+    /* Slot 1 follows slot 0 directly */
+    slb->lock[1] = FALSE;
+    slb->y[1]    = SHRT_MAX;
+    slb->line[1] = storage + length;
+
+    slb->height = height;
+
+    return TRUE;
+}
+
+/* Release the heap storage taken by init_scanline_buffer(), if any. */
+static void
+fini_scanline_buffer (ScanlineBuf *slb)
+{
+    /* free (NULL) is a no-op, so no guard is needed when the caller's
+     * buffer was used. */
+    free (slb->heap);
+}
+
+/* Unlock both cache slots so they may be overwritten for the next row. */
+static __inline__ void
+release_scanlines (ScanlineBuf *slb)
+{
+    slb->lock[0] = FALSE;
+    slb->lock[1] = FALSE;
+}
+
+/* Clamp a row number to the valid range 0 .. slb->height - 1. */
+static __inline__ int
+_y_to_scanline (ScanlineBuf *slb,
+               int         y)
+{
+    if (y < 0)
+       return 0;
+
+    if (y >= slb->height)
+       return slb->height - 1;
+
+    return y;
+}
+
+/*
+ * Look up source row y (clamped to the image) in the cache.  On a hit the
+ * slot is locked and its storage returned; on a miss NULL is returned and
+ * nothing is modified.
+ */
+static __inline__ uint8_t *
+get_scanline (ScanlineBuf *slb,
+             int         y)
+{
+    int row = _y_to_scanline (slb, y);
+    int slot;
+
+    if (slb->y[0] == row)
+       slot = 0;
+    else if (slb->y[1] == row)
+       slot = 1;
+    else
+       return NULL;
+
+    slb->lock[slot] = TRUE;
+
+    return slb->line[slot];
+}
+
+/* 64-bit constants used as memory operands by mmx_loadyv12(). */
+typedef struct {
+    ullong subYw;    /* subtracted from the luma bytes                 */
+    ullong U_green;  /* multiplier: U contribution to green            */
+    ullong U_blue;   /* multiplier: U contribution to blue             */
+    ullong V_red;    /* multiplier: V contribution to red              */
+    ullong V_green;  /* multiplier: V contribution to green            */
+    ullong Y_coeff;  /* multiplier applied to luma                     */
+    ullong mmx0080;  /* subtracted from the widened U and V words      */
+    ullong mmx00ff;  /* mask keeping the low byte of every 16-bit word */
+} YUVData;
+
+/*
+ * Constants for mmx_loadyv12().  The multipliers are signed 16-bit values
+ * replicated into four lanes; the asm shifts its operands left by 3 before
+ * pmulhw (which keeps the high 16 bits), so each multiplier is effectively
+ * value / 8192.  They match the coefficients quoted in loadyuv().
+ */
+static const YUVData yuv = {
+    .subYw   = 0x1010101010101010ULL,  /* 16 in every byte                 */
+    .U_green = 0xf377f377f377f377ULL,  /* -3209 / 8192 ~= -0.391           */
+    .U_blue  = 0x408d408d408d408dULL,  /* 16525 / 8192 ~=  2.018           */
+    .V_red   = 0x3313331333133313ULL,  /* 13075 / 8192 ~=  1.596           */
+    .V_green = 0xe5fce5fce5fce5fcULL,  /* -6660 / 8192 ~= -0.813           */
+    .Y_coeff = 0x2543254325432543ULL,  /*  9539 / 8192 ~=  1.164           */
+    .mmx0080 = 0x0080008000800080ULL,  /* 128 in every 16-bit word         */
+    .mmx00ff = 0x00ff00ff00ff00ffULL   /* low-byte mask per 16-bit word    */
+};
+
+/*
+ * Convert eight YV12 pixels to planar 8-bit R, G and B using MMX.
+ *
+ * Reads 8 luma bytes at py and 4 bytes each at pu and pv (one chroma
+ * sample per two luma samples).  Nothing is stored to memory: the results
+ * are left in MMX registers for mmx_pack8888(), which must follow
+ * immediately:
+ *     mm0 = B7..B0,  mm1 = R7..R0,  mm2 = G7..G0
+ *
+ * NOTE(review): the asm statement has no clobber list although it
+ * overwrites mm0-mm7, and the hand-off to mmx_pack8888() relies on the
+ * compiler not touching MMX registers between the two statements.
+ * Confirm this is acceptable for the compilers in use.
+ */
+static __inline__ void
+mmx_loadyv12 (uint8_t *py,
+             uint8_t *pu,
+             uint8_t *pv)
+{
+    __asm__ __volatile__ (
+       "movq      %0,    %%mm6\n" /* mm6 = Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
+       "pxor      %%mm4, %%mm4\n" /* mm4 = 0                       */
+       "psubusb   %1,    %%mm6\n" /* Y -= 16                       */
+       "movd      %2,    %%mm0\n" /* mm0 = 00 00 00 00 U3 U2 U1 U0 */
+       "movq      %%mm6, %%mm7\n" /* mm7 = Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
+       "pand      %3,    %%mm6\n" /* mm6 =    Y6    Y4    Y2    Y0 */
+       "psrlw     %4,    %%mm7\n" /* mm7 =    Y7    Y5    Y3    Y1 */
+       "movd      %5,    %%mm1\n" /* mm1 = 00 00 00 00 V3 V2 V1 V0 */
+       "psllw     %6,    %%mm6\n" /* promote precision             */
+       "pmulhw    %7,    %%mm6\n" /* mm6 = luma_rgb even           */
+       "psllw     %8,    %%mm7\n" /* promote precision             */
+       "punpcklbw %%mm4, %%mm0\n" /* mm0 = U3 U2 U1 U0             */
+       "psubsw    %9,    %%mm0\n" /* U -= 128                      */
+       "punpcklbw %%mm4, %%mm1\n" /* mm1 = V3 V2 V1 V0             */
+       "pmulhw    %10,   %%mm7\n" /* mm7 = luma_rgb odd            */
+       "psllw     %11,   %%mm0\n" /* promote precision             */
+       "psubsw    %12,   %%mm1\n" /* V -= 128                      */
+       "movq      %%mm0, %%mm2\n" /* mm2 = U3 U2 U1 U0             */
+       "psllw     %13,   %%mm1\n" /* promote precision             */
+       "movq      %%mm1, %%mm4\n" /* mm4 = V3 V2 V1 V0             */
+       "pmulhw    %14,   %%mm0\n" /* mm0 = chroma_b                */
+       "pmulhw    %15,   %%mm1\n" /* mm1 = chroma_r                */
+       "movq      %%mm0, %%mm3\n" /* mm3 = chroma_b                */
+       "paddsw    %%mm6, %%mm0\n" /* mm0 = B6 B4 B2 B0             */
+       "paddsw    %%mm7, %%mm3\n" /* mm3 = B7 B5 B3 B1             */
+       "packuswb  %%mm0, %%mm0\n" /* saturate to 0-255             */
+       "pmulhw    %16,   %%mm2\n" /* mm2 = U * U_green             */
+       "packuswb  %%mm3, %%mm3\n" /* saturate to 0-255             */
+       "punpcklbw %%mm3, %%mm0\n" /* mm0 = B7 B6 B5 B4 B3 B2 B1 B0 */
+       "pmulhw    %17,   %%mm4\n" /* mm4 = V * V_green             */
+       "paddsw    %%mm4, %%mm2\n" /* mm2 = chroma_g                */
+       "movq      %%mm2, %%mm5\n" /* mm5 = chroma_g                */
+       "movq      %%mm1, %%mm4\n" /* mm4 = chroma_r                */
+       "paddsw    %%mm6, %%mm2\n" /* mm2 = G6 G4 G2 G0             */
+       "packuswb  %%mm2, %%mm2\n" /* saturate to 0-255             */
+       "paddsw    %%mm6, %%mm1\n" /* mm1 = R6 R4 R2 R0             */
+       "packuswb  %%mm1, %%mm1\n" /* saturate to 0-255             */
+       "paddsw    %%mm7, %%mm4\n" /* mm4 = R7 R5 R3 R1             */
+       "packuswb  %%mm4, %%mm4\n" /* saturate to 0-255             */
+       "paddsw    %%mm7, %%mm5\n" /* mm5 = G7 G5 G3 G1             */
+       "packuswb  %%mm5, %%mm5\n" /* saturate to 0-255             */
+       "punpcklbw %%mm4, %%mm1\n" /* mm1 = R7 R6 R5 R4 R3 R2 R1 R0 */
+       "punpcklbw %%mm5, %%mm2\n" /* mm2 = G7 G6 G5 G4 G3 G2 G1 G0 */
+       : /* no outputs */
+       /* operands %0..%17, in the order the instructions consume them;
+        * the "i" operands are the shift counts (8 and 3) */
+       : "m" (*py), "m" (yuv.subYw), "m" (*pu), "m" (yuv.mmx00ff),
+         "i" (8), "m" (*pv), "i" (3), "m" (yuv.Y_coeff),
+         "i" (3), "m" (yuv.mmx0080), "m" (yuv.Y_coeff), "i" (3),
+         "m" (yuv.mmx0080), "i" (3), "m" (yuv.U_blue), "m" (yuv.V_red),
+         "m" (yuv.U_green), "m" (yuv.V_green));
+}
+
+/*
+ * Interleave the planar bytes left in mm0 (blue), mm1 (red) and mm2
+ * (green) by mmx_loadyv12() into eight packed 32-bit pixels and store them
+ * at image[0..31].  Each pixel is written as the bytes B, G, R, 0xff in
+ * memory order, i.e. 0xffRRGGBB as a little-endian word, which is the same
+ * layout the scalar loadyuv() produces.
+ *
+ * Two fixes over the original statement:
+ *  - the fourth byte is now 0xff (mm3 = all ones) instead of 0, so pixels
+ *    converted by the MMX loop and by loadyuv() within the same row carry
+ *    the same alpha value;
+ *  - a "memory" clobber tells the compiler that the asm writes through
+ *    `image`, which it cannot infer from the register-only operand.
+ */
+static __inline__ void
+mmx_pack8888 (uint8_t *image)
+{
+    __asm__ __volatile__ (
+       "pcmpeqd   %%mm3, %%mm3\n" /* mm3 = ff ff ff ff ff ff ff ff */
+       "movq      %%mm0, %%mm6\n"
+       "punpcklbw %%mm2, %%mm6\n" /* mm6 = G3 B3 G2 B2 G1 B1 G0 B0 */
+       "movq      %%mm1, %%mm7\n"
+       "punpcklbw %%mm3, %%mm7\n" /* mm7 = ff R3 ff R2 ff R1 ff R0 */
+       "movq      %%mm0, %%mm4\n"
+       "punpcklwd %%mm7, %%mm6\n" /* pixels 0 and 1                */
+       "movq      %%mm1, %%mm5\n"
+       "movq      %%mm6, (%0)\n"
+       "movq      %%mm0, %%mm6\n"
+       "punpcklbw %%mm2, %%mm6\n"
+       "punpckhwd %%mm7, %%mm6\n" /* pixels 2 and 3                */
+       "movq      %%mm6, 8(%0)\n"
+       "punpckhbw %%mm2, %%mm4\n" /* mm4 = G7 B7 G6 B6 G5 B5 G4 B4 */
+       "punpckhbw %%mm3, %%mm5\n" /* mm5 = ff R7 ff R6 ff R5 ff R4 */
+       "punpcklwd %%mm5, %%mm4\n" /* pixels 4 and 5                */
+       "movq      %%mm4, 16(%0)\n"
+       "movq      %%mm0, %%mm4\n"
+       "punpckhbw %%mm2, %%mm4\n"
+       "punpckhwd %%mm5, %%mm4\n" /* pixels 6 and 7                */
+       "movq      %%mm4, 24(%0)\n"
+       : /* no outputs */
+       : "r" (image)
+       : "memory" );
+}
+
+/*
+ * Scalar conversion of one Y/U/V sample triple to an opaque 32-bit pixel
+ * (0xffRRGGBB).  The coefficients are 16.16 fixed point, so every channel
+ * is computed as an 8.16 value and clamped to 0 .. 0xffffff before its top
+ * eight bits are moved into place.
+ */
+static __inline__ uint32_t
+loadyuv (uint8_t *py,
+        uint8_t *pu,
+        uint8_t *pv)
+{
+    int16_t  y = *py - 16;
+    int16_t  u = *pu - 128;
+    int16_t  v = *pv - 128;
+    int32_t  luma = 0x012b27 * y;               /* 1.164 (Y - 16) */
+    int32_t  r, g, b;
+    uint32_t pixel = 0xff000000;
+
+    /* R = 1.164(Y - 16) + 1.596(V - 128) */
+    r = luma + 0x019a2e * v;
+    /* G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128) */
+    g = luma - 0x00d0f2 * v - 0x00647e * u;
+    /* B = 1.164(Y - 16) + 2.018(U - 128) */
+    b = luma + 0x0206a2 * u;
+
+    /* Negative values contribute nothing; overflow saturates the channel. */
+    if (r >= 0)
+       pixel |= (r < 0x1000000) ? (r & 0xff0000) : 0xff0000;
+
+    if (g >= 0)
+       pixel |= (g < 0x1000000) ? ((g >> 8) & 0x00ff00) : 0x00ff00;
+
+    if (b >= 0)
+       pixel |= (b < 0x1000000) ? ((b >> 16) & 0x0000ff) : 0x0000ff;
+
+    return pixel;
+}
+
+/*
+ * Convert source row y (clamped to the image) of a YV12 image to 32-bit
+ * pixels, store it in a free slot of the scanline cache, lock that slot
+ * and return its storage.
+ *
+ * srcY/srcU/srcV are the plane origins and yStride/uvStride their byte
+ * strides; chroma row y >> 1 is used.  `width` pixels are converted,
+ * starting at column 0.  `x` is accepted for interface compatibility but
+ * is not used.
+ *
+ * Leading pixels are converted one at a time until py is 8-byte aligned,
+ * then groups of eight go through the MMX path, then the remainder.
+ *
+ * Fixes over the original:
+ *  - the chroma pointers advance after every odd column.  The original
+ *    keyed this to the parity of the remaining width, which shifts the
+ *    chroma by one pixel for odd widths.
+ *  - the slot search is confined to the two existing slots instead of
+ *    walking lock[] without a bound.
+ */
+static __inline__ uint8_t *
+loadyv12_scanline (ScanlineBuf *slb,
+                  int         y,
+                  uint8_t       *srcY,
+                  int         yStride,
+                  uint8_t       *srcU,
+                  uint8_t       *srcV,
+                  int         uvStride,
+                  int         x,
+                  int         width)
+{
+    uint8_t *py, *pu, *pv, *pd;
+    int   i, w, col;
+
+    (void) x;
+
+    y = _y_to_scanline (slb, y);
+
+    /* First unlocked slot; callers never hold more than one lock here. */
+    i = slb->lock[0] ? 1 : 0;
+
+    slb->y[i]    = y;
+    slb->lock[i] = TRUE;
+
+    py = srcY + yStride  * (y >> 0);
+    pu = srcU + uvStride * (y >> 1);
+    pv = srcV + uvStride * (y >> 1);
+
+    pd = slb->line[i];
+
+    w   = width;
+    col = 0;
+
+    /* Scalar head until the luma pointer is 8-byte aligned. */
+    while (w && (unsigned long) py & 7)
+    {
+       *((uint32_t *) pd) = loadyuv (py, pu, pv);
+
+       pd += 4;
+       py += 1;
+
+       /* one chroma sample covers columns 2k and 2k + 1 */
+       if (col & 1)
+       {
+           pu += 1;
+           pv += 1;
+       }
+
+       col++;
+       w--;
+    }
+
+    /* Eight pixels (four chroma samples) per MMX iteration. */
+    while (w >= 8)
+    {
+       mmx_loadyv12 (py, pu, pv);
+       mmx_pack8888 (pd);
+
+       py += 8;
+       pu += 4;
+       pv += 4;
+       pd += 32;
+
+       col += 8;
+       w -= 8;
+    }
+
+    /* Scalar tail. */
+    while (w)
+    {
+       *((uint32_t *) pd) = loadyuv (py, pu, pv);
+
+       pd += 4;
+       py += 1;
+
+       if (col & 1)
+       {
+           pu += 1;
+           pv += 1;
+       }
+
+       col++;
+       w--;
+    }
+
+    return slb->line[i];
+}
+
+/*
+ * Convert source row y (clamped to the image) of a packed YUY2 image to
+ * 32-bit pixels, store it in a free slot of the scanline cache, lock that
+ * slot and return its storage.
+ *
+ * A YUY2 row is a sequence of 4-byte groups Y0 U Y1 V describing two
+ * pixels: luma is every second byte, and U/V sit at offsets 1 and 3 of the
+ * group.  `width` pixels are converted, starting at column 0.  `x` is
+ * accepted for interface compatibility but is not used.
+ *
+ * Fixes over the original:
+ *  - the chroma pointers move to the next group after every odd column.
+ *    The original keyed this to the parity of the remaining width, which
+ *    pairs pixels with the wrong group for odd widths.
+ *  - the slot search is confined to the two existing slots instead of
+ *    walking lock[] without a bound.
+ */
+static __inline__ uint8_t *
+loadyuy2_scanline (ScanlineBuf *slb,
+                  int         y,
+                  uint8_t       *src,
+                  int         stride,
+                  int         x,
+                  int         width)
+{
+    uint8_t *py, *pu, *pv, *pd;
+    int   i, w, col;
+
+    (void) x;
+
+    y = _y_to_scanline (slb, y);
+
+    /* First unlocked slot; callers never hold more than one lock here. */
+    i = slb->lock[0] ? 1 : 0;
+
+    slb->y[i]    = y;
+    slb->lock[i] = TRUE;
+
+    py = src + stride * (y >> 0);
+    pu = py + 1;
+    pv = py + 3;
+
+    pd = slb->line[i];
+
+    w   = width;
+    col = 0;
+
+    while (w)
+    {
+       *((uint32_t *) pd) = loadyuv (py, pu, pv);
+
+       pd += 4;
+       py += 2;
+
+       /* columns 2k and 2k + 1 share the U/V of group k */
+       if (col & 1)
+       {
+           pu += 4;
+           pv += 4;
+       }
+
+       col++;
+       w--;
+    }
+
+    return slb->line[i];
+}
+
+/* TODO: MMX code for bilinear interpolation */
+/*
+ * Convert a rectangle of a planar YV12 source image to 32-bit pixels and
+ * write it to pDst at (xDst, yDst).  op, pMask, xMask and yMask are not
+ * used.
+ *
+ * Plane layout as derived below: the Y plane starts at bits.bits, followed
+ * by the V plane and then the U plane, both with half the luma stride.
+ *
+ * Three cases:
+ *  - no transform: direct conversion, MMX for aligned groups of 8 pixels;
+ *  - a pure vertical flip (identity X scale, Y scale -1, Y translation =
+ *    height): handled as the no-transform case with negated strides;
+ *  - any other transform: only matrix[0][0], matrix[1][1] (scale) and
+ *    matrix[0][2], matrix[1][2] (translation) are applied.  Source rows
+ *    are converted into a two-row cache and sampled with nearest or
+ *    bilinear filtering; coordinates outside the image use the edge pixel.
+ *    The three per-row loops assume a positive X scale.
+ *
+ * Fixes over the original:
+ *  - a failed init_scanline_buffer() is detected and nothing is drawn,
+ *    instead of using an uninitialised cache;
+ *  - left/right padding reads pixel 0 / pixel width - 1 only.  The original
+ *    right-hand loops indexed the row at x >> 16 >= width, i.e. past the
+ *    converted data;
+ *  - in the direct path the chroma pointers advance after odd absolute
+ *    columns/rows.  The original used the parity of the remaining
+ *    width/height, which is wrong for odd sizes or odd xSrc/ySrc.
+ */
+void
+fbCompositeSrc_yv12x8888mmx (pixman_op_t      op,
+                            pixman_image_t * pSrc,
+                            pixman_image_t * pMask,
+                            pixman_image_t * pDst,
+                            int16_t      xSrc,
+                            int16_t      ySrc,
+                            int16_t      xMask,
+                            int16_t      yMask,
+                            int16_t      xDst,
+                            int16_t      yDst,
+                            uint16_t     width,
+                            uint16_t     height)
+{
+    pixman_transform_t *transform = pSrc->common.transform;
+    uint8_t      *dst, *srcY, *srcU, *srcV;
+    uint32_t     *srcBits = pSrc->bits.bits;
+    int                  srcStride, uvStride;
+    uint32_t     *dstBits = pDst->bits.bits;
+    int                  dstStride;
+    int                  offset, w;
+    uint8_t      *pd;
+
+    dst = (uint8_t *) dstBits;
+    dstStride = pDst->bits.rowstride * sizeof (uint32_t);
+
+    srcY = (uint8_t *) srcBits;
+    srcStride = pSrc->bits.rowstride;   /* in 32-bit words at this point */
+
+    /* Locate the V and U planes (offsets are in 32-bit words). */
+    if (srcStride < 0)
+    {
+       offset = ((-srcStride) >> 1) * ((pSrc->bits.height - 1) >> 1) -
+           srcStride;
+       srcV = (uint8_t *) (srcBits + offset);
+       offset += ((-srcStride) >> 1) * ((pSrc->bits.height) >> 1);
+       srcU = (uint8_t *) (srcBits + offset);
+    }
+    else
+    {
+       offset = srcStride * pSrc->bits.height;
+
+       srcV = (uint8_t *) (srcBits + offset);
+       srcU = (uint8_t *) (srcBits + offset + (offset >> 2));
+    }
+
+    /* From here on strides are in bytes. */
+    srcStride *= sizeof (uint32_t);
+    uvStride = srcStride >> 1;
+
+    if (transform)
+    {
+       /* transformation is a Y coordinate flip, this is achieved by
+          moving start offsets for each plane and changing sign of stride */
+       if (transform->matrix[0][0] == (1 << 16)  &&
+           transform->matrix[1][1] == -(1 << 16) &&
+           transform->matrix[0][2] == 0          &&
+           transform->matrix[1][2] == (pSrc->bits.height << 16))
+       {
+           srcY = srcY + ((pSrc->bits.height >> 0) - 1) * srcStride;
+           srcU = srcU + ((pSrc->bits.height >> 1) - 1) * uvStride;
+           srcV = srcV + ((pSrc->bits.height >> 1) - 1) * uvStride;
+
+           srcStride = -srcStride;
+           uvStride  = -uvStride;
+
+           transform = 0;
+       }
+    }
+
+    dst += dstStride * yDst + (xDst << 2);
+
+    if (transform)
+    {
+       ScanlineBuf slb;
+       uint8_t     _scanline_buf[8192];
+       uint8_t     *ps0, *ps1;
+       int         x, x0, y, line, xStep, yStep;
+       int         distx, idistx, disty, idisty;
+       int         srcEnd = pSrc->bits.width << 16;
+       int         srcEndIndex = (pSrc->bits.width - 1) << 16;
+       int         lastPixel = pSrc->bits.width - 1;  /* index of last column */
+       int         lastByte  = lastPixel << 2;        /* its byte offset      */
+
+       xStep = transform->matrix[0][0];
+       yStep = transform->matrix[1][1];
+
+       /* 16.16 fixed-point source coordinates of the first output pixel */
+       x0 = transform->matrix[0][2] + xStep * xSrc;
+       y  = transform->matrix[1][2] + yStep * ySrc;
+
+       /* No MMX state has been touched yet, so a plain return is safe. */
+       if (!init_scanline_buffer (&slb,
+                                  _scanline_buf, sizeof (_scanline_buf),
+                                  pSrc->bits.width << 2,
+                                  pSrc->bits.height))
+           return;
+
+       while (height--)
+       {
+           disty  = (y >> 8) & 0xff;
+           idisty = 256 - disty;
+           line   = y >> 16;
+
+           /* Reuse rows converted for the previous output row if possible. */
+           ps0 = get_scanline (&slb, line);
+           ps1 = get_scanline (&slb, line + 1);
+
+           if (!ps0)
+               ps0 = loadyv12_scanline (&slb, line,
+                                        srcY, srcStride, srcU, srcV, uvStride,
+                                        0, pSrc->bits.width);
+
+           if (!ps1)
+               ps1 = loadyv12_scanline (&slb, line + 1,
+                                        srcY, srcStride, srcU, srcV, uvStride,
+                                        0, pSrc->bits.width);
+
+           pd = dst;
+
+           x = x0;
+           w = width;
+
+           if (pSrc->common.filter == PIXMAN_FILTER_BILINEAR)
+           {
+               /* left of the image: first column, vertical blend only */
+               while (w && x < 0)
+               {
+                   *(uint32_t *) pd =
+                       interpolate_bilinear_8888 (0, 256, disty, idisty,
+                                                  ps0, ps0, ps1, ps1,
+                                                  0, 0, 0, 0);
+
+                   x  += xStep;
+                   pd += 4;
+                   w  -= 1;
+               }
+
+               /* inside: columns x >> 16 and (x >> 16) + 1 both exist */
+               while (w && x < srcEndIndex)
+               {
+                   distx  = (x >> 8) & 0xff;
+                   idistx = 256 - distx;
+
+                   *(uint32_t *) pd = fetch_bilinear_8888 (distx, idistx,
+                                                         disty, idisty,
+                                                         ps0, ps1,
+                                                         (x >> 14) & ~3);
+
+                   x  += xStep;
+                   pd += 4;
+                   w  -= 1;
+               }
+
+               /* at or right of the last column: last column only */
+               while (w)
+               {
+                   *(uint32_t *) pd =
+                       interpolate_bilinear_8888 (0, 256, disty, idisty,
+                                                  ps0, ps0, ps1, ps1,
+                                                  lastByte, lastByte,
+                                                  lastByte, lastByte);
+
+                   pd += 4;
+                   w  -= 1;
+               }
+           }
+           else
+           {
+               /* left of the image: first column */
+               while (w && x < 0)
+               {
+                   *(uint32_t *) pd = *(uint32_t *) ps0;
+
+                   x  += xStep;
+                   pd += 4;
+                   w  -= 1;
+               }
+
+               while (w && x < srcEnd)
+               {
+                   *(uint32_t *) pd = ((uint32_t *) ps0)[x >> 16];
+
+                   x  += xStep;
+                   pd += 4;
+                   w  -= 1;
+               }
+
+               /* right of the image: last column */
+               while (w)
+               {
+                   *(uint32_t *) pd = ((uint32_t *) ps0)[lastPixel];
+
+                   pd += 4;
+                   w  -= 1;
+               }
+           }
+
+           y   += yStep;
+           dst += dstStride;
+
+           release_scanlines (&slb);
+       }
+
+       fini_scanline_buffer (&slb);
+    }
+    else
+    {
+       uint8_t *py, *pu, *pv;
+       int      col, row;
+
+       srcY += srcStride * (ySrc >> 0) + (xSrc >> 0);
+       srcU += uvStride  * (ySrc >> 1) + (xSrc >> 1);
+       srcV += uvStride  * (ySrc >> 1) + (xSrc >> 1);
+
+       row = ySrc;
+
+       while (height)
+       {
+           py = srcY;
+           pu = srcU;
+           pv = srcV;
+           pd = dst;
+
+           col = xSrc;
+           w = width;
+
+           /* scalar head until the luma pointer is 8-byte aligned */
+           while (w && (unsigned long) py & 7)
+           {
+               *((uint32_t *) pd) = loadyuv (py, pu, pv);
+
+               pd += 4;
+               py += 1;
+
+               /* one chroma sample covers columns 2k and 2k + 1 */
+               if (col & 1)
+               {
+                   pu += 1;
+                   pv += 1;
+               }
+
+               col++;
+               w--;
+           }
+
+           while (w >= 8)
+           {
+               mmx_loadyv12 (py, pu, pv);
+               mmx_pack8888 (pd);
+
+               py += 8;
+               pu += 4;
+               pv += 4;
+               pd += 32;
+
+               col += 8;
+               w -= 8;
+           }
+
+           while (w)
+           {
+               *((uint32_t *) pd) = loadyuv (py, pu, pv);
+
+               pd += 4;
+               py += 1;
+
+               if (col & 1)
+               {
+                   pu += 1;
+                   pv += 1;
+               }
+
+               col++;
+               w--;
+           }
+
+           dst  += dstStride;
+           srcY += srcStride;
+
+           /* one chroma row covers luma rows 2k and 2k + 1 */
+           if (row & 1)
+           {
+               srcU += uvStride;
+               srcV += uvStride;
+           }
+
+           row++;
+           height--;
+       }
+    }
+
+    _mm_empty ();
+}
+
+/* TODO: MMX code for yuy2 */
+/*
+ * Convert a rectangle of a packed YUY2 source image to 32-bit pixels and
+ * write it to pDst at (xDst, yDst).  op, pMask, xMask and yMask are not
+ * used.  Despite the name this path is scalar only (see the TODO above).
+ *
+ * A YUY2 row is a sequence of 4-byte groups Y0 U Y1 V describing two
+ * pixels, so pixel n has its luma at byte 2n and its chroma in group
+ * n >> 1.
+ *
+ * Three cases:
+ *  - no transform: direct conversion;
+ *  - a pure vertical flip (identity X scale, Y scale -1, Y translation =
+ *    height): handled as the no-transform case with a negated stride;
+ *  - any other transform: only matrix[0][0], matrix[1][1] (scale) and
+ *    matrix[0][2], matrix[1][2] (translation) are applied.  Source rows
+ *    are converted into a two-row cache and sampled with nearest or
+ *    bilinear filtering; coordinates outside the image use the edge pixel.
+ *    The three per-row loops assume a positive X scale.
+ *
+ * Fixes over the original:
+ *  - a failed init_scanline_buffer() is detected and nothing is drawn,
+ *    instead of using an uninitialised cache;
+ *  - left/right padding reads pixel 0 / pixel width - 1 only.  The original
+ *    right-hand loops indexed the row at x >> 16 >= width, i.e. past the
+ *    converted data;
+ *  - in the direct path xSrc is converted to a byte offset (2 bytes per
+ *    pixel) with the chroma pointers placed on the enclosing group, and
+ *    the chroma pointers advance after odd absolute columns.  The original
+ *    added xSrc as a byte count and used the parity of the remaining
+ *    width.  Results are unchanged for xSrc == 0 with an even width.
+ */
+void
+fbCompositeSrc_yuy2x8888mmx (pixman_op_t      op,
+                            pixman_image_t * pSrc,
+                            pixman_image_t * pMask,
+                            pixman_image_t * pDst,
+                            int16_t      xSrc,
+                            int16_t      ySrc,
+                            int16_t      xMask,
+                            int16_t      yMask,
+                            int16_t      xDst,
+                            int16_t      yDst,
+                            uint16_t     width,
+                            uint16_t     height)
+{
+    pixman_transform_t *transform = pSrc->common.transform;
+    uint8_t      *dst, *src;
+    uint32_t     *srcBits = pSrc->bits.bits;
+    int                  srcStride;
+    uint32_t     *dstBits = pDst->bits.bits;
+    int                  dstStride;
+    int                  w;
+    uint8_t      *pd;
+
+    dst = (uint8_t *) dstBits;
+    dstStride = pDst->bits.rowstride * sizeof (uint32_t);
+
+    src = (uint8_t *) srcBits;
+    srcStride = pSrc->bits.rowstride * sizeof (uint32_t);
+
+    if (transform)
+    {
+       /* transformation is a Y coordinate flip, this is achieved by
+          moving the start offset and changing sign of stride */
+       if (transform->matrix[0][0] == (1 << 16)  &&
+           transform->matrix[1][1] == -(1 << 16) &&
+           transform->matrix[0][2] == 0          &&
+           transform->matrix[1][2] == (pSrc->bits.height << 16))
+       {
+           src = src + (pSrc->bits.height - 1) * srcStride;
+
+           srcStride = -srcStride;
+
+           transform = 0;
+       }
+    }
+
+    dst += dstStride * yDst + (xDst << 2);
+
+    if (transform)
+    {
+       ScanlineBuf slb;
+       uint8_t     _scanline_buf[8192];
+       uint8_t     *ps0, *ps1;
+       int         x, x0, y, line, xStep, yStep;
+       int         distx, idistx, disty, idisty;
+       int         srcEnd = pSrc->bits.width << 16;
+       int         srcEndIndex = (pSrc->bits.width - 1) << 16;
+       int         lastPixel = pSrc->bits.width - 1;  /* index of last column */
+       int         lastByte  = lastPixel << 2;        /* its byte offset      */
+
+       xStep = transform->matrix[0][0];
+       yStep = transform->matrix[1][1];
+
+       /* 16.16 fixed-point source coordinates of the first output pixel */
+       x0 = transform->matrix[0][2] + xStep * xSrc;
+       y  = transform->matrix[1][2] + yStep * ySrc;
+
+       if (!init_scanline_buffer (&slb,
+                                  _scanline_buf, sizeof (_scanline_buf),
+                                  pSrc->bits.width << 2,
+                                  pSrc->bits.height))
+           return;
+
+       while (height--)
+       {
+           disty  = (y >> 8) & 0xff;
+           idisty = 256 - disty;
+           line   = y >> 16;
+
+           /* Reuse rows converted for the previous output row if possible. */
+           ps0 = get_scanline (&slb, line);
+           ps1 = get_scanline (&slb, line + 1);
+
+           if (!ps0)
+               ps0 = loadyuy2_scanline (&slb, line,
+                                        src, srcStride,
+                                        0, pSrc->bits.width);
+
+           if (!ps1)
+               ps1 = loadyuy2_scanline (&slb, line + 1,
+                                        src, srcStride,
+                                        0, pSrc->bits.width);
+
+           pd = dst;
+
+           x = x0;
+           w = width;
+
+           if (pSrc->common.filter == PIXMAN_FILTER_BILINEAR)
+           {
+               /* left of the image: first column, vertical blend only */
+               while (w && x < 0)
+               {
+                   *(uint32_t *) pd =
+                       interpolate_bilinear_8888 (0, 256, disty, idisty,
+                                                  ps0, ps0, ps1, ps1,
+                                                  0, 0, 0, 0);
+
+                   x  += xStep;
+                   pd += 4;
+                   w  -= 1;
+               }
+
+               /* inside: columns x >> 16 and (x >> 16) + 1 both exist */
+               while (w && x < srcEndIndex)
+               {
+                   distx  = (x >> 8) & 0xff;
+                   idistx = 256 - distx;
+
+                   *(uint32_t *) pd = fetch_bilinear_8888 (distx, idistx,
+                                                         disty, idisty,
+                                                         ps0, ps1,
+                                                         (x >> 14) & ~3);
+
+                   x  += xStep;
+                   pd += 4;
+                   w  -= 1;
+               }
+
+               /* at or right of the last column: last column only */
+               while (w)
+               {
+                   *(uint32_t *) pd =
+                       interpolate_bilinear_8888 (0, 256, disty, idisty,
+                                                  ps0, ps0, ps1, ps1,
+                                                  lastByte, lastByte,
+                                                  lastByte, lastByte);
+
+                   pd += 4;
+                   w  -= 1;
+               }
+           }
+           else
+           {
+               /* left of the image: first column */
+               while (w && x < 0)
+               {
+                   *(uint32_t *) pd = *(uint32_t *) ps0;
+
+                   x  += xStep;
+                   pd += 4;
+                   w  -= 1;
+               }
+
+               while (w && x < srcEnd)
+               {
+                   *(uint32_t *) pd = ((uint32_t *) ps0)[x >> 16];
+
+                   x  += xStep;
+                   pd += 4;
+                   w  -= 1;
+               }
+
+               /* right of the image: last column */
+               while (w)
+               {
+                   *(uint32_t *) pd = ((uint32_t *) ps0)[lastPixel];
+
+                   pd += 4;
+                   w  -= 1;
+               }
+           }
+
+           y   += yStep;
+           dst += dstStride;
+
+           release_scanlines (&slb);
+       }
+
+       fini_scanline_buffer (&slb);
+    }
+    else
+    {
+       uint8_t *py, *pu, *pv;
+       int      col;
+
+       src += srcStride * (ySrc >> 0);
+
+       while (height)
+       {
+           /* luma of pixel xSrc; U and V of the group that contains it */
+           py = src + 2 * xSrc;
+           pu = src + 4 * (xSrc >> 1) + 1;
+           pv = pu + 2;
+           pd = dst;
+
+           col = xSrc;
+           w = width;
+
+           while (w)
+           {
+               *((uint32_t *) pd) = loadyuv (py, pu, pv);
+
+               pd += 4;
+               py += 2;
+
+               /* columns 2k and 2k + 1 share the U/V of group k */
+               if (col & 1)
+               {
+                   pu += 4;
+                   pv += 4;
+               }
+
+               col++;
+               w--;
+           }
+
+           dst += dstStride;
+           src += srcStride;
+
+           height--;
+       }
+    }
+}
 
 
 #endif /* USE_MMX */
index a74d4ba..a8e27e4 100644 (file)
@@ -312,4 +312,32 @@ fbCompositeOver_x888x8x8888mmx (pixman_op_t      op,
                                uint16_t     width,
                                uint16_t     height);
 
+/* Fast path selected by pixman_image_composite() for a PIXMAN_yv12 source
+ * and an a8r8g8b8/x8r8g8b8 destination. */
+void
+fbCompositeSrc_yv12x8888mmx (pixman_op_t      op,
+                            pixman_image_t * pSrc,
+                            pixman_image_t * pMask,
+                            pixman_image_t * pDst,
+                            int16_t    xSrc,
+                            int16_t    ySrc,
+                            int16_t    xMask,
+                            int16_t    yMask,
+                            int16_t    xDst,
+                            int16_t    yDst,
+                            uint16_t   width,
+                            uint16_t   height);
+
+/* Fast path selected by pixman_image_composite() for a PIXMAN_yuy2 source
+ * and an a8r8g8b8/x8r8g8b8 destination. */
+void
+fbCompositeSrc_yuy2x8888mmx (pixman_op_t      op,
+                            pixman_image_t * pSrc,
+                            pixman_image_t * pMask,
+                            pixman_image_t * pDst,
+                            int16_t    xSrc,
+                            int16_t    ySrc,
+                            int16_t    xMask,
+                            int16_t    yMask,
+                            int16_t    xDst,
+                            int16_t    yDst,
+                            uint16_t   width,
+                            uint16_t   height);
+
 #endif /* USE_MMX */
index c7d73fc..d2bc3d3 100644 (file)
@@ -1437,7 +1437,35 @@ pixman_image_composite (pixman_op_t      op,
            maskTransform = FALSE;
     }
 
-    if ((pSrc->type == BITS || can_get_solid (pSrc)) && (!pMask || pMask->type == BITS)
+    /* YUV is only used internally for XVideo */
+    if (pSrc->bits.format == PIXMAN_yv12 || pSrc->bits.format == PIXMAN_yuy2)
+    {
+#ifdef USE_MMX
+       /* non rotating transformation */
+       if (!pSrc->common.transform ||
+           (pSrc->common.transform->matrix[0][1] == 0 &&
+            pSrc->common.transform->matrix[1][0] == 0 &&
+            pSrc->common.transform->matrix[2][0] == 0 &&
+            pSrc->common.transform->matrix[2][1] == 0 &&
+            pSrc->common.transform->matrix[2][2] == 1 << 16))
+       {
+           switch (pDst->bits.format) {
+           case PIXMAN_a8r8g8b8:
+           case PIXMAN_x8r8g8b8:
+               if (pixman_have_mmx())
+               {
+                   if (pSrc->bits.format == PIXMAN_yv12)
+                       func = fbCompositeSrc_yv12x8888mmx;
+                   else
+                       func = fbCompositeSrc_yuy2x8888mmx;
+               }
+           default:
+               break;
+           }
+       }
+#endif
+    }
+    else if ((pSrc->type == BITS || can_get_solid (pSrc)) && (!pMask || pMask->type == BITS)
         && !srcTransform && !maskTransform
         && !maskAlphaMap && !srcAlphaMap && !dstAlphaMap
         && (pSrc->common.filter != PIXMAN_FILTER_CONVOLUTION)