Use MAKE_ACCESSORS() to generate accessors for the a1 format.
[profile/ivi/pixman.git] / pixman / pixman-fast-path.c
index 1bdb323..bbdc8e8 100644 (file)
  * Author:  Keith Packard, SuSE, Inc.
  */
 
+#ifdef HAVE_CONFIG_H
 #include <config.h>
+#endif
 #include <string.h>
+#include <stdlib.h>
 #include "pixman-private.h"
 #include "pixman-combine32.h"
+#include "pixman-inlines.h"
 
 static force_inline uint32_t
 fetch_24 (uint8_t *a)
@@ -50,7 +54,8 @@ fetch_24 (uint8_t *a)
 }
 
 static force_inline void
-store_24 (uint8_t *a, uint32_t v)
+store_24 (uint8_t *a,
+          uint32_t v)
 {
     if (((unsigned long)a) & 1)
     {
@@ -60,7 +65,7 @@ store_24 (uint8_t *a, uint32_t v)
 #else
        *a = (uint8_t) (v);
        *(uint16_t *)(a + 1) = (uint16_t) (v >> 8);
-#endif 
+#endif
     }
     else
     {
@@ -70,26 +75,28 @@ store_24 (uint8_t *a, uint32_t v)
 #else
        *(uint16_t *)a = (uint16_t)v;
        *(a + 2) = (uint8_t)(v >> 16);
-#endif 
+#endif
     }
 }
 
 static force_inline uint32_t
-fbOver (uint32_t src, uint32_t dest)
+over (uint32_t src,
+      uint32_t dest)
 {
-    uint32_t a = ~src >> 24; 
+    uint32_t a = ~src >> 24;
 
-    FbByteMulAdd(dest, a, src);
+    UN8x4_MUL_UN8_ADD_UN8x4 (dest, a, src);
 
     return dest;
 }
 
 static uint32_t
-fbIn (uint32_t x, uint8_t y)
+in (uint32_t x,
+    uint8_t  y)
 {
-    uint16_t  a = y;
+    uint16_t a = y;
 
-    FbByteMul (x, a);
+    UN8x4_MUL_UN8 (x, a);
 
     return x;
 }
@@ -97,43 +104,33 @@ fbIn (uint32_t x, uint8_t y)
 /*
  * Naming convention:
  *
- *  opSRCxMASKxDST
+ *  op_src_mask_dest
  */
 static void
-fast_CompositeOver_x888_8_8888 (pixman_implementation_t *imp,
-                            pixman_op_t      op,
-                            pixman_image_t * src_image,
-                            pixman_image_t * mask_image,
-                            pixman_image_t * dst_image,
-                            int32_t      src_x,
-                            int32_t      src_y,
-                            int32_t      mask_x,
-                            int32_t      mask_y,
-                            int32_t      dest_x,
-                            int32_t      dest_y,
-                            int32_t     width,
-                            int32_t     height)
+fast_composite_over_x888_8_8888 (pixman_implementation_t *imp,
+                                 pixman_composite_info_t *info)
 {
-    uint32_t   *src, *srcLine;
-    uint32_t    *dst, *dstLine;
-    uint8_t    *mask, *maskLine;
-    int                 srcStride, maskStride, dstStride;
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t    *src, *src_line;
+    uint32_t    *dst, *dst_line;
+    uint8_t     *mask, *mask_line;
+    int src_stride, mask_stride, dst_stride;
     uint8_t m;
     uint32_t s, d;
-    uint16_t w;
+    int32_t w;
 
-    fbComposeGetStart (dst_image, dest_x, dest_y, uint32_t, dstStride, dstLine, 1);
-    fbComposeGetStart (mask_image, mask_x, mask_y, uint8_t, maskStride, maskLine, 1);
-    fbComposeGetStart (src_image, src_x, src_y, uint32_t, srcStride, srcLine, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
 
     while (height--)
     {
-       src = srcLine;
-       srcLine += srcStride;
-       dst = dstLine;
-       dstLine += dstStride;
-       mask = maskLine;
-       maskLine += maskStride;
+       src = src_line;
+       src_line += src_stride;
+       dst = dst_line;
+       dst_line += dst_stride;
+       mask = mask_line;
+       mask_line += mask_stride;
 
        w = width;
        while (w--)
@@ -144,11 +141,13 @@ fast_CompositeOver_x888_8_8888 (pixman_implementation_t *imp,
                s = *src | 0xff000000;
 
                if (m == 0xff)
+               {
                    *dst = s;
+               }
                else
                {
-                   d = fbIn (s, m);
-                   *dst = fbOver (d, *dst);
+                   d = in (s, m);
+                   *dst = over (d, *dst);
                }
            }
            src++;
@@ -158,54 +157,43 @@ fast_CompositeOver_x888_8_8888 (pixman_implementation_t *imp,
 }
 
 static void
-fast_CompositeIn_n_8_8 (pixman_implementation_t *imp,
-                             pixman_op_t      op,
-                             pixman_image_t    *iSrc,
-                             pixman_image_t    *iMask,
-                             pixman_image_t    *iDst,
-                             int32_t      src_x,
-                             int32_t      src_y,
-                             int32_t      mask_x,
-                             int32_t      mask_y,
-                             int32_t      dest_x,
-                             int32_t      dest_y,
-                             int32_t     width,
-                             int32_t     height)
+fast_composite_in_n_8_8 (pixman_implementation_t *imp,
+                         pixman_composite_info_t *info)
 {
-    uint32_t   src, srca;
-    uint8_t    *dstLine, *dst;
-    uint8_t    *maskLine, *mask, m;
-    int        dstStride, maskStride;
-    uint16_t   w;
-    uint16_t    t;
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src, srca;
+    uint8_t     *dst_line, *dst;
+    uint8_t     *mask_line, *mask, m;
+    int dst_stride, mask_stride;
+    int32_t w;
+    uint16_t t;
 
-    src = _pixman_image_get_solid(iSrc, iDst->bits.format);
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
 
     srca = src >> 24;
 
-    fbComposeGetStart (iDst, dest_x, dest_y, uint8_t, dstStride, dstLine, 1);
-    fbComposeGetStart (iMask, mask_x, mask_y, uint8_t, maskStride, maskLine, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
 
-    if (srca == 0xff) {
+    if (srca == 0xff)
+    {
        while (height--)
        {
-           dst = dstLine;
-           dstLine += dstStride;
-           mask = maskLine;
-           maskLine += maskStride;
+           dst = dst_line;
+           dst_line += dst_stride;
+           mask = mask_line;
+           mask_line += mask_stride;
            w = width;
 
            while (w--)
            {
                m = *mask++;
+
                if (m == 0)
-               {
                    *dst = 0;
-               }
                else if (m != 0xff)
-               {
-                   *dst = IntMult(m, *dst, t);
-               }
+                   *dst = MUL_UN8 (m, *dst, t);
+
                dst++;
            }
        }
@@ -214,116 +202,91 @@ fast_CompositeIn_n_8_8 (pixman_implementation_t *imp,
     {
        while (height--)
        {
-           dst = dstLine;
-           dstLine += dstStride;
-           mask = maskLine;
-           maskLine += maskStride;
+           dst = dst_line;
+           dst_line += dst_stride;
+           mask = mask_line;
+           mask_line += mask_stride;
            w = width;
 
            while (w--)
            {
                m = *mask++;
-               m = IntMult(m, srca, t);
+               m = MUL_UN8 (m, srca, t);
+
                if (m == 0)
-               {
                    *dst = 0;
-               }
                else if (m != 0xff)
-               {
-                   *dst = IntMult(m, *dst, t);
-               }
+                   *dst = MUL_UN8 (m, *dst, t);
+
                dst++;
            }
        }
     }
 }
 
-
 static void
-fast_CompositeIn_8_8 (pixman_implementation_t *imp,
-                     pixman_op_t      op,
-                     pixman_image_t  *iSrc,
-                     pixman_image_t  *iMask,
-                     pixman_image_t  *iDst,
-                     int32_t          src_x,
-                     int32_t          src_y,
-                     int32_t          mask_x,
-                     int32_t          mask_y,
-                     int32_t          dest_x,
-                     int32_t          dest_y,
-                     int32_t         width,
-                     int32_t         height)
+fast_composite_in_8_8 (pixman_implementation_t *imp,
+                       pixman_composite_info_t *info)
 {
-    uint8_t    *dstLine, *dst;
-    uint8_t    *srcLine, *src;
-    int        dstStride, srcStride;
-    uint16_t   w;
-    uint8_t    s;
-    uint16_t   t;
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t     *dst_line, *dst;
+    uint8_t     *src_line, *src;
+    int dst_stride, src_stride;
+    int32_t w;
+    uint8_t s;
+    uint16_t t;
 
-    fbComposeGetStart (iSrc, src_x, src_y, uint8_t, srcStride, srcLine, 1);
-    fbComposeGetStart (iDst, dest_x, dest_y, uint8_t, dstStride, dstLine, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
 
     while (height--)
     {
-       dst = dstLine;
-       dstLine += dstStride;
-       src = srcLine;
-       srcLine += srcStride;
+       dst = dst_line;
+       dst_line += dst_stride;
+       src = src_line;
+       src_line += src_stride;
        w = width;
 
        while (w--)
        {
            s = *src++;
+
            if (s == 0)
-           {
                *dst = 0;
-           }
            else if (s != 0xff)
-           {
-               *dst = IntMult(s, *dst, t);
-           }
+               *dst = MUL_UN8 (s, *dst, t);
+
            dst++;
        }
     }
 }
 
 static void
-fast_CompositeOver_n_8_8888 (pixman_implementation_t *imp,
-                              pixman_op_t      op,
-                              pixman_image_t * src_image,
-                              pixman_image_t * mask_image,
-                              pixman_image_t * dst_image,
-                              int32_t      src_x,
-                              int32_t      src_y,
-                              int32_t      mask_x,
-                              int32_t      mask_y,
-                              int32_t      dest_x,
-                              int32_t      dest_y,
-                              int32_t     width,
-                              int32_t     height)
+fast_composite_over_n_8_8888 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
 {
-    uint32_t    src, srca;
-    uint32_t   *dstLine, *dst, d;
-    uint8_t    *maskLine, *mask, m;
-    int                 dstStride, maskStride;
-    uint16_t    w;
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src, srca;
+    uint32_t    *dst_line, *dst, d;
+    uint8_t     *mask_line, *mask, m;
+    int dst_stride, mask_stride;
+    int32_t w;
 
-    src = _pixman_image_get_solid(src_image, dst_image->bits.format);
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
 
     srca = src >> 24;
     if (src == 0)
        return;
 
-    fbComposeGetStart (dst_image, dest_x, dest_y, uint32_t, dstStride, dstLine, 1);
-    fbComposeGetStart (mask_image, mask_x, mask_y, uint8_t, maskStride, maskLine, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
 
     while (height--)
     {
-       dst = dstLine;
-       dstLine += dstStride;
-       mask = maskLine;
-       maskLine += maskStride;
+       dst = dst_line;
+       dst_line += dst_stride;
+       mask = mask_line;
+       mask_line += mask_stride;
        w = width;
 
        while (w--)
@@ -334,12 +297,12 @@ fast_CompositeOver_n_8_8888 (pixman_implementation_t *imp,
                if (srca == 0xff)
                    *dst = src;
                else
-                   *dst = fbOver (src, *dst);
+                   *dst = over (src, *dst);
            }
            else if (m)
            {
-               d = fbIn (src, m);
-               *dst = fbOver (d, *dst);
+               d = in (src, m);
+               *dst = over (d, *dst);
            }
            dst++;
        }
@@ -347,41 +310,77 @@ fast_CompositeOver_n_8_8888 (pixman_implementation_t *imp,
 }
 
 static void
-fast_CompositeOver_n_8888_8888_ca (pixman_implementation_t *imp,
-                                  pixman_op_t op,
-                                  pixman_image_t * src_image,
-                                  pixman_image_t * mask_image,
-                                  pixman_image_t * dst_image,
-                                  int32_t      src_x,
-                                  int32_t      src_y,
-                                  int32_t      mask_x,
-                                  int32_t      mask_y,
-                                  int32_t      dest_x,
-                                  int32_t      dest_y,
-                                  int32_t     width,
-                                  int32_t     height)
+fast_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
+                                  pixman_composite_info_t *info)
 {
-    uint32_t   src, srca;
-    uint32_t   *dstLine, *dst, d;
-    uint32_t   *maskLine, *mask, ma;
-    int        dstStride, maskStride;
-    uint16_t   w;
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src, s;
+    uint32_t    *dst_line, *dst, d;
+    uint32_t    *mask_line, *mask, ma;
+    int dst_stride, mask_stride;
+    int32_t w;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    if (src == 0)
+       return;
 
-    src = _pixman_image_get_solid(src_image, dst_image->bits.format);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       mask = mask_line;
+       mask_line += mask_stride;
+       w = width;
+
+       while (w--)
+       {
+           ma = *mask++;
+
+           if (ma)
+           {
+               d = *dst;
+               s = src;
+
+               UN8x4_MUL_UN8x4_ADD_UN8x4 (s, ma, d);
+
+               *dst = s;
+           }
+
+           dst++;
+       }
+    }
+}
+
+static void
+fast_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
+                                    pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src, srca, s;
+    uint32_t    *dst_line, *dst, d;
+    uint32_t    *mask_line, *mask, ma;
+    int dst_stride, mask_stride;
+    int32_t w;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
 
     srca = src >> 24;
     if (src == 0)
        return;
 
-    fbComposeGetStart (dst_image, dest_x, dest_y, uint32_t, dstStride, dstLine, 1);
-    fbComposeGetStart (mask_image, mask_x, mask_y, uint32_t, maskStride, maskLine, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
 
     while (height--)
     {
-       dst = dstLine;
-       dstLine += dstStride;
-       mask = maskLine;
-       maskLine += maskStride;
+       dst = dst_line;
+       dst_line += dst_stride;
+       mask = mask_line;
+       mask_line += mask_stride;
        w = width;
 
        while (w--)
@@ -392,16 +391,17 @@ fast_CompositeOver_n_8888_8888_ca (pixman_implementation_t *imp,
                if (srca == 0xff)
                    *dst = src;
                else
-                   *dst = fbOver (src, *dst);
+                   *dst = over (src, *dst);
            }
            else if (ma)
            {
                d = *dst;
+               s = src;
 
-               FbByteMulC (src, ma);
-               FbByteMul (ma, srca);
+               UN8x4_MUL_UN8x4 (s, ma);
+               UN8x4_MUL_UN8 (ma, srca);
                ma = ~ma;
-               FbByteMulAddC (d, ma, src);
+               UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ma, s);
 
                *dst = d;
            }
@@ -412,42 +412,32 @@ fast_CompositeOver_n_8888_8888_ca (pixman_implementation_t *imp,
 }
 
 static void
-fast_CompositeOver_n_8_0888 (pixman_implementation_t *imp,
-                              pixman_op_t op,
-                              pixman_image_t * src_image,
-                              pixman_image_t * mask_image,
-                              pixman_image_t * dst_image,
-                              int32_t      src_x,
-                              int32_t      src_y,
-                              int32_t      mask_x,
-                              int32_t      mask_y,
-                              int32_t      dest_x,
-                              int32_t      dest_y,
-                              int32_t     width,
-                              int32_t     height)
+fast_composite_over_n_8_0888 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
 {
-    uint32_t   src, srca;
-    uint8_t    *dstLine, *dst;
-    uint32_t   d;
-    uint8_t    *maskLine, *mask, m;
-    int        dstStride, maskStride;
-    uint16_t   w;
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src, srca;
+    uint8_t     *dst_line, *dst;
+    uint32_t d;
+    uint8_t     *mask_line, *mask, m;
+    int dst_stride, mask_stride;
+    int32_t w;
 
-    src = _pixman_image_get_solid(src_image, dst_image->bits.format);
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
 
     srca = src >> 24;
     if (src == 0)
        return;
 
-    fbComposeGetStart (dst_image, dest_x, dest_y, uint8_t, dstStride, dstLine, 3);
-    fbComposeGetStart (mask_image, mask_x, mask_y, uint8_t, maskStride, maskLine, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 3);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
 
     while (height--)
     {
-       dst = dstLine;
-       dstLine += dstStride;
-       mask = maskLine;
-       maskLine += maskStride;
+       dst = dst_line;
+       dst_line += dst_stride;
+       mask = mask_line;
+       mask_line += mask_stride;
        w = width;
 
        while (w--)
@@ -456,18 +446,20 @@ fast_CompositeOver_n_8_0888 (pixman_implementation_t *imp,
            if (m == 0xff)
            {
                if (srca == 0xff)
+               {
                    d = src;
+               }
                else
                {
-                   d = fetch_24(dst);
-                   d = fbOver (src, d);
+                   d = fetch_24 (dst);
+                   d = over (src, d);
                }
-               store_24(dst, d);
+               store_24 (dst, d);
            }
            else if (m)
            {
-               d = fbOver (fbIn(src,m), fetch_24(dst));
-               store_24(dst, d);
+               d = over (in (src, m), fetch_24 (dst));
+               store_24 (dst, d);
            }
            dst += 3;
        }
@@ -475,42 +467,32 @@ fast_CompositeOver_n_8_0888 (pixman_implementation_t *imp,
 }
 
 static void
-fast_CompositeOver_n_8_0565 (pixman_implementation_t *imp,
-                              pixman_op_t op,
-                                 pixman_image_t * src_image,
-                                 pixman_image_t * mask_image,
-                                 pixman_image_t * dst_image,
-                                 int32_t      src_x,
-                                 int32_t      src_y,
-                                 int32_t      mask_x,
-                                 int32_t      mask_y,
-                                 int32_t      dest_x,
-                                 int32_t      dest_y,
-                                 int32_t     width,
-                                 int32_t     height)
+fast_composite_over_n_8_0565 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
 {
-    uint32_t   src, srca;
-    uint16_t   *dstLine, *dst;
-    uint32_t   d;
-    uint8_t    *maskLine, *mask, m;
-    int        dstStride, maskStride;
-    uint16_t   w;
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src, srca;
+    uint16_t    *dst_line, *dst;
+    uint32_t d;
+    uint8_t     *mask_line, *mask, m;
+    int dst_stride, mask_stride;
+    int32_t w;
 
-    src = _pixman_image_get_solid(src_image, dst_image->bits.format);
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
 
     srca = src >> 24;
     if (src == 0)
        return;
 
-    fbComposeGetStart (dst_image, dest_x, dest_y, uint16_t, dstStride, dstLine, 1);
-    fbComposeGetStart (mask_image, mask_x, mask_y, uint8_t, maskStride, maskLine, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
 
     while (height--)
     {
-       dst = dstLine;
-       dstLine += dstStride;
-       mask = maskLine;
-       maskLine += maskStride;
+       dst = dst_line;
+       dst_line += dst_stride;
+       mask = mask_line;
+       mask_line += mask_stride;
        w = width;
 
        while (w--)
@@ -519,19 +501,21 @@ fast_CompositeOver_n_8_0565 (pixman_implementation_t *imp,
            if (m == 0xff)
            {
                if (srca == 0xff)
+               {
                    d = src;
+               }
                else
                {
                    d = *dst;
-                   d = fbOver (src, CONVERT_0565_TO_0888(d));
+                   d = over (src, CONVERT_0565_TO_0888 (d));
                }
-               *dst = CONVERT_8888_TO_0565(d);
+               *dst = CONVERT_8888_TO_0565 (d);
            }
            else if (m)
            {
                d = *dst;
-               d = fbOver (fbIn(src,m), CONVERT_0565_TO_0888(d));
-               *dst = CONVERT_8888_TO_0565(d);
+               d = over (in (src, m), CONVERT_0565_TO_0888 (d));
+               *dst = CONVERT_8888_TO_0565 (d);
            }
            dst++;
        }
@@ -539,45 +523,35 @@ fast_CompositeOver_n_8_0565 (pixman_implementation_t *imp,
 }
 
 static void
-fast_CompositeOver_n_8888_0565_ca (pixman_implementation_t *imp,
-                                  pixman_op_t op,
-                                  pixman_image_t * src_image,
-                                  pixman_image_t * mask_image,
-                                  pixman_image_t * dst_image,
-                                  int32_t      src_x,
-                                  int32_t      src_y,
-                                  int32_t      mask_x,
-                                  int32_t      mask_y,
-                                  int32_t      dest_x,
-                                  int32_t      dest_y,
-                                  int32_t     width,
-                                  int32_t     height)
+fast_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
+                                    pixman_composite_info_t *info)
 {
-    uint32_t   src, srca;
-    uint16_t   src16;
-    uint16_t   *dstLine, *dst;
-    uint32_t   d;
-    uint32_t   *maskLine, *mask, ma;
-    int        dstStride, maskStride;
-    uint16_t   w;
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t  src, srca, s;
+    uint16_t  src16;
+    uint16_t *dst_line, *dst;
+    uint32_t  d;
+    uint32_t *mask_line, *mask, ma;
+    int dst_stride, mask_stride;
+    int32_t w;
 
-    src = _pixman_image_get_solid(src_image, dst_image->bits.format);
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
 
     srca = src >> 24;
     if (src == 0)
        return;
 
-    src16 = CONVERT_8888_TO_0565(src);
+    src16 = CONVERT_8888_TO_0565 (src);
 
-    fbComposeGetStart (dst_image, dest_x, dest_y, uint16_t, dstStride, dstLine, 1);
-    fbComposeGetStart (mask_image, mask_x, mask_y, uint32_t, maskStride, maskLine, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
 
     while (height--)
     {
-       dst = dstLine;
-       dstLine += dstStride;
-       mask = maskLine;
-       maskLine += maskStride;
+       dst = dst_line;
+       dst_line += dst_stride;
+       mask = mask_line;
+       mask_line += mask_stride;
        w = width;
 
        while (w--)
@@ -592,21 +566,23 @@ fast_CompositeOver_n_8888_0565_ca (pixman_implementation_t *imp,
                else
                {
                    d = *dst;
-                   d = fbOver (src, CONVERT_0565_TO_0888(d));
-                   *dst = CONVERT_8888_TO_0565(d);
+                   d = over (src, CONVERT_0565_TO_0888 (d));
+                   *dst = CONVERT_8888_TO_0565 (d);
                }
            }
            else if (ma)
            {
                d = *dst;
-               d = CONVERT_0565_TO_0888(d);
+               d = CONVERT_0565_TO_0888 (d);
 
-               FbByteMulC (src, ma);
-               FbByteMul (ma, srca);
+               s = src;
+
+               UN8x4_MUL_UN8x4 (s, ma);
+               UN8x4_MUL_UN8 (ma, srca);
                ma = ~ma;
-               FbByteMulAddC (d, ma, src);
-               
-               *dst = CONVERT_8888_TO_0565(d);
+               UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ma, s);
+
+               *dst = CONVERT_8888_TO_0565 (d);
            }
            dst++;
        }
@@ -615,34 +591,24 @@ fast_CompositeOver_n_8888_0565_ca (pixman_implementation_t *imp,
 
 static void
 fast_composite_over_8888_8888 (pixman_implementation_t *imp,
-                         pixman_op_t op,
-                        pixman_image_t * src_image,
-                        pixman_image_t * mask_image,
-                        pixman_image_t * dst_image,
-                        int32_t      src_x,
-                        int32_t      src_y,
-                        int32_t      mask_x,
-                        int32_t      mask_y,
-                        int32_t      dest_x,
-                        int32_t      dest_y,
-                        int32_t     width,
-                        int32_t     height)
+                               pixman_composite_info_t *info)
 {
-    uint32_t   *dstLine, *dst;
-    uint32_t   *srcLine, *src, s;
-    int        dstStride, srcStride;
-    uint8_t    a;
-    uint16_t   w;
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t    *dst_line, *dst;
+    uint32_t    *src_line, *src, s;
+    int dst_stride, src_stride;
+    uint8_t a;
+    int32_t w;
 
-    fbComposeGetStart (dst_image, dest_x, dest_y, uint32_t, dstStride, dstLine, 1);
-    fbComposeGetStart (src_image, src_x, src_y, uint32_t, srcStride, srcLine, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
 
     while (height--)
     {
-       dst = dstLine;
-       dstLine += dstStride;
-       src = srcLine;
-       srcLine += srcStride;
+       dst = dst_line;
+       dst_line += dst_stride;
+       src = src_line;
+       src_line += src_stride;
        w = width;
 
        while (w--)
@@ -652,43 +618,60 @@ fast_composite_over_8888_8888 (pixman_implementation_t *imp,
            if (a == 0xff)
                *dst = s;
            else if (s)
-               *dst = fbOver (s, *dst);
+               *dst = over (s, *dst);
            dst++;
        }
     }
 }
 
 static void
-fast_CompositeSrc_8888_0888 (pixman_implementation_t *imp,
-                         pixman_op_t op,
-                        pixman_image_t * src_image,
-                        pixman_image_t * mask_image,
-                        pixman_image_t * dst_image,
-                        int32_t      src_x,
-                        int32_t      src_y,
-                        int32_t      mask_x,
-                        int32_t      mask_y,
-                        int32_t      dest_x,
-                        int32_t      dest_y,
-                        int32_t     width,
-                        int32_t     height)
+fast_composite_src_x888_8888 (pixman_implementation_t *imp,
+                             pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t    *dst_line, *dst;
+    uint32_t    *src_line, *src;
+    int dst_stride, src_stride;
+    int32_t w;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       src = src_line;
+       src_line += src_stride;
+       w = width;
+
+       while (w--)
+           *dst++ = (*src++) | 0xff000000;
+    }
+}
+
+#if 0
+static void
+fast_composite_over_8888_0888 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
 {
-    uint8_t    *dstLine, *dst;
-    uint32_t   d;
-    uint32_t   *srcLine, *src, s;
-    uint8_t    a;
-    int        dstStride, srcStride;
-    uint16_t   w;
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t     *dst_line, *dst;
+    uint32_t d;
+    uint32_t    *src_line, *src, s;
+    uint8_t a;
+    int dst_stride, src_stride;
+    int32_t w;
 
-    fbComposeGetStart (dst_image, dest_x, dest_y, uint8_t, dstStride, dstLine, 3);
-    fbComposeGetStart (src_image, src_x, src_y, uint32_t, srcStride, srcLine, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 3);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
 
     while (height--)
     {
-       dst = dstLine;
-       dstLine += dstStride;
-       src = srcLine;
-       srcLine += srcStride;
+       dst = dst_line;
+       dst_line += dst_stride;
+       src = src_line;
+       src_line += src_stride;
        w = width;
 
        while (w--)
@@ -700,46 +683,37 @@ fast_CompositeSrc_8888_0888 (pixman_implementation_t *imp,
                if (a == 0xff)
                    d = s;
                else
-                   d = fbOver (s, fetch_24(dst));
+                   d = over (s, fetch_24 (dst));
 
-               store_24(dst, d);
+               store_24 (dst, d);
            }
            dst += 3;
        }
     }
 }
+#endif
 
 static void
 fast_composite_over_8888_0565 (pixman_implementation_t *imp,
-                         pixman_op_t op,
-                        pixman_image_t * src_image,
-                        pixman_image_t * mask_image,
-                        pixman_image_t * dst_image,
-                        int32_t      src_x,
-                        int32_t      src_y,
-                        int32_t      mask_x,
-                        int32_t      mask_y,
-                        int32_t      dest_x,
-                        int32_t      dest_y,
-                        int32_t     width,
-                        int32_t     height)
+                               pixman_composite_info_t *info)
 {
-    uint16_t   *dstLine, *dst;
-    uint32_t   d;
-    uint32_t   *srcLine, *src, s;
-    uint8_t    a;
-    int        dstStride, srcStride;
-    uint16_t   w;
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint16_t    *dst_line, *dst;
+    uint32_t d;
+    uint32_t    *src_line, *src, s;
+    uint8_t a;
+    int dst_stride, src_stride;
+    int32_t w;
 
-    fbComposeGetStart (src_image, src_x, src_y, uint32_t, srcStride, srcLine, 1);
-    fbComposeGetStart (dst_image, dest_x, dest_y, uint16_t, dstStride, dstLine, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
 
     while (height--)
     {
-       dst = dstLine;
-       dstLine += dstStride;
-       src = srcLine;
-       srcLine += srcStride;
+       dst = dst_line;
+       dst_line += dst_stride;
+       src = src_line;
+       src_line += src_stride;
        w = width;
 
        while (w--)
@@ -749,13 +723,15 @@ fast_composite_over_8888_0565 (pixman_implementation_t *imp,
            if (s)
            {
                if (a == 0xff)
+               {
                    d = s;
+               }
                else
                {
                    d = *dst;
-                   d = fbOver (s, CONVERT_0565_TO_0888(d));
+                   d = over (s, CONVERT_0565_TO_0888 (d));
                }
-               *dst = CONVERT_8888_TO_0565(d);
+               *dst = CONVERT_8888_TO_0565 (d);
            }
            dst++;
        }
@@ -763,76 +739,56 @@ fast_composite_over_8888_0565 (pixman_implementation_t *imp,
 }
 
 static void
-fast_CompositeSrc_x888_0565 (pixman_implementation_t *imp,
-                         pixman_op_t op,
-                          pixman_image_t * src_image,
-                          pixman_image_t * mask_image,
-                          pixman_image_t * dst_image,
-                          int32_t      src_x,
-                          int32_t      src_y,
-                          int32_t      mask_x,
-                          int32_t      mask_y,
-                          int32_t      dest_x,
-                          int32_t      dest_y,
-                          int32_t     width,
-                          int32_t     height)
+fast_composite_src_x888_0565 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
 {
-    uint16_t   *dstLine, *dst;
-    uint32_t   *srcLine, *src, s;
-    int        dstStride, srcStride;
-    uint16_t   w;
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint16_t    *dst_line, *dst;
+    uint32_t    *src_line, *src, s;
+    int dst_stride, src_stride;
+    int32_t w;
 
-    fbComposeGetStart (src_image, src_x, src_y, uint32_t, srcStride, srcLine, 1);
-    fbComposeGetStart (dst_image, dest_x, dest_y, uint16_t, dstStride, dstLine, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
 
     while (height--)
     {
-       dst = dstLine;
-       dstLine += dstStride;
-       src = srcLine;
-       srcLine += srcStride;
+       dst = dst_line;
+       dst_line += dst_stride;
+       src = src_line;
+       src_line += src_stride;
        w = width;
 
        while (w--)
        {
            s = *src++;
-           *dst = CONVERT_8888_TO_0565(s);
+           *dst = CONVERT_8888_TO_0565 (s);
            dst++;
        }
     }
 }
 
 static void
-fast_CompositeAdd_8000_8000 (pixman_implementation_t *imp,
-                            pixman_op_t        op,
-                            pixman_image_t * src_image,
-                            pixman_image_t * mask_image,
-                            pixman_image_t * dst_image,
-                            int32_t      src_x,
-                            int32_t      src_y,
-                            int32_t      mask_x,
-                            int32_t      mask_y,
-                            int32_t      dest_x,
-                            int32_t      dest_y,
-                            int32_t     width,
-                            int32_t     height)
+fast_composite_add_8_8 (pixman_implementation_t *imp,
+                       pixman_composite_info_t *info)
 {
-    uint8_t    *dstLine, *dst;
-    uint8_t    *srcLine, *src;
-    int        dstStride, srcStride;
-    uint16_t   w;
-    uint8_t    s, d;
-    uint16_t   t;
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t     *dst_line, *dst;
+    uint8_t     *src_line, *src;
+    int dst_stride, src_stride;
+    int32_t w;
+    uint8_t s, d;
+    uint16_t t;
 
-    fbComposeGetStart (src_image, src_x, src_y, uint8_t, srcStride, srcLine, 1);
-    fbComposeGetStart (dst_image, dest_x, dest_y, uint8_t, dstStride, dstLine, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
 
     while (height--)
     {
-       dst = dstLine;
-       dstLine += dstStride;
-       src = srcLine;
-       srcLine += srcStride;
+       dst = dst_line;
+       dst_line += dst_stride;
+       src = src_line;
+       src_line += src_stride;
        w = width;
 
        while (w--)
@@ -854,35 +810,25 @@ fast_CompositeAdd_8000_8000 (pixman_implementation_t *imp,
 }
 
 static void
-fast_CompositeAdd_8888_8888 (pixman_implementation_t *imp,
-                            pixman_op_t        op,
-                            pixman_image_t * src_image,
-                            pixman_image_t * mask_image,
-                            pixman_image_t * dst_image,
-                            int32_t      src_x,
-                            int32_t      src_y,
-                            int32_t      mask_x,
-                            int32_t      mask_y,
-                            int32_t      dest_x,
-                            int32_t      dest_y,
-                            int32_t     width,
-                            int32_t     height)
+fast_composite_add_8888_8888 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
 {
-    uint32_t   *dstLine, *dst;
-    uint32_t   *srcLine, *src;
-    int        dstStride, srcStride;
-    uint16_t   w;
-    uint32_t   s, d;
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t    *dst_line, *dst;
+    uint32_t    *src_line, *src;
+    int dst_stride, src_stride;
+    int32_t w;
+    uint32_t s, d;
 
-    fbComposeGetStart (src_image, src_x, src_y, uint32_t, srcStride, srcLine, 1);
-    fbComposeGetStart (dst_image, dest_x, dest_y, uint32_t, dstStride, dstLine, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
 
     while (height--)
     {
-       dst = dstLine;
-       dstLine += dstStride;
-       src = srcLine;
-       srcLine += srcStride;
+       dst = dst_line;
+       dst_line += dst_stride;
+       src = src_line;
+       src_line += src_stride;
        w = width;
 
        while (w--)
@@ -894,7 +840,7 @@ fast_CompositeAdd_8888_8888 (pixman_implementation_t *imp,
                {
                    d = *dst;
                    if (d)
-                       FbByteAdd(s,d);
+                       UN8x4_ADD_UN8x4 (s, d);
                }
                *dst = s;
            }
@@ -904,342 +850,1030 @@ fast_CompositeAdd_8888_8888 (pixman_implementation_t *imp,
 }
 
 static void
-fast_CompositeAdd_8888_8_8 (pixman_implementation_t *imp,
-                           pixman_op_t op,
-                           pixman_image_t * src_image,
-                           pixman_image_t * mask_image,
-                           pixman_image_t * dst_image,
-                           int32_t      src_x,
-                           int32_t      src_y,
-                           int32_t      mask_x,
-                           int32_t      mask_y,
-                           int32_t      dest_x,
-                           int32_t      dest_y,
-                           int32_t     width,
-                           int32_t     height)
+fast_composite_add_n_8_8 (pixman_implementation_t *imp,
+                         pixman_composite_info_t *info)
 {
-    uint8_t    *dstLine, *dst;
-    uint8_t    *maskLine, *mask;
-    int        dstStride, maskStride;
-    uint16_t   w;
-    uint32_t   src;
-    uint8_t    sa;
-
-    fbComposeGetStart (dst_image, dest_x, dest_y, uint8_t, dstStride, dstLine, 1);
-    fbComposeGetStart (mask_image, mask_x, mask_y, uint8_t, maskStride, maskLine, 1);
-    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t     *dst_line, *dst;
+    uint8_t     *mask_line, *mask;
+    int dst_stride, mask_stride;
+    int32_t w;
+    uint32_t src;
+    uint8_t sa;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
     sa = (src >> 24);
 
     while (height--)
     {
-       dst = dstLine;
-       dstLine += dstStride;
-       mask = maskLine;
-       maskLine += maskStride;
+       dst = dst_line;
+       dst_line += dst_stride;
+       mask = mask_line;
+       mask_line += mask_stride;
        w = width;
 
        while (w--)
        {
-           uint16_t    tmp;
-           uint16_t    a;
-           uint32_t    m, d;
-           uint32_t    r;
+           uint16_t tmp;
+           uint16_t a;
+           uint32_t m, d;
+           uint32_t r;
 
            a = *mask++;
            d = *dst;
 
-           m = IntMult (sa, a, tmp);
-           r = IntAdd (m, d, tmp);
+           m = MUL_UN8 (sa, a, tmp);
+           r = ADD_UN8 (m, d, tmp);
 
            *dst++ = r;
        }
     }
 }
 
+#ifdef WORDS_BIGENDIAN  /* bit index 0 maps to the most significant bit */
+#define CREATE_BITMASK(n) (0x80000000 >> (n))
+#define UPDATE_BITMASK(n) ((n) >> 1)
+#else                   /* bit index 0 maps to the least significant bit */
+#define CREATE_BITMASK(n) (1U << (n)) /* unsigned: 1 << 31 overflows int */
+#define UPDATE_BITMASK(n) ((n) << 1)
+#endif
+/* TEST_BIT/SET_BIT address bit n of a row of 32-bit words starting at p. */
+#define TEST_BIT(p, n)                                 \
+    (*((p) + ((n) >> 5)) & CREATE_BITMASK ((n) & 31))
+#define SET_BIT(p, n)                                                  \
+    do { *((p) + ((n) >> 5)) |= CREATE_BITMASK ((n) & 31); } while (0)
+
+static void
+fast_composite_add_1000_1000 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t     *dst_line, *dst;
+    uint32_t     *src_line, *src;
+    int           dst_stride, src_stride;
+    int32_t       w;
+    /* Row pointers are taken at x = 0; src_x / dest_x are applied per bit. */
+    PIXMAN_IMAGE_GET_LINE (src_image, 0, src_y, uint32_t,
+                           src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, 0, dest_y, uint32_t,
+                           dst_stride, dst_line, 1);
+    /* Set every dest bit whose source bit is set; other dest bits are kept. */
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       src = src_line;
+       src_line += src_stride;
+       w = width;
+
+       while (w--)
+       {
+           /* w counts down, so the bits of a row are visited right to left.
+            * TODO: improve performance by processing uint32_t data instead
+            *       of individual bits
+            */
+           if (TEST_BIT (src, src_x + w))
+               SET_BIT (dst, dest_x + w);
+       }
+    }
+}
+
+static void
+fast_composite_over_n_1_8888 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t     src, srca;
+    uint32_t    *dst, *dst_line;
+    uint32_t    *mask, *mask_line;
+    int          mask_stride, dst_stride;
+    uint32_t     bitcache, bitmask;
+    int32_t      w;
+    /* Apply a solid color to 32-bit pixels wherever the 1-bpp mask bit is set. */
+    if (width <= 0)
+       return;  /* rows below read a mask word before testing w */
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+    srca = src >> 24;
+    if (src == 0)
+       return;  /* all-zero source pixel: draw nothing */
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t,
+                           dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, 0, mask_y, uint32_t,
+                           mask_stride, mask_line, 1);
+    mask_line += mask_x >> 5;  /* skip the whole 32-bit words left of mask_x */
+    /* Alpha 0xff: store src directly.  Otherwise blend with over (). */
+    if (srca == 0xff)
+    {
+       while (height--)
+       {
+           dst = dst_line;
+           dst_line += dst_stride;
+           mask = mask_line;
+           mask_line += mask_stride;
+           w = width;
+           /* Start with the word holding mask_x and the bit for mask_x. */
+           bitcache = *mask++;
+           bitmask = CREATE_BITMASK (mask_x & 31);
+
+           while (w--)
+           {
+               if (bitmask == 0)  /* shifted past the last bit: next word */
+               {
+                   bitcache = *mask++;
+                   bitmask = CREATE_BITMASK (0);
+               }
+               if (bitcache & bitmask)
+                   *dst = src;
+               bitmask = UPDATE_BITMASK (bitmask);
+               dst++;
+           }
+       }
+    }
+    else
+    {
+       while (height--)
+       {
+           dst = dst_line;
+           dst_line += dst_stride;
+           mask = mask_line;
+           mask_line += mask_stride;
+           w = width;
+           /* Same mask walk as the opaque branch; only the store differs. */
+           bitcache = *mask++;
+           bitmask = CREATE_BITMASK (mask_x & 31);
+
+           while (w--)
+           {
+               if (bitmask == 0)  /* shifted past the last bit: next word */
+               {
+                   bitcache = *mask++;
+                   bitmask = CREATE_BITMASK (0);
+               }
+               if (bitcache & bitmask)
+                   *dst = over (src, *dst);
+               bitmask = UPDATE_BITMASK (bitmask);
+               dst++;
+           }
+       }
+    }
+}
+
+static void
+fast_composite_over_n_1_0565 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t     src, srca;
+    uint16_t    *dst, *dst_line;
+    uint32_t    *mask, *mask_line;
+    int          mask_stride, dst_stride;
+    uint32_t     bitcache, bitmask;
+    int32_t      w;
+    uint32_t     d;
+    uint16_t     src565;
+    /* Apply a solid color to 16-bit pixels wherever the 1-bpp mask bit is set. */
+    if (width <= 0)
+       return;  /* rows below read a mask word before testing w */
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+    srca = src >> 24;
+    if (src == 0)
+       return;  /* all-zero source pixel: draw nothing */
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t,
+                           dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, 0, mask_y, uint32_t,
+                           mask_stride, mask_line, 1);
+    mask_line += mask_x >> 5;  /* skip the whole 32-bit words left of mask_x */
+    /* Alpha 0xff: store the converted color.  Otherwise blend per pixel. */
+    if (srca == 0xff)
+    {
+       src565 = CONVERT_8888_TO_0565 (src);  /* converted once, before the loops */
+       while (height--)
+       {
+           dst = dst_line;
+           dst_line += dst_stride;
+           mask = mask_line;
+           mask_line += mask_stride;
+           w = width;
+           /* Start with the word holding mask_x and the bit for mask_x. */
+           bitcache = *mask++;
+           bitmask = CREATE_BITMASK (mask_x & 31);
+
+           while (w--)
+           {
+               if (bitmask == 0)  /* shifted past the last bit: next word */
+               {
+                   bitcache = *mask++;
+                   bitmask = CREATE_BITMASK (0);
+               }
+               if (bitcache & bitmask)
+                   *dst = src565;
+               bitmask = UPDATE_BITMASK (bitmask);
+               dst++;
+           }
+       }
+    }
+    else
+    {
+       while (height--)
+       {
+           dst = dst_line;
+           dst_line += dst_stride;
+           mask = mask_line;
+           mask_line += mask_stride;
+           w = width;
+           /* Same mask walk as the opaque branch; only the store differs. */
+           bitcache = *mask++;
+           bitmask = CREATE_BITMASK (mask_x & 31);
+
+           while (w--)
+           {
+               if (bitmask == 0)  /* shifted past the last bit: next word */
+               {
+                   bitcache = *mask++;
+                   bitmask = CREATE_BITMASK (0);
+               }
+               if (bitcache & bitmask)
+               {   /* expand dest with CONVERT_0565_TO_0888, blend, pack back */
+                   d = over (src, CONVERT_0565_TO_0888 (*dst));
+                   *dst = CONVERT_8888_TO_0565 (d);
+               }
+               bitmask = UPDATE_BITMASK (bitmask);
+               dst++;
+           }
+       }
+    }
+}
+
 /*
  * Simple bitblt
  */
 
 static void
-fast_CompositeSolidFill (pixman_implementation_t *imp,
-                     pixman_op_t op,
-                     pixman_image_t * src_image,
-                     pixman_image_t * mask_image,
-                     pixman_image_t * dst_image,
-                     int32_t      src_x,
-                     int32_t      src_y,
-                     int32_t      mask_x,
-                     int32_t      mask_y,
-                     int32_t      dest_x,
-                     int32_t      dest_y,
-                     int32_t     width,
-                     int32_t     height)
+fast_composite_solid_fill (pixman_implementation_t *imp,
+                           pixman_composite_info_t *info)
 {
-    uint32_t   src;
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src;
 
-    src = _pixman_image_get_solid(src_image, dst_image->bits.format);
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
 
-    if (dst_image->bits.format == PIXMAN_a8)
+    if (dest_image->bits.format == PIXMAN_a1)
+    {
+       src = src >> 31;
+    }
+    else if (dest_image->bits.format == PIXMAN_a8)
+    {
        src = src >> 24;
-    else if (dst_image->bits.format == PIXMAN_r5g6b5 ||
-            dst_image->bits.format == PIXMAN_b5g6r5)
+    }
+    else if (dest_image->bits.format == PIXMAN_r5g6b5 ||
+             dest_image->bits.format == PIXMAN_b5g6r5)
+    {
        src = CONVERT_8888_TO_0565 (src);
+    }
 
-    pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
-                PIXMAN_FORMAT_BPP (dst_image->bits.format),
-                dest_x, dest_y,
-                width, height,
-                src);
+    pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
+                 PIXMAN_FORMAT_BPP (dest_image->bits.format),
+                 dest_x, dest_y,
+                 width, height,
+                 src);
 }
 
 static void
-fast_CompositeSrc_8888_x888 (pixman_implementation_t *imp,
-                         pixman_op_t op,
-                         pixman_image_t * src_image,
-                         pixman_image_t * mask_image,
-                         pixman_image_t * dst_image,
-                         int32_t      src_x,
-                         int32_t      src_y,
-                         int32_t      mask_x,
-                         int32_t      mask_y,
-                         int32_t      dest_x,
-                         int32_t      dest_y,
-                         int32_t     width,
-                         int32_t     height)
+fast_composite_src_memcpy (pixman_implementation_t *imp,
+                          pixman_composite_info_t *info)
 {
-    uint32_t   *dst;
-    uint32_t    *src;
-    int                 dstStride, srcStride;
-    uint32_t    n_bytes = width * sizeof (uint32_t);
+    PIXMAN_COMPOSITE_ARGS (info);
+    int bpp = PIXMAN_FORMAT_BPP (dest_image->bits.format) / 8;
+    uint32_t n_bytes = width * bpp;
+    int dst_stride, src_stride;
+    uint8_t    *dst;
+    uint8_t    *src;
 
-    fbComposeGetStart (src_image, src_x, src_y, uint32_t, srcStride, src, 1);
-    fbComposeGetStart (dst_image, dest_x, dest_y, uint32_t, dstStride, dst, 1);
+    src_stride = src_image->bits.rowstride * 4;
+    dst_stride = dest_image->bits.rowstride * 4;
+
+    src = (uint8_t *)src_image->bits.bits + src_y * src_stride + src_x * bpp;
+    dst = (uint8_t *)dest_image->bits.bits + dest_y * dst_stride + dest_x * bpp;
 
     while (height--)
     {
        memcpy (dst, src, n_bytes);
 
-       dst += dstStride;
-       src += srcStride;
+       dst += dst_stride;
+       src += src_stride;
     }
 }
 
-static const pixman_fast_path_t c_fast_paths[] =
+FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, SRC, COVER)
+FAST_NEAREST (8888_8888_none, 8888, 8888, uint32_t, uint32_t, SRC, NONE)
+FAST_NEAREST (8888_8888_pad, 8888, 8888, uint32_t, uint32_t, SRC, PAD)
+FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, SRC, NORMAL)
+FAST_NEAREST (x888_8888_cover, x888, 8888, uint32_t, uint32_t, SRC, COVER)
+FAST_NEAREST (x888_8888_pad, x888, 8888, uint32_t, uint32_t, SRC, PAD)
+FAST_NEAREST (x888_8888_normal, x888, 8888, uint32_t, uint32_t, SRC, NORMAL)
+FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, OVER, COVER)
+FAST_NEAREST (8888_8888_none, 8888, 8888, uint32_t, uint32_t, OVER, NONE)
+FAST_NEAREST (8888_8888_pad, 8888, 8888, uint32_t, uint32_t, OVER, PAD)
+FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, OVER, NORMAL)
+FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, SRC, COVER)
+FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, SRC, NONE)
+FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, SRC, PAD)
+FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, SRC, NORMAL)
+FAST_NEAREST (565_565_normal, 0565, 0565, uint16_t, uint16_t, SRC, NORMAL)
+FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, OVER, COVER)
+FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, OVER, NONE)
+FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, OVER, PAD)
+FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, OVER, NORMAL)
+
+/* Use more unrolling for src_0565_0565 because it is typically CPU bound */
+static force_inline void
+scaled_nearest_scanline_565_565_SRC (uint16_t *       dst,
+                                    const uint16_t * src,
+                                    int32_t          w,
+                                    pixman_fixed_t   vx,
+                                    pixman_fixed_t   unit_x,
+                                    pixman_fixed_t   max_vx,
+                                    pixman_bool_t    fully_transparent_src)
 {
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_r5g6b5,   fast_CompositeOver_n_8_0565, 0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_b5g6r5,   fast_CompositeOver_n_8_0565, 0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_r8g8b8,   fast_CompositeOver_n_8_0888, 0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_b8g8r8,   fast_CompositeOver_n_8_0888, 0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8r8g8b8, fast_CompositeOver_n_8_8888, 0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8r8g8b8, fast_CompositeOver_n_8_8888, 0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8b8g8r8, fast_CompositeOver_n_8_8888, 0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8b8g8r8, fast_CompositeOver_n_8_8888, 0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, fast_CompositeOver_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, fast_CompositeOver_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8r8g8b8, PIXMAN_r5g6b5,   fast_CompositeOver_n_8888_0565_ca, NEED_COMPONENT_ALPHA },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, fast_CompositeOver_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, fast_CompositeOver_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8b8g8r8, PIXMAN_b5g6r5,   fast_CompositeOver_n_8888_0565_ca, NEED_COMPONENT_ALPHA },
-    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8,      PIXMAN_x8r8g8b8, fast_CompositeOver_x888_8_8888,       0 },
-    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8,      PIXMAN_a8r8g8b8, fast_CompositeOver_x888_8_8888,       0 },
-    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8,      PIXMAN_x8b8g8r8, fast_CompositeOver_x888_8_8888,       0 },
-    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8,      PIXMAN_a8b8g8r8, fast_CompositeOver_x888_8_8888,       0 },
-    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_a8r8g8b8, fast_composite_over_8888_8888,           0 },
-    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,    PIXMAN_x8r8g8b8, fast_composite_over_8888_8888,    0 },
-    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,    PIXMAN_r5g6b5,   fast_composite_over_8888_0565,    0 },
-    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,    PIXMAN_a8b8g8r8, fast_composite_over_8888_8888,    0 },
-    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,    PIXMAN_x8b8g8r8, fast_composite_over_8888_8888,    0 },
-    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_b5g6r5,   fast_composite_over_8888_0565,           0 },
-    { PIXMAN_OP_ADD, PIXMAN_a8r8g8b8,  PIXMAN_null,    PIXMAN_a8r8g8b8, fast_CompositeAdd_8888_8888,   0 },
-    { PIXMAN_OP_ADD, PIXMAN_a8b8g8r8,  PIXMAN_null,    PIXMAN_a8b8g8r8, fast_CompositeAdd_8888_8888,   0 },
-    { PIXMAN_OP_ADD, PIXMAN_a8,        PIXMAN_null,     PIXMAN_a8,       fast_CompositeAdd_8000_8000,   0 },
-    { PIXMAN_OP_ADD, PIXMAN_solid,     PIXMAN_a8,       PIXMAN_a8,       fast_CompositeAdd_8888_8_8,    0 },
-    { PIXMAN_OP_SRC, PIXMAN_solid,     PIXMAN_null,     PIXMAN_a8r8g8b8, fast_CompositeSolidFill, 0 },
-    { PIXMAN_OP_SRC, PIXMAN_solid,     PIXMAN_null,     PIXMAN_x8r8g8b8, fast_CompositeSolidFill, 0 },
-    { PIXMAN_OP_SRC, PIXMAN_solid,     PIXMAN_null,     PIXMAN_a8b8g8r8, fast_CompositeSolidFill, 0 },
-    { PIXMAN_OP_SRC, PIXMAN_solid,     PIXMAN_null,     PIXMAN_x8b8g8r8, fast_CompositeSolidFill, 0 },
-    { PIXMAN_OP_SRC, PIXMAN_solid,     PIXMAN_null,     PIXMAN_a8,       fast_CompositeSolidFill, 0 },
-    { PIXMAN_OP_SRC, PIXMAN_solid,     PIXMAN_null,     PIXMAN_r5g6b5,   fast_CompositeSolidFill, 0 },
-    { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8,  PIXMAN_null,     PIXMAN_x8r8g8b8, fast_CompositeSrc_8888_x888, 0 },
-    { PIXMAN_OP_SRC, PIXMAN_x8r8g8b8,  PIXMAN_null,     PIXMAN_x8r8g8b8, fast_CompositeSrc_8888_x888, 0 },
-    { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8,  PIXMAN_null,     PIXMAN_x8b8g8r8, fast_CompositeSrc_8888_x888, 0 },
-    { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8,  PIXMAN_null,     PIXMAN_x8b8g8r8, fast_CompositeSrc_8888_x888, 0 },
-    { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8,  PIXMAN_null,     PIXMAN_r5g6b5,   fast_CompositeSrc_x888_0565, 0 },
-    { PIXMAN_OP_SRC, PIXMAN_x8r8g8b8,  PIXMAN_null,     PIXMAN_r5g6b5,   fast_CompositeSrc_x888_0565, 0 },
-    { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8,  PIXMAN_null,     PIXMAN_b5g6r5,   fast_CompositeSrc_x888_0565, 0 },
-    { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8,  PIXMAN_null,     PIXMAN_b5g6r5,   fast_CompositeSrc_x888_0565, 0 },
-    { PIXMAN_OP_IN,  PIXMAN_a8,        PIXMAN_null,     PIXMAN_a8,       fast_CompositeIn_8_8,   0 },
-    { PIXMAN_OP_IN,  PIXMAN_solid,     PIXMAN_a8,      PIXMAN_a8,       fast_CompositeIn_n_8_8, 0 },
-    { PIXMAN_OP_NONE },
-};
+    uint16_t tmp1, tmp2, tmp3, tmp4;
+    while ((w -= 4) >= 0)
+    {
+       tmp1 = src[pixman_fixed_to_int (vx)];
+       vx += unit_x;
+       tmp2 = src[pixman_fixed_to_int (vx)];
+       vx += unit_x;
+       tmp3 = src[pixman_fixed_to_int (vx)];
+       vx += unit_x;
+       tmp4 = src[pixman_fixed_to_int (vx)];
+       vx += unit_x;
+       *dst++ = tmp1;
+       *dst++ = tmp2;
+       *dst++ = tmp3;
+       *dst++ = tmp4;
+    }
+    if (w & 2)
+    {
+       tmp1 = src[pixman_fixed_to_int (vx)];
+       vx += unit_x;
+       tmp2 = src[pixman_fixed_to_int (vx)];
+       vx += unit_x;
+       *dst++ = tmp1;
+       *dst++ = tmp2;
+    }
+    if (w & 1)
+       *dst++ = src[pixman_fixed_to_int (vx)];
+}
+
+FAST_NEAREST_MAINLOOP (565_565_cover_SRC,
+                      scaled_nearest_scanline_565_565_SRC,
+                      uint16_t, uint16_t, COVER)
+FAST_NEAREST_MAINLOOP (565_565_none_SRC,
+                      scaled_nearest_scanline_565_565_SRC,
+                      uint16_t, uint16_t, NONE)
+FAST_NEAREST_MAINLOOP (565_565_pad_SRC,
+                      scaled_nearest_scanline_565_565_SRC,
+                      uint16_t, uint16_t, PAD)
+
+static force_inline uint32_t
+fetch_nearest (pixman_repeat_t src_repeat,
+              pixman_format_code_t format,
+              uint32_t *src, int x, int src_width)
+{
+    uint32_t pixel;
+
+    /* repeat () receives &x and may rewrite it; on a false result return 0. */
+    if (!repeat (src_repeat, &x, src_width))
+       return 0;
+
+    /* For x8r8g8b8 the top byte is forced to 0xff before returning. */
+    pixel = src[x];
+    if (format == PIXMAN_x8r8g8b8)
+       pixel |= 0xff000000;
+    return pixel;
+}
+
+static force_inline void
+combine_over (uint32_t s, uint32_t *dst)
+{
+    uint8_t inv_alpha;
+
+    if (!s)
+       return;             /* all-zero source: *dst is left untouched */
+    inv_alpha = 0xff - (s >> 24);
+    if (!inv_alpha)
+       *dst = s;           /* source alpha is 0xff: plain copy */
+    else
+       UN8x4_MUL_UN8_ADD_UN8x4 (*dst, inv_alpha, s);
+}
+
+static force_inline void
+combine_src (uint32_t s, uint32_t *dst)
+{
+    *dst = s;  /* plain overwrite: the previous *dst value is ignored */
+}
 
 static void
-fast_CompositeSrcScaleNearest (pixman_implementation_t *imp,
-                           pixman_op_t     op,
-                           pixman_image_t *src_image,
-                           pixman_image_t *mask_image,
-                           pixman_image_t *dst_image,
-                           int32_t         src_x,
-                           int32_t         src_y,
-                           int32_t         mask_x,
-                           int32_t         mask_y,
-                           int32_t         dest_x,
-                           int32_t         dest_y,
-                           int32_t        width,
-                           int32_t        height)
+fast_composite_scaled_nearest (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
 {
-    uint32_t       *dst;
-    uint32_t       *src;
-    int             dstStride, srcStride;
-    int             i, j;
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t       *dst_line;
+    uint32_t       *src_line;
+    int             dst_stride, src_stride;
+    int                    src_width, src_height;
+    pixman_repeat_t src_repeat;
+    pixman_fixed_t unit_x, unit_y;
+    pixman_format_code_t src_format;
     pixman_vector_t v;
-    
-    fbComposeGetStart (dst_image, dest_x, dest_y, uint32_t, dstStride, dst, 1);
+    pixman_fixed_t vy;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
     /* pass in 0 instead of src_x and src_y because src_x and src_y need to be
-     * transformed from destination space to source space */
-    fbComposeGetStart (src_image, 0, 0, uint32_t, srcStride, src, 1);
-    
+     * transformed from destination space to source space
+     */
+    PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, uint32_t, src_stride, src_line, 1);
+
     /* reference point is the center of the pixel */
-    v.vector[0] = pixman_int_to_fixed(src_x) + pixman_fixed_1 / 2;
-    v.vector[1] = pixman_int_to_fixed(src_y) + pixman_fixed_1 / 2;
+    v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2;
     v.vector[2] = pixman_fixed_1;
-    
+
     if (!pixman_transform_point_3d (src_image->common.transform, &v))
-        return;
-    
+       return;
+
+    unit_x = src_image->common.transform->matrix[0][0];
+    unit_y = src_image->common.transform->matrix[1][1];
+
     /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */
     v.vector[0] -= pixman_fixed_e;
     v.vector[1] -= pixman_fixed_e;
-    
-    for (j = 0; j < height; j++) {
+
+    src_height = src_image->bits.height;
+    src_width = src_image->bits.width;
+    src_repeat = src_image->common.repeat;
+    src_format = src_image->bits.format;
+
+    vy = v.vector[1];
+    while (height--)
+    {
         pixman_fixed_t vx = v.vector[0];
-        pixman_fixed_t vy = v.vector[1];
-        for (i = 0; i < width; ++i) {
-            pixman_bool_t inside_bounds;
-            uint32_t result;
-            int x, y;
-            x = vx >> 16;
-            y = vy >> 16;
-           
-            /* apply the repeat function */
-            switch (src_image->common.repeat) {
-           case PIXMAN_REPEAT_NORMAL:
-               x = MOD (x, src_image->bits.width);
-               y = MOD (y, src_image->bits.height);
-               inside_bounds = TRUE;
-               break;
-               
-           case PIXMAN_REPEAT_PAD:
-               x = CLIP (x, 0, src_image->bits.width-1);
-               y = CLIP (y, 0, src_image->bits.height-1);
-               inside_bounds = TRUE;
-               break;
-               
-           case PIXMAN_REPEAT_REFLECT:
-               x = MOD (x, src_image->bits.width * 2);
-               if (x >= src_image->bits.width)
-                   x = src_image->bits.width * 2 - x - 1;
-               y = MOD (y, src_image->bits.height * 2);
-               if (y >= src_image->bits.height)
-                   y = src_image->bits.height * 2 - y - 1;
-               inside_bounds = TRUE;
-               break;
-               
-           case PIXMAN_REPEAT_NONE:
-           default:
-               inside_bounds = (x >= 0 && x < src_image->bits.width && y >= 0 && y < src_image->bits.height);
-               break;
-            }
-           
-            if (inside_bounds) {
-                //XXX: we should move this multiplication out of the loop
-                result = *(src + y * srcStride + x);
-            } else {
-                result = 0;
-            }
-           *(dst + i) = result;
-           
-            /* adjust the x location by a unit vector in the x direction:
-             * this is equivalent to transforming x+1 of the destination point to source space */
-            vx += src_image->common.transform->matrix[0][0];
-        }
+       int y = pixman_fixed_to_int (vy);
+       uint32_t *dst = dst_line;
+
+       dst_line += dst_stride;
+
         /* adjust the y location by a unit vector in the y direction
          * this is equivalent to transforming y+1 of the destination point to source space */
-        v.vector[1] += src_image->common.transform->matrix[1][1];
-        dst += dstStride;
+        vy += unit_y;
+
+       if (!repeat (src_repeat, &y, src_height))
+       {
+           if (op == PIXMAN_OP_SRC)
+               memset (dst, 0, sizeof (*dst) * width);
+       }
+       else
+       {
+           int w = width;
+
+           uint32_t *src = src_line + y * src_stride;
+
+           while (w >= 2)
+           {
+               uint32_t s1, s2;
+               int x1, x2;
+
+               x1 = pixman_fixed_to_int (vx);
+               vx += unit_x;
+
+               x2 = pixman_fixed_to_int (vx);
+               vx += unit_x;
+
+               w -= 2;
+
+               s1 = fetch_nearest (src_repeat, src_format, src, x1, src_width);
+               s2 = fetch_nearest (src_repeat, src_format, src, x2, src_width);
+
+               if (op == PIXMAN_OP_OVER)
+               {
+                   combine_over (s1, dst++);
+                   combine_over (s2, dst++);
+               }
+               else
+               {
+                   combine_src (s1, dst++);
+                   combine_src (s2, dst++);
+               }
+           }
+
+           while (w--)
+           {
+               uint32_t s;
+               int x;
+
+               x = pixman_fixed_to_int (vx);
+               vx += unit_x;
+
+               s = fetch_nearest (src_repeat, src_format, src, x, src_width);
+
+               if (op == PIXMAN_OP_OVER)
+                   combine_over (s, dst++);
+               else
+                   combine_src (s, dst++);
+           }
+       }
     }
 }
 
-static void
-fast_path_composite (pixman_implementation_t *imp,
-                    pixman_op_t     op,
-                    pixman_image_t *src,
-                    pixman_image_t *mask,
-                    pixman_image_t *dest,
-                    int32_t         src_x,
-                    int32_t         src_y,
-                    int32_t         mask_x,
-                    int32_t         mask_y,
-                    int32_t         dest_x,
-                    int32_t         dest_y,
-                    int32_t        width,
-                    int32_t        height)
+/*
+ * Byte granularity used by the rotation blitters below: they mask destination
+ * addresses with (CACHE_LINE_SIZE - 1) and derive TILE_SIZE from it.
+ * Presumably meant to match the CPU cache line size - confirm per target.
+ */
+#define CACHE_LINE_SIZE 64
+
+/*
+ * FAST_SIMPLE_ROTATE (suffix, pix_type) expands to static functions that copy
+ * pix_type pixels while turning the image by a quarter turn:
+ *   blt_rotated_{90,270}_trivial_<suffix>   - plain nested-loop copies
+ *   blt_rotated_{90,270}_<suffix>           - the same copy, split into stripes
+ *   fast_composite_rotate_{90,270}_<suffix> - composite entry points
+ */
+#define FAST_SIMPLE_ROTATE(suffix, pix_type)                                  \
+                                                                              \
+/*                                                                            \
+ * Fills a w x h destination block: destination pixel (x, y) is read from     \
+ * source row x, column (h - 1 - y).  Both strides are counted in pix_type    \
+ * elements, because they are added directly to pix_type pointers.            \
+ */                                                                           \
+static void                                                                   \
+blt_rotated_90_trivial_##suffix (pix_type       *dst,                         \
+                                int             dst_stride,                  \
+                                const pix_type *src,                         \
+                                int             src_stride,                  \
+                                int             w,                           \
+                                int             h)                           \
+{                                                                             \
+    int x, y;                                                                 \
+    for (y = 0; y < h; y++)                                                   \
+    {                                                                         \
+       /* start at the first source row, in source column (h - y - 1) */     \
+       const pix_type *s = src + (h - y - 1);                                \
+       pix_type *d = dst + dst_stride * y;                                   \
+       for (x = 0; x < w; x++)                                               \
+       {                                                                     \
+           /* walk down the source column while filling the dest row */      \
+           *d++ = *s;                                                        \
+           s += src_stride;                                                  \
+       }                                                                     \
+    }                                                                         \
+}                                                                             \
+                                                                              \
+/*                                                                            \
+ * Fills a w x h destination block: destination pixel (x, y) is read from     \
+ * source row (w - 1 - x), column y.  Both strides are counted in pix_type    \
+ * elements, because they are added directly to pix_type pointers.            \
+ */                                                                           \
+static void                                                                   \
+blt_rotated_270_trivial_##suffix (pix_type       *dst,                        \
+                                 int             dst_stride,                 \
+                                 const pix_type *src,                        \
+                                 int             src_stride,                 \
+                                 int             w,                          \
+                                 int             h)                          \
+{                                                                             \
+    int x, y;                                                                 \
+    for (y = 0; y < h; y++)                                                   \
+    {                                                                         \
+       /* start at source row (w - 1), in source column y */                 \
+       const pix_type *s = src + src_stride * (w - 1) + y;                   \
+       pix_type *d = dst + dst_stride * y;                                   \
+       for (x = 0; x < w; x++)                                               \
+       {                                                                     \
+           /* walk up the source column while filling the dest row */        \
+           *d++ = *s;                                                        \
+           s -= src_stride;                                                  \
+       }                                                                     \
+    }                                                                         \
+}                                                                             \
+                                                                              \
+/*                                                                            \
+ * Same copy as blt_rotated_90_trivial_<suffix> for a W x H destination,      \
+ * but performed stripe by stripe: an optional unaligned leading stripe,      \
+ * TILE_SIZE-wide stripes whose destination start is cache line aligned,      \
+ * and an optional unaligned trailing stripe.  W is a by-value parameter      \
+ * and is reduced locally as stripes are peeled off.                          \
+ */                                                                           \
+static void                                                                   \
+blt_rotated_90_##suffix (pix_type       *dst,                                 \
+                        int             dst_stride,                          \
+                        const pix_type *src,                                 \
+                        int             src_stride,                          \
+                        int             W,                                   \
+                        int             H)                                   \
+{                                                                             \
+    int x;                                                                    \
+    int leading_pixels = 0, trailing_pixels = 0;                              \
+    /* number of pix_type pixels that fit in CACHE_LINE_SIZE bytes */         \
+    const int TILE_SIZE = CACHE_LINE_SIZE / sizeof(pix_type);                 \
+                                                                              \
+    /*                                                                        \
+     * split processing into handling destination as TILE_SIZExH cache line   \
+     * aligned vertical stripes (optimistically assuming that destination     \
+     * stride is a multiple of cache line, if not - it will be just a bit     \
+     * slower)                                                                \
+     */                                                                       \
+                                                                              \
+    if ((uintptr_t)dst & (CACHE_LINE_SIZE - 1))                               \
+    {                                                                         \
+       /* pixels up to the next cache line boundary, capped at W */          \
+       leading_pixels = TILE_SIZE - (((uintptr_t)dst &                       \
+                           (CACHE_LINE_SIZE - 1)) / sizeof(pix_type));       \
+       if (leading_pixels > W)                                               \
+           leading_pixels = W;                                               \
+                                                                              \
+       /* unaligned leading part NxH (where N < TILE_SIZE) */                \
+       blt_rotated_90_trivial_##suffix (                                     \
+           dst,                                                              \
+           dst_stride,                                                       \
+           src,                                                              \
+           src_stride,                                                       \
+           leading_pixels,                                                   \
+           H);                                                               \
+                                                                             \
+       /* destination column x is fed from source row x, so skipping */      \
+       /* columns in dst means skipping the same number of src rows */       \
+       dst += leading_pixels;                                                \
+       src += leading_pixels * src_stride;                                   \
+       W -= leading_pixels;                                                  \
+    }                                                                         \
+                                                                              \
+    if ((uintptr_t)(dst + W) & (CACHE_LINE_SIZE - 1))                         \
+    {                                                                         \
+       /* pixels past the last cache line boundary, capped at W */           \
+       trailing_pixels = (((uintptr_t)(dst + W) &                            \
+                           (CACHE_LINE_SIZE - 1)) / sizeof(pix_type));       \
+       if (trailing_pixels > W)                                              \
+           trailing_pixels = W;                                              \
+       W -= trailing_pixels;                                                 \
+    }                                                                         \
+                                                                              \
+    /* NOTE(review): the loop always copies full TILE_SIZE stripes, so W */   \
+    /* is presumably a multiple of TILE_SIZE here (assumes dst is aligned */  \
+    /* to sizeof (pix_type)) - confirm */                                     \
+    for (x = 0; x < W; x += TILE_SIZE)                                        \
+    {                                                                         \
+       /* aligned middle part TILE_SIZExH */                                 \
+       blt_rotated_90_trivial_##suffix (                                     \
+           dst + x,                                                          \
+           dst_stride,                                                       \
+           src + src_stride * x,                                             \
+           src_stride,                                                       \
+           TILE_SIZE,                                                        \
+           H);                                                               \
+    }                                                                         \
+                                                                              \
+    if (trailing_pixels)                                                      \
+    {                                                                         \
+       /* unaligned trailing part NxH (where N < TILE_SIZE) */               \
+       blt_rotated_90_trivial_##suffix (                                     \
+           dst + W,                                                          \
+           dst_stride,                                                       \
+           src + W * src_stride,                                             \
+           src_stride,                                                       \
+           trailing_pixels,                                                  \
+           H);                                                               \
+    }                                                                         \
+}                                                                             \
+                                                                              \
+/*                                                                            \
+ * Same copy as blt_rotated_270_trivial_<suffix> for a W x H destination,     \
+ * but performed stripe by stripe: an optional unaligned leading stripe,      \
+ * TILE_SIZE-wide stripes whose destination start is cache line aligned,      \
+ * and an optional unaligned trailing stripe.  Destination column x is fed    \
+ * from source row (W - 1 - x), so stripes further right in the destination   \
+ * read lower-numbered source rows.  W is a by-value parameter and is         \
+ * reduced locally as stripes are peeled off.                                 \
+ */                                                                           \
+static void                                                                   \
+blt_rotated_270_##suffix (pix_type       *dst,                                \
+                         int             dst_stride,                         \
+                         const pix_type *src,                                \
+                         int             src_stride,                         \
+                         int             W,                                  \
+                         int             H)                                  \
+{                                                                             \
+    int x;                                                                    \
+    int leading_pixels = 0, trailing_pixels = 0;                              \
+    /* number of pix_type pixels that fit in CACHE_LINE_SIZE bytes */         \
+    const int TILE_SIZE = CACHE_LINE_SIZE / sizeof(pix_type);                 \
+                                                                              \
+    /*                                                                        \
+     * split processing into handling destination as TILE_SIZExH cache line   \
+     * aligned vertical stripes (optimistically assuming that destination     \
+     * stride is a multiple of cache line, if not - it will be just a bit     \
+     * slower)                                                                \
+     */                                                                       \
+                                                                              \
+    if ((uintptr_t)dst & (CACHE_LINE_SIZE - 1))                               \
+    {                                                                         \
+       /* pixels up to the next cache line boundary, capped at W */          \
+       leading_pixels = TILE_SIZE - (((uintptr_t)dst &                       \
+                           (CACHE_LINE_SIZE - 1)) / sizeof(pix_type));       \
+       if (leading_pixels > W)                                               \
+           leading_pixels = W;                                               \
+                                                                              \
+       /* unaligned leading part NxH (where N < TILE_SIZE) */                \
+       blt_rotated_270_trivial_##suffix (                                    \
+           dst,                                                              \
+           dst_stride,                                                       \
+           src + src_stride * (W - leading_pixels),                          \
+           src_stride,                                                       \
+           leading_pixels,                                                   \
+           H);                                                               \
+                                                                             \
+       /* the leading stripe used the last source rows, so src itself */     \
+       /* does not move; only the remaining width shrinks */                 \
+       dst += leading_pixels;                                                \
+       W -= leading_pixels;                                                  \
+    }                                                                         \
+                                                                              \
+    if ((uintptr_t)(dst + W) & (CACHE_LINE_SIZE - 1))                         \
+    {                                                                         \
+       /* pixels past the last cache line boundary, capped at W */           \
+       trailing_pixels = (((uintptr_t)(dst + W) &                            \
+                           (CACHE_LINE_SIZE - 1)) / sizeof(pix_type));       \
+       if (trailing_pixels > W)                                              \
+           trailing_pixels = W;                                              \
+       W -= trailing_pixels;                                                 \
+       /* the trailing stripe reads the first source rows; skip them so */   \
+       /* the middle stripes index relative to the rows after them */        \
+       src += trailing_pixels * src_stride;                                  \
+    }                                                                         \
+                                                                              \
+    /* NOTE(review): the loop always copies full TILE_SIZE stripes, so W */   \
+    /* is presumably a multiple of TILE_SIZE here (assumes dst is aligned */  \
+    /* to sizeof (pix_type)) - confirm */                                     \
+    for (x = 0; x < W; x += TILE_SIZE)                                        \
+    {                                                                         \
+       /* aligned middle part TILE_SIZExH */                                 \
+       blt_rotated_270_trivial_##suffix (                                    \
+           dst + x,                                                          \
+           dst_stride,                                                       \
+           src + src_stride * (W - x - TILE_SIZE),                           \
+           src_stride,                                                       \
+           TILE_SIZE,                                                        \
+           H);                                                               \
+    }                                                                         \
+                                                                              \
+    if (trailing_pixels)                                                      \
+    {                                                                         \
+       /* unaligned trailing part NxH (where N < TILE_SIZE) */               \
+       blt_rotated_270_trivial_##suffix (                                    \
+           dst + W,                                                          \
+           dst_stride,                                                       \
+           src - trailing_pixels * src_stride,                               \
+           src_stride,                                                       \
+           trailing_pixels,                                                  \
+           H);                                                               \
+    }                                                                         \
+}                                                                             \
+                                                                              \
+/*                                                                            \
+ * Composite entry point for the 90 variant.  PIXMAN_COMPOSITE_ARGS (info)    \
+ * is defined elsewhere; presumably it declares src_image, dest_image,        \
+ * src_x, src_y, dest_x, dest_y, width and height from info - confirm.        \
+ * imp is not used in this function.                                          \
+ */                                                                           \
+static void                                                                   \
+fast_composite_rotate_90_##suffix (pixman_implementation_t *imp,              \
+                                  pixman_composite_info_t *info)             \
+{                                                                            \
+    PIXMAN_COMPOSITE_ARGS (info);                                            \
+    pix_type       *dst_line;                                                \
+    pix_type       *src_line;                                                 \
+    int             dst_stride, src_stride;                                   \
+    int             src_x_t, src_y_t;                                         \
+                                                                              \
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, pix_type,              \
+                          dst_stride, dst_line, 1);                          \
+    /* source start coordinates: the translation terms matrix[0][2] and */    \
+    /* matrix[1][2], biased by one half minus pixman_fixed_e before the */    \
+    /* integer conversion, combined with -src_y (less height) and src_x */    \
+    src_x_t = -src_y + pixman_fixed_to_int (                                  \
+                               src_image->common.transform->matrix[0][2] +   \
+                               pixman_fixed_1 / 2 - pixman_fixed_e) - height;\
+    src_y_t = src_x + pixman_fixed_to_int (                                   \
+                               src_image->common.transform->matrix[1][2] +   \
+                               pixman_fixed_1 / 2 - pixman_fixed_e);         \
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x_t, src_y_t, pix_type,             \
+                          src_stride, src_line, 1);                          \
+    blt_rotated_90_##suffix (dst_line, dst_stride, src_line, src_stride,      \
+                            width, height);                                  \
+}                                                                             \
+                                                                              \
+/*                                                                            \
+ * Composite entry point for the 270 variant.  PIXMAN_COMPOSITE_ARGS (info)   \
+ * is defined elsewhere; presumably it declares src_image, dest_image,        \
+ * src_x, src_y, dest_x, dest_y, width and height from info - confirm.        \
+ * imp is not used in this function.                                          \
+ */                                                                           \
+static void                                                                   \
+fast_composite_rotate_270_##suffix (pixman_implementation_t *imp,             \
+                                   pixman_composite_info_t *info)            \
+{                                                                             \
+    PIXMAN_COMPOSITE_ARGS (info);                                            \
+    pix_type       *dst_line;                                                \
+    pix_type       *src_line;                                                 \
+    int             dst_stride, src_stride;                                   \
+    int             src_x_t, src_y_t;                                         \
+                                                                              \
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, pix_type,              \
+                          dst_stride, dst_line, 1);                          \
+    /* source start coordinates: the translation terms matrix[0][2] and */    \
+    /* matrix[1][2], biased by one half minus pixman_fixed_e before the */    \
+    /* integer conversion, combined with src_y and -src_x (less width) */     \
+    src_x_t = src_y + pixman_fixed_to_int (                                   \
+                               src_image->common.transform->matrix[0][2] +   \
+                               pixman_fixed_1 / 2 - pixman_fixed_e);         \
+    src_y_t = -src_x + pixman_fixed_to_int (                                  \
+                               src_image->common.transform->matrix[1][2] +   \
+                               pixman_fixed_1 / 2 - pixman_fixed_e) - width; \
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x_t, src_y_t, pix_type,             \
+                          src_stride, src_line, 1);                          \
+    blt_rotated_270_##suffix (dst_line, dst_stride, src_line, src_stride,     \
+                             width, height);                                 \
+}
+
+/* Instantiate the rotation functions for 8-, 16- and 32-bit pixel types;
+ * the suffixes (8, 565, 8888) become part of the generated function names. */
+FAST_SIMPLE_ROTATE (8, uint8_t)
+FAST_SIMPLE_ROTATE (565, uint16_t)
+FAST_SIMPLE_ROTATE (8888, uint32_t)
+
+/*
+ * Table of fast paths.  Each entry pairs an operator and source / mask /
+ * destination formats (plus flags) with the function that handles that
+ * combination; the last entry is { PIXMAN_OP_NONE }.
+ * PIXMAN_STD_FAST_PATH, PIXMAN_STD_FAST_PATH_CA and the SIMPLE_NEAREST_*
+ * macros are defined elsewhere; presumably their leading arguments follow
+ * the op, src, mask, dest order used in the function names - confirm.
+ */
+static const pixman_fast_path_t c_fast_paths[] =
+{
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, fast_composite_over_n_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, fast_composite_over_n_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, r8g8b8, fast_composite_over_n_8_0888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, b8g8r8, fast_composite_over_n_8_0888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, fast_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, fast_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, fast_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, fast_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a1, a8r8g8b8, fast_composite_over_n_1_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a1, x8r8g8b8, fast_composite_over_n_1_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a1, a8b8g8r8, fast_composite_over_n_1_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a1, x8b8g8r8, fast_composite_over_n_1_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a1, r5g6b5,   fast_composite_over_n_1_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a1, b5g6r5,   fast_composite_over_n_1_0565),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, fast_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, fast_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, fast_composite_over_n_8888_0565_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, fast_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, fast_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, fast_composite_over_n_8888_0565_ca),
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, fast_composite_over_x888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, fast_composite_over_x888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, fast_composite_over_x888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, fast_composite_over_x888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, fast_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, fast_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, fast_composite_over_8888_0565),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, fast_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, fast_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, fast_composite_over_8888_0565),
+    PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, fast_composite_add_8888_8888),
+    PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, fast_composite_add_8888_8888),
+    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, fast_composite_add_8_8),
+    PIXMAN_STD_FAST_PATH (ADD, a1, null, a1, fast_composite_add_1000_1000),
+    PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, fast_composite_add_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, fast_composite_add_n_8_8),
+    PIXMAN_STD_FAST_PATH (SRC, solid, null, a8r8g8b8, fast_composite_solid_fill),
+    PIXMAN_STD_FAST_PATH (SRC, solid, null, x8r8g8b8, fast_composite_solid_fill),
+    PIXMAN_STD_FAST_PATH (SRC, solid, null, a8b8g8r8, fast_composite_solid_fill),
+    PIXMAN_STD_FAST_PATH (SRC, solid, null, x8b8g8r8, fast_composite_solid_fill),
+    PIXMAN_STD_FAST_PATH (SRC, solid, null, a1, fast_composite_solid_fill),
+    PIXMAN_STD_FAST_PATH (SRC, solid, null, a8, fast_composite_solid_fill),
+    PIXMAN_STD_FAST_PATH (SRC, solid, null, r5g6b5, fast_composite_solid_fill),
+    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, fast_composite_src_x888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, fast_composite_src_x888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, b8g8r8a8, null, b8g8r8x8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, b8g8r8a8, null, b8g8r8a8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, b8g8r8x8, null, b8g8r8x8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, r8g8b8, null, r8g8b8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, b8g8r8, null, b8g8r8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, x1r5g5b5, null, x1r5g5b5, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, a1r5g5b5, null, x1r5g5b5, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, a8, null, a8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, fast_composite_src_x888_0565),
+    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, fast_composite_src_x888_0565),
+    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, fast_composite_src_x888_0565),
+    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, fast_composite_src_x888_0565),
+    PIXMAN_STD_FAST_PATH (IN, a8, null, a8, fast_composite_in_8_8),
+    PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, fast_composite_in_n_8_8),
+
+    SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, 8888_8888),
+    SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, 8888_8888),
+    SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, 8888_8888),
+    SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, 8888_8888),
+
+    SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, 8888_8888),
+    SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, 8888_8888),
+
+    SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, r5g6b5, 8888_565),
+    SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, r5g6b5, 8888_565),
+
+    SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, 565_565),
+
+    SIMPLE_NEAREST_FAST_PATH_COVER (SRC, x8r8g8b8, a8r8g8b8, x888_8888),
+    SIMPLE_NEAREST_FAST_PATH_COVER (SRC, x8b8g8r8, a8b8g8r8, x888_8888),
+    SIMPLE_NEAREST_FAST_PATH_PAD (SRC, x8r8g8b8, a8r8g8b8, x888_8888),
+    SIMPLE_NEAREST_FAST_PATH_PAD (SRC, x8b8g8r8, a8b8g8r8, x888_8888),
+    SIMPLE_NEAREST_FAST_PATH_NORMAL (SRC, x8r8g8b8, a8r8g8b8, x888_8888),
+    SIMPLE_NEAREST_FAST_PATH_NORMAL (SRC, x8b8g8r8, a8b8g8r8, x888_8888),
+
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, 8888_8888),
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, 8888_8888),
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, 8888_8888),
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, 8888_8888),
+
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, r5g6b5, 8888_565),
+
+    /* One table entry routed to fast_composite_scaled_nearest: source format
+     * s with SCALED_NEAREST_FLAGS, no mask (PIXMAN_null, flags 0) and
+     * destination format d with FAST_PATH_STD_DEST_FLAGS. */
+#define NEAREST_FAST_PATH(op,s,d)              \
+    {   PIXMAN_OP_ ## op,                      \
+       PIXMAN_ ## s, SCALED_NEAREST_FLAGS,     \
+       PIXMAN_null, 0,                         \
+       PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
+       fast_composite_scaled_nearest,          \
+    }
+
+    NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8),
+    NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8),
+    NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8),
+    NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8),
+
+    NEAREST_FAST_PATH (SRC, x8r8g8b8, a8r8g8b8),
+    NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8),
+    NEAREST_FAST_PATH (SRC, x8b8g8r8, a8b8g8r8),
+    NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8),
+
+    NEAREST_FAST_PATH (OVER, x8r8g8b8, x8r8g8b8),
+    NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8),
+    NEAREST_FAST_PATH (OVER, x8b8g8r8, x8b8g8r8),
+    NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8),
+
+    NEAREST_FAST_PATH (OVER, x8r8g8b8, a8r8g8b8),
+    NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8),
+    NEAREST_FAST_PATH (OVER, x8b8g8r8, a8b8g8r8),
+    NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8),
+
+    /* Source flags required by the rotation entries for the given angle:
+     * the matching rotate-transform flag, nearest filtering, samples
+     * covering the clip, plus the standard flags. */
+#define SIMPLE_ROTATE_FLAGS(angle)                                       \
+    (FAST_PATH_ROTATE_ ## angle ## _TRANSFORM  |                         \
+     FAST_PATH_NEAREST_FILTER                  |                         \
+     FAST_PATH_SAMPLES_COVER_CLIP              |                         \
+     FAST_PATH_STANDARD_FLAGS)
+
+    /* Expands to two table entries for one format pair: the 90 entry calls
+     * fast_composite_rotate_90_<suffix>, the 270 entry calls
+     * fast_composite_rotate_270_<suffix>; neither takes a mask. */
+#define SIMPLE_ROTATE_FAST_PATH(op,s,d,suffix)                           \
+    {   PIXMAN_OP_ ## op,                                                \
+       PIXMAN_ ## s, SIMPLE_ROTATE_FLAGS (90),                           \
+       PIXMAN_null, 0,                                                   \
+       PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,                           \
+       fast_composite_rotate_90_##suffix,                                \
+    },                                                                   \
+    {   PIXMAN_OP_ ## op,                                                \
+       PIXMAN_ ## s, SIMPLE_ROTATE_FLAGS (270),                          \
+       PIXMAN_null, 0,                                                   \
+       PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,                           \
+       fast_composite_rotate_270_##suffix,                               \
+    }
+
+    SIMPLE_ROTATE_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, 8888),
+    SIMPLE_ROTATE_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, 8888),
+    SIMPLE_ROTATE_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, 8888),
+    SIMPLE_ROTATE_FAST_PATH (SRC, r5g6b5, r5g6b5, 565),
+    SIMPLE_ROTATE_FAST_PATH (SRC, a8, a8, 8),
+
+    /* last entry; presumably the end-of-table sentinel - confirm */
+    {   PIXMAN_OP_NONE },
+};
+
+#ifdef WORDS_BIGENDIAN
+#define A1_FILL_MASK(n, offs) (((1 << (n)) - 1) << (32 - (offs) - (n)))
+#else
+#define A1_FILL_MASK(n, offs) (((1 << (n)) - 1) << (offs))
+#endif
+
+static force_inline void /* Set (v != 0) or clear (v == 0) `width` bits of one row, starting `offs` bits into the word at dst. */
+pixman_fill1_line (uint32_t *dst, int offs, int width, int v)
 {
-    if (src->type == BITS
-        && src->common.transform
-        && !mask
-        && op == PIXMAN_OP_SRC
-        && !src->common.alpha_map && !dest->common.alpha_map
-        && (src->common.filter == PIXMAN_FILTER_NEAREST)
-        && PIXMAN_FORMAT_BPP(dest->bits.format) == 32
-        && src->bits.format == dest->bits.format
-        && !src->common.read_func && !src->common.write_func
-        && !dest->common.read_func && !dest->common.write_func)
+    if (offs) /* the run does not start on a 32-bit word boundary */
     {
-        /* ensure that the transform matrix only has a scale */
-        if (src->common.transform->matrix[0][1] == 0 &&
-            src->common.transform->matrix[1][0] == 0 &&
-            src->common.transform->matrix[2][0] == 0 &&
-            src->common.transform->matrix[2][1] == 0 &&
-            src->common.transform->matrix[2][2] == pixman_fixed_1)
+       int leading_pixels = 32 - offs; /* bits available in the first word from offs onward */
+       if (leading_pixels >= width) /* the whole run fits inside the first word */
         {
-           _pixman_walk_composite_region (imp, op,
-                                          src, mask, dest,
-                                          src_x, src_y,
-                                          mask_x, mask_y,
-                                          dest_x, dest_y,
-                                          width, height,
-                                          fast_CompositeSrcScaleNearest);
+           if (v)
+               *dst |= A1_FILL_MASK (width, offs);
+           else
+               *dst &= ~A1_FILL_MASK (width, offs);
             return;
         }
+       else
+       {
+           if (v)
+               *dst++ |= A1_FILL_MASK (leading_pixels, offs);
+           else
+               *dst++ &= ~A1_FILL_MASK (leading_pixels, offs);
+           width -= leading_pixels; /* the remainder of the run now starts word-aligned */
+       }
     }
-
-    if (_pixman_run_fast_path (c_fast_paths, imp,
-                              op, src, mask, dest,
-                              src_x, src_y,
-                              mask_x, mask_y,
-                              dest_x, dest_y,
-                              width, height))
+    while (width >= 32) /* full words are overwritten outright, no masking needed */
     {
-       return;
+       if (v)
+           *dst++ = 0xFFFFFFFF;
+       else
+           *dst++ = 0;
+       width -= 32;
+    }
+    if (width > 0) /* trailing partial word: only `width` bits at offset 0 are changed */
+    {
+       if (v)
+           *dst |= A1_FILL_MASK (width, 0);
+       else
+           *dst &= ~A1_FILL_MASK (width, 0);
     }
+}
+
+static void /* Fill a width x height rectangle of 1-bit pixels: bits are set when bit 0 of xor is 1, cleared otherwise. */
+pixman_fill1 (uint32_t *bits,
+              int       stride,
+              int       x,
+              int       y,
+              int       width,
+              int       height,
+              uint32_t  xor)
+{
+    uint32_t *dst = bits + y * stride + (x >> 5); /* stride is counted in uint32_t words; x >> 5 selects the word holding pixel x */
+    int offs = x & 31; /* position of pixel x within that word */
 
-    _pixman_implementation_composite (imp->delegate, op,
-                                     src, mask, dest,
-                                     src_x, src_y,
-                                     mask_x, mask_y,
-                                     dest_x, dest_y,
-                                     width, height);
+    if (xor & 1) /* only the lowest bit of the fill value is examined */
+    {
+       while (height--)
+       {
+           pixman_fill1_line (dst, offs, width, 1);
+           dst += stride; /* same word, next row */
+       }
+    }
+    else
+    {
+       while (height--)
+       {
+           pixman_fill1_line (dst, offs, width, 0);
+           dst += stride; /* same word, next row */
+       }
+    }
 }
 
 static void
-pixman_fill8 (uint32_t  *bits,
-             int       stride,
-             int       x,
-             int       y,
-             int       width,
-             int       height,
-             uint32_t  xor)
+pixman_fill8 (uint32_t *bits,
+              int       stride,
+              int       x,
+              int       y,
+              int       width,
+              int       height,
+              uint32_t xor)
 {
     int byte_stride = stride * (int) sizeof (uint32_t);
     uint8_t *dst = (uint8_t *) bits;
@@ -1259,14 +1893,15 @@ pixman_fill8 (uint32_t  *bits,
 
 static void
 pixman_fill16 (uint32_t *bits,
-              int       stride,
-              int       x,
-              int       y,
-              int       width,
-              int       height,
-              uint32_t  xor)
+               int       stride,
+               int       x,
+               int       y,
+               int       width,
+               int       height,
+               uint32_t xor)
 {
-    int short_stride = (stride * (int) sizeof (uint32_t)) / (int) sizeof (uint16_t);
+    int short_stride =
+       (stride * (int)sizeof (uint32_t)) / (int)sizeof (uint16_t);
     uint16_t *dst = (uint16_t *)bits;
     uint16_t v = xor & 0xffff;
     int i;
@@ -1284,12 +1919,12 @@ pixman_fill16 (uint32_t *bits,
 
 static void
 pixman_fill32 (uint32_t *bits,
-              int       stride,
-              int       x,
-              int       y,
-              int       width,
-              int       height,
-              uint32_t  xor)
+               int       stride,
+               int       x,
+               int       y,
+               int       width,
+               int       height,
+               uint32_t  xor)
 {
     int i;
 
@@ -1306,46 +1941,48 @@ pixman_fill32 (uint32_t *bits,
 
 static pixman_bool_t
 fast_path_fill (pixman_implementation_t *imp,
-               uint32_t *bits,
-               int stride,
-               int bpp,
-               int x,
-               int y,
-               int width,
-               int height,
-               uint32_t xor)
+                uint32_t *               bits,
+                int                      stride,
+                int                      bpp,
+                int                      x,
+                int                      y,
+                int                      width,
+                int                      height,
+                uint32_t                xor) /* Dispatch a fill on bpp: 1, 8, 16 and 32 use the C routines in this file and return TRUE; any other bpp returns the result of imp->delegate's fill. */
 {
     switch (bpp)
     {
+    case 1: /* one bit per pixel, handled by pixman_fill1 */
+       pixman_fill1 (bits, stride, x, y, width, height, xor);
+       break;
+
     case 8:
        pixman_fill8 (bits, stride, x, y, width, height, xor);
        break;
-       
+
     case 16:
        pixman_fill16 (bits, stride, x, y, width, height, xor);
        break;
-       
+
     case 32:
        pixman_fill32 (bits, stride, x, y, width, height, xor);
        break;
-       
+
     default:
        return _pixman_implementation_fill (
            imp->delegate, bits, stride, bpp, x, y, width, height, xor);
        break;
     }
-    
+
     return TRUE;
 }
 
 pixman_implementation_t *
-_pixman_implementation_create_fast_path (void)
+_pixman_implementation_create_fast_path (pixman_implementation_t *fallback) /* Build the implementation that pairs the c_fast_paths table with fast_path_fill and return it. */
 {
-    pixman_implementation_t *general = _pixman_implementation_create_general ();
-    pixman_implementation_t *imp = _pixman_implementation_create (fallback, c_fast_paths); /* NOTE(review): fallback presumably becomes imp->delegate, which fast_path_fill relies on - confirm in _pixman_implementation_create */
 
-    imp->composite = fast_path_composite;
     imp->fill = fast_path_fill;
-    
+
     return imp;
 }