Add a version of fbcompose.c. Fixes and additions to make it compile
author Søren Sandmann <sandmann@redhat.com>
Tue, 8 May 2007 02:36:15 +0000 (22:36 -0400)
committer Søren Sandmann <sandmann@redhat.com>
Tue, 8 May 2007 02:36:15 +0000 (22:36 -0400)
pixman/Makefile.am
pixman/pixman-compose.c [new file with mode: 0644]
pixman/pixman-image.c
pixman/pixman-private.h
pixman/pixman-region.c
pixman/pixman.h

index ef378d5..28584f9 100644 (file)
@@ -6,8 +6,8 @@ libpixman_la_SOURCES =    \
         pixman.h         \
         pixman-region.c  \
         pixman-private.h \
-        pixman-image.c
-#       pixman-compose.c
+        pixman-image.c   \
+        pixman-compose.c
 
 libpixmanincludedir = $(includedir)/libcomp
 libpixmaninclude_HEADERS = pixman.h
diff --git a/pixman/pixman-compose.c b/pixman/pixman-compose.c
new file mode 100644 (file)
index 0000000..68f5865
--- /dev/null
@@ -0,0 +1,4675 @@
+/*
+ *
+ * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
+ *             2005 Lars Knoll & Zack Rusin, Trolltech
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Keith Packard makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+
+#ifdef HAVE_DIX_CONFIG_H
+#include <dix-config.h>
+#endif
+
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <assert.h>
+
+#include "pixman-private.h"
+
+/*
+ *    FIXME:
+ *             The stuff here is added just to get it to compile. Something sensible needs to
+ *              be done before this can be used.
+ *
+ *   We should go through this code and clean up some of the weird stuff that has
+ *   resulted from unmacro-ifying it.
+ *
+ */
+/* Spell INLINE as the C `inline` keyword. */
+#define INLINE inline
+
+/*
+ * Forward declaration only; the definition is not in this part of the file.
+ * SourcePictureClassify() below treats a zero return value as failure.
+ */
+int PictureTransformPoint3d (pixman_transform_t *trans, pixman_vector_t *vector);
+
+#ifdef FB_ACCESS_WRAPPER
+
+#include "wfbrename.h"
+#define FBPREFIX(x) wfb##x
+#define WRITE(ptr, val) ((*wfbWriteMemory)((ptr), (val), sizeof(*(ptr))))
+#define READ(ptr) ((*wfbReadMemory)((ptr), sizeof(*(ptr))))
+
+#define MEMCPY_WRAPPED(dst, src, size) do {                       \
+       size_t _i;                                                      \
+       uint8_t *_dst = (uint8_t*)(dst), *_src = (uint8_t*)(src);       \
+       for(_i = 0; _i < size; _i++) {                                  \
+           WRITE(_dst +_i, READ(_src + _i));                           \
+       }                                                               \
+    } while(0)
+
+#define MEMSET_WRAPPED(dst, val, size) do {                       \
+       size_t _i;                                                    \
+       uint8_t *_dst = (uint8_t*)(dst);                              \
+       for(_i = 0; _i < size; _i++) {                                \
+           WRITE(_dst +_i, (val));                                   \
+       }                                                             \
+    } while(0)
+
+#else
+
+#define FBPREFIX(x) fb##x
+#define WRITE(ptr, val) (*(ptr) = (val))
+#define READ(ptr) (*(ptr))
+#define MEMCPY_WRAPPED(dst, src, size) memcpy((dst), (src), (size))
+#define MEMSET_WRAPPED(dst, val, size) memset((dst), (val), (size))
+
+#endif
+
+/*
+ * Bracketing hooks around framebuffer access.
+ *
+ * In the wrapped build they call the setupWrap / finishWrap members of the
+ * screen private obtained from the drawable's pScreen; setupWrap is handed the
+ * addresses of the wfbReadMemory and wfbWriteMemory pointers used by READ and
+ * WRITE.  In the unwrapped build both hooks expand to nothing.
+ */
+#ifdef FB_ACCESS_WRAPPER
+
+#define fbPrepareAccess(pDraw)                          \
+    fbGetScreenPrivate((pDraw)->pScreen)->setupWrap(    \
+       &wfbReadMemory,                                  \
+       &wfbWriteMemory,                                 \
+       (pDraw))
+#define fbFinishAccess(pDraw)                                          \
+    fbGetScreenPrivate((pDraw)->pScreen)->finishWrap(pDraw)
+
+#else
+
+/* No-op variants: the argument is discarded without being evaluated. */
+#define fbPrepareAccess(pPix)
+#define fbFinishAccess(pDraw)
+
+#endif
+
+
+/*
+ * 24-bit load/store helpers.  `a` is a byte pointer to the first of three
+ * bytes.  Each access is split into one 8-bit and one 16-bit READ/WRITE; the
+ * low bit of the address decides whether the single byte comes first (odd
+ * address) or last (even address), so the 16-bit access always lands on an
+ * even address.  IMAGE_BYTE_ORDER selects which part holds the high bits.
+ *
+ * `a` is expanded several times, so it must not have side effects.
+ */
+#if IMAGE_BYTE_ORDER == MSBFirst
+#define Fetch24(a)  ((unsigned long) (a) & 1 ?                       \
+                    ((READ(a) << 16) | READ((uint16_t *) ((a)+1))) : \
+                    ((READ((uint16_t *) (a)) << 8) | READ((a)+2)))
+#define Store24(a,v) ((unsigned long) (a) & 1 ?                \
+                     (WRITE(a, (uint8_t) ((v) >> 16)),               \
+                      WRITE((uint16_t *) ((a)+1), (uint16_t) (v))) :  \
+                     (WRITE((uint16_t *) (a), (uint16_t) ((v) >> 8)), \
+                      WRITE((a)+2, (uint8_t) (v))))
+#else
+#define Fetch24(a)  ((unsigned long) (a) & 1 ?                      \
+                    (READ(a) | (READ((uint16_t *) ((a)+1)) << 8)) : \
+                    (READ((uint16_t *) (a)) | (READ((a)+2) << 16)))
+#define Store24(a,v) ((unsigned long) (a) & 1 ? \
+                     (WRITE(a, (uint8_t) (v)),                         \
+                      WRITE((uint16_t *) ((a)+1), (uint16_t) ((v) >> 8))) : \
+                     (WRITE((uint16_t *) (a), (uint16_t) (v)),         \
+                      WRITE((a)+2, (uint8_t) ((v) >> 16))))
+#endif
+/*
+ * Channel extraction from a packed pixel: alpha is bits 24 and up (no mask is
+ * applied), red is bits 16-23, green bits 8-15, blue bits 0-7.
+ */
+#define Alpha(x) ((x) >> 24)
+#define Red(x) (((x) >> 16) & 0xff)
+#define Green(x) (((x) >> 8) & 0xff)
+#define Blue(x) ((x) & 0xff)
+
+/*
+ * Weighted sum of the three 8-bit colour channels of s.  The weights
+ * (153, 301, 58) add up to 512, so after the final >> 2 the result is at most
+ * 255 * 128 and fits in 15 bits.
+ */
+#define CvtR8G8B8toY15(s)       (((((s) >> 16) & 0xff) * 153 + \
+                                  (((s) >>  8) & 0xff) * 301 +         \
+                                  (((s)      ) & 0xff) * 58) >> 2)
+/*
+ * Pack the top five bits of each 8-bit colour channel of s into a 15-bit
+ * value: red in bits 10-14, green in bits 5-9, blue in bits 0-4.
+ */
+#define miCvtR8G8B8to15(s) ((((s) >> 3) & 0x001f) |  \
+                           (((s) >> 6) & 0x03e0) |  \
+                           (((s) >> 9) & 0x7c00))
+/* Look up mif->ent[] with a 15-bit colour, or with a 24-bit colour reduced to 15 bits. */
+#define miIndexToEnt15(mif,rgb15) ((mif)->ent[rgb15])
+#define miIndexToEnt24(mif,rgb24) miIndexToEnt15(mif,miCvtR8G8B8to15(rgb24))
+
+/* Look up mif->ent[] with the 15-bit weighted sum of a 24-bit colour. */
+#define miIndexToEntY24(mif,rgb24) ((mif)->ent[CvtR8G8B8toY15(rgb24)])
+
+
+/*
+ * FbIntMult: a * b / 255 with rounding, for 8-bit a and b; t is a scratch
+ * lvalue that receives the intermediate product.
+ * FbIntDiv: a * 255 / b; b is not checked for zero.
+ */
+#define FbIntMult(a,b,t) ( (t) = (a) * (b) + 0x80, ( ( ( (t)>>8 ) + (t) )>>8 ) )
+#define FbIntDiv(a,b)   (((uint16_t) (a) * 255) / (b))
+
+/*
+ * Extract the byte of v that starts at bit i, widened to uint16_t.
+ * i is expanded without parentheses, so pass a simple expression.
+ */
+#define FbGet8(v,i)   ((uint16_t) (uint8_t) ((v) >> i))
+
+
+/*
+ * There are two ways of handling alpha -- either as a single unified value or
+ * a separate value for each component, hence each macro must have two
+ * versions.  The unified alpha version has a 'U' at the end of the name,
+ * the component version has a 'C'.  Similarly, functions which deal with
+ * this difference will have two versions using the same convention.
+ */
+
+/*
+ * Per-channel helpers.  Each works on the byte of x / y that starts at bit i
+ * and yields the result shifted back to bit i as a uint32_t.  t (and u, v for
+ * FbGen) are scratch lvalues.  Where a sum can exceed 255, the expression
+ * (t) | (0 - ((t) >> 8)) forces the low byte to 0xff before truncation, so
+ * the result saturates.
+ */
+
+/* x_i + y_i * a / 255, saturated; a is one alpha value for all channels. */
+#define FbOverU(x,y,i,a,t) ((t) = FbIntMult(FbGet8(y,i),(a),(t)) + FbGet8(x,i),        \
+                           (uint32_t) ((uint8_t) ((t) | (0 - ((t) >> 8)))) << (i))
+
+/* x_i + y_i * a_i / 255, saturated; a is a packed per-channel alpha. */
+#define FbOverC(x,y,i,a,t) ((t) = FbIntMult(FbGet8(y,i),FbGet8(a,i),(t)) + FbGet8(x,i),        \
+                           (uint32_t) ((uint8_t) ((t) | (0 - ((t) >> 8)))) << (i))
+
+/* x_i * a / 255 with a single alpha value. */
+#define FbInU(x,i,a,t) ((uint32_t) FbIntMult(FbGet8(x,i),(a),(t)) << (i))
+
+/* x_i * a_i / 255 with a packed per-channel alpha. */
+#define FbInC(x,i,a,t) ((uint32_t) FbIntMult(FbGet8(x,i),FbGet8(a,i),(t)) << (i))
+
+/* y_i * ay / 255 + x_i * ax / 255, saturated. */
+#define FbGen(x,y,i,ax,ay,t,u,v) ((t) = (FbIntMult(FbGet8(y,i),ay,(u)) + \
+                                        FbIntMult(FbGet8(x,i),ax,(v))), \
+                                 (uint32_t) ((uint8_t) ((t) |          \
+                                                        (0 - ((t) >> 8)))) << (i))
+
+/* x_i + y_i, saturated. */
+#define FbAdd(x,y,i,t) ((t) = FbGet8(x,i) + FbGet8(y,i),               \
+                        (uint32_t) ((uint8_t) ((t) | (0 - ((t) >> 8)))) << (i))
+
+
+/*
+  The methods below use some tricks to be able to do two color
+  components at the same time.
+*/
+
+/*
+  x_c = (x_c * a) / 255
+
+  Scales all four 8-bit components of x by the single factor a, handling two
+  components per multiplication (the 0xff00ff mask selects every other byte).
+  x is overwritten, so it must be a modifiable 32-bit lvalue.  x and a are
+  expanded more than once and without parentheses, and the macro declares a
+  local named t, so pass plain variables that are not called t.
+  Assumes a <= 255; a larger factor would carry between the packed lanes.
+*/
+#define FbByteMul(x, a) do {                                       \
+        uint32_t t = ((x & 0xff00ff) * a) + 0x800080;               \
+        t = (t + ((t >> 8) & 0xff00ff)) >> 8;                      \
+        t &= 0xff00ff;                                             \
+                                                                   \
+        x = (((x >> 8) & 0xff00ff) * a) + 0x800080;                \
+        x = (x + ((x >> 8) & 0xff00ff));                           \
+        x &= 0xff00ff00;                                           \
+        x += t;                                                            \
+    } while (0)
+
+/*
+  x_c = min((x_c * a) / 255 + y_c, 255)
+
+  Scales each component of x by the single factor a and then adds the
+  matching component of y.  The step
+      t |= 0x1000100 - ((t >> 8) & 0xff00ff)
+  turns any lane whose sum overflowed 8 bits into 0xff, so the addition
+  saturates.  x is overwritten; x, a and y are expanded more than once and
+  without parentheses, and a local named t is declared.
+*/
+#define FbByteMulAdd(x, a, y) do {                                 \
+        uint32_t t = ((x & 0xff00ff) * a) + 0x800080;               \
+        t = (t + ((t >> 8) & 0xff00ff)) >> 8;                      \
+        t &= 0xff00ff;                                             \
+        t += y & 0xff00ff;                                         \
+        t |= 0x1000100 - ((t >> 8) & 0xff00ff);                            \
+        t &= 0xff00ff;                                             \
+                                                                   \
+        x = (((x >> 8) & 0xff00ff) * a) + 0x800080;                 \
+        x = (x + ((x >> 8) & 0xff00ff)) >> 8;                       \
+        x &= 0xff00ff;                                              \
+        x += (y >> 8) & 0xff00ff;                                   \
+        x |= 0x1000100 - ((x >> 8) & 0xff00ff);                     \
+        x &= 0xff00ff;                                              \
+        x <<= 8;                                                    \
+        x += t;                                                     \
+    } while (0)
+
+/*
+  x_c = min((x_c * a + y_c * b) / 255, 255)
+
+  Blends x and y with the single factors a and b.  Unlike FbByteMul, each of
+  the four components is computed on its own (the sum of two products does
+  not fit a 16-bit lane) and the results are then packed two at a time so the
+  saturation step can be shared.  x is overwritten; all arguments are
+  expanded more than once and without parentheses, and locals named t and r
+  are declared.
+*/
+#define FbByteAddMul(x, a, y, b) do {                                   \
+        uint32_t t;                                                    \
+        uint32_t r = (x >> 24) * a + (y >> 24) * b + 0x80;             \
+        r += (r >> 8);                                                  \
+        r >>= 8;                                                        \
+                                                                       \
+        t = (x & 0xff00) * a + (y & 0xff00) * b;                        \
+        t += (t >> 8) + 0x8000;                                         \
+        t >>= 16;                                                       \
+                                                                       \
+        t |= r << 16;                                                   \
+        t |= 0x1000100 - ((t >> 8) & 0xff00ff);                         \
+        t &= 0xff00ff;                                                  \
+        t <<= 8;                                                        \
+                                                                       \
+        r = ((x >> 16) & 0xff) * a + ((y >> 16) & 0xff) * b + 0x80;     \
+        r += (r >> 8);                                                  \
+        r >>= 8;                                                        \
+                                                                       \
+        x = (x & 0xff) * a + (y & 0xff) * b + 0x80;                     \
+        x += (x >> 8);                                                  \
+        x >>= 8;                                                        \
+        x |= r << 16;                                                   \
+        x |= 0x1000100 - ((x >> 8) & 0xff00ff);                         \
+        x &= 0xff00ff;                                                  \
+        x |= t;                                                         \
+    } while (0)
+
+/*
+  x_c = (x_c * a + y_c * b) / 256
+
+  Same blend as FbByteAddMul but dividing by 256 with a plain shift: there is
+  no rounding term and no saturation step.  Two components share each
+  multiplication, so every per-component sum x_c * a + y_c * b has to stay
+  below 65536 for the lanes not to interfere.  x is overwritten; arguments
+  are expanded more than once and without parentheses, and a local named t is
+  declared.
+*/
+#define FbByteAddMul_256(x, a, y, b) do {                               \
+        uint32_t t = (x & 0xff00ff) * a + (y & 0xff00ff) * b;          \
+        t >>= 8;                                                        \
+        t &= 0xff00ff;                                                  \
+                                                                       \
+        x = ((x >> 8) & 0xff00ff) * a + ((y >> 8) & 0xff00ff) * b;      \
+        x &= 0xff00ff00;                                                \
+        x += t;                                                         \
+    } while (0)
+/*
+  x_c = (x_c * a_c) / 255
+
+  Component-wise multiply: each 8-bit component of x is scaled by the
+  component of a at the same position.  x is overwritten (it is also shifted
+  in place half way through); a is only read.  Arguments are expanded more
+  than once and without parentheses, and locals named t and r are declared.
+*/
+#define FbByteMulC(x, a) do {                            \
+        uint32_t t;                                       \
+        uint32_t r = (x & 0xff) * (a & 0xff);             \
+        r |= (x & 0xff0000) * ((a >> 16) & 0xff);        \
+       r += 0x800080;                                    \
+        r = (r + ((r >> 8) & 0xff00ff)) >> 8;            \
+        r &= 0xff00ff;                                   \
+                                                         \
+        x >>= 8;                                         \
+        t = (x & 0xff) * ((a >> 8) & 0xff);              \
+        t |= (x & 0xff0000) * (a >> 24);                 \
+        t += 0x800080;                                   \
+        t = t + ((t >> 8) & 0xff00ff);                   \
+        x = r | (t & 0xff00ff00);                        \
+                                                         \
+    } while (0)
+
+/*
+  x_c = min((x_c * a_c) / 255 + y_c, 255)
+
+  Component-wise multiply of x by a, followed by a saturating add of the
+  matching component of y (the 0x1000100 step clamps an overflowed lane to
+  0xff).  x is overwritten; a and y are only read.  Arguments are expanded
+  more than once and without parentheses, and locals named t and r are
+  declared.
+*/
+#define FbByteMulAddC(x, a, y) do {                                  \
+        uint32_t t;                                                   \
+        uint32_t r = (x & 0xff) * (a & 0xff);                         \
+        r |= (x & 0xff0000) * ((a >> 16) & 0xff);                    \
+       r += 0x800080;                                                \
+       r = (r + ((r >> 8) & 0xff00ff)) >> 8;                         \
+        r &= 0xff00ff;                                               \
+        r += y & 0xff00ff;                                           \
+        r |= 0x1000100 - ((r >> 8) & 0xff00ff);                              \
+        r &= 0xff00ff;                                               \
+                                                                     \
+        x >>= 8;                                                       \
+        t = (x & 0xff) * ((a >> 8) & 0xff);                            \
+        t |= (x & 0xff0000) * (a >> 24);                               \
+       t += 0x800080;                                                 \
+        t = (t + ((t >> 8) & 0xff00ff)) >> 8;                         \
+        t &= 0xff00ff;                                                 \
+        t += (y >> 8) & 0xff00ff;                                      \
+        t |= 0x1000100 - ((t >> 8) & 0xff00ff);                        \
+        t &= 0xff00ff;                                                 \
+        x = r | (t << 8);                                              \
+    } while (0)
+
+/*
+  x_c = min((x_c * a_c + y_c * b) / 255, 255)
+
+  Like FbByteAddMul, except that x is scaled component-wise by the packed
+  value a while y is scaled by the single factor b.  x is overwritten;
+  arguments are expanded more than once and without parentheses, and locals
+  named t and r are declared.
+
+  NOTE(review): the top-byte lane adds the 0x80 rounding term after taking
+  r >> 8, whereas the other lanes here (and every lane of FbByteAddMul) add
+  it before.  Confirm whether that difference is intended.
+*/
+#define FbByteAddMulC(x, a, y, b) do {                                  \
+        uint32_t t;                                                    \
+        uint32_t r = (x >> 24) * (a >> 24) + (y >> 24) * b;            \
+        r += (r >> 8) + 0x80;                                           \
+        r >>= 8;                                                        \
+                                                                       \
+        t = (x & 0xff00) * ((a >> 8) & 0xff) + (y & 0xff00) * b;        \
+        t += (t >> 8) + 0x8000;                                         \
+        t >>= 16;                                                       \
+                                                                       \
+        t |= r << 16;                                                   \
+        t |= 0x1000100 - ((t >> 8) & 0xff00ff);                         \
+        t &= 0xff00ff;                                                  \
+        t <<= 8;                                                        \
+                                                                       \
+        r = ((x >> 16) & 0xff) * ((a >> 16) & 0xff) + ((y >> 16) & 0xff) * b + 0x80; \
+        r += (r >> 8);                                                  \
+        r >>= 8;                                                        \
+                                                                       \
+        x = (x & 0xff) * (a & 0xff) + (y & 0xff) * b + 0x80;            \
+        x += (x >> 8);                                                  \
+        x >>= 8;                                                        \
+        x |= r << 16;                                                   \
+        x |= 0x1000100 - ((x >> 8) & 0xff00ff);                         \
+        x &= 0xff00ff;                                                  \
+        x |= t;                                                         \
+    } while (0)
+
+/*
+  x_c = min(x_c + y_c, 255)
+
+  Saturating add of the four 8-bit components of y into x, two components at
+  a time.  After each pairwise add, a lane that overflowed has bit 8 set;
+  subtracting those bits from 0x1000100 and OR-ing the result back in fills
+  such a lane with 0xff.  x is overwritten; x and y are expanded more than
+  once and without parentheses, and locals named t and r are declared.
+*/
+#define FbByteAdd(x, y) do {                                            \
+        uint32_t t;                                                    \
+        uint32_t r = (x & 0xff00ff) + (y & 0xff00ff);                  \
+        r |= 0x1000100 - ((r >> 8) & 0xff00ff);                         \
+        r &= 0xff00ff;                                                  \
+                                                                       \
+        t = ((x >> 8) & 0xff00ff) + ((y >> 8) & 0xff00ff);              \
+        t |= 0x1000100 - ((t >> 8) & 0xff00ff);                         \
+        r |= (t & 0xff00ff) << 8;                                       \
+        x = r;                                                          \
+    } while (0)
+
+/*
+ * x / 255 with rounding, computed with shifts only.  x is expanded twice, so
+ * it must not have side effects.
+ */
+#define div_255(x) (((x) + 0x80 + (((x) + 0x80) >> 8)) >> 8)
+
+/*
+ * Bundle of parameters for one compositing request.
+ *
+ * Nothing in this part of the file reads the structure, so the field notes
+ * below are inferred from the names only -- TODO confirm against the code
+ * that consumes it.
+ */
+typedef struct _FbComposeData {
+    uint8_t     op;        /* presumably the compositing operator */
+    image_t    *src;       /* source image */
+    image_t    *mask;      /* mask image; whether it may be NULL is not shown here */
+    image_t    *dest;      /* destination image */
+    int16_t     xSrc;      /* presumably the origin within src ... */
+    int16_t     ySrc;
+    int16_t     xMask;     /* ... within mask ... */
+    int16_t     yMask;
+    int16_t     xDest;     /* ... and within dest */
+    int16_t     yDest;
+    uint16_t    width;     /* presumably the size of the composited rectangle */
+    uint16_t    height;
+} FbComposeData;
+
+/*   End of stuff added to get it to compile
+ */ 
+
+
+
+/*
+ * Classify how a source picture varies over the rectangle
+ * (x, y, width, height) and record the answer in pict->class.
+ *
+ * A SOLID source is marked SOURCE_IMAGE_CLASS_HORIZONTAL.  For a LINEAR
+ * gradient the gradient position is evaluated at three corners of the
+ * rectangle -- (x, y), (x + width - 1, y) and (x, y + height - 1) -- after
+ * applying pict->common.transform when one is set:
+ *   - equal positions at (x, y) and (x, y + height - 1) give HORIZONTAL;
+ *   - otherwise equal positions at (x, y) and (x + width - 1, y) give VERTICAL;
+ *   - otherwise pict->class is left as it was.
+ * Any other source type leaves pict->class untouched.
+ *
+ * Returns pict->class, or SOURCE_IMAGE_CLASS_UNKNOWN (without touching
+ * pict->class) as soon as PictureTransformPoint3d() fails for a corner.
+ */
+static unsigned int
+SourcePictureClassify (source_image_t *pict,
+                      int             x,
+                      int             y,
+                      int             width,
+                      int             height)
+{
+    linear_gradient_t    *gradient;
+    pixman_vector_t       corner;
+    pixman_fixed_32_32_t  length_sq;
+    pixman_fixed_48_16_t  delta_x, delta_y, scale_x, scale_y, offset;
+    pixman_fixed_48_16_t  position[4];
+    int                   n;
+
+    if (pict->common.type == SOLID)
+    {
+        pict->class = SOURCE_IMAGE_CLASS_HORIZONTAL;
+        return pict->class;
+    }
+
+    if (pict->common.type != LINEAR)
+        return pict->class;
+
+    gradient = (linear_gradient_t *)pict;
+
+    /* Gradient direction and its squared length. */
+    delta_x = gradient->p2.x - gradient->p1.x;
+    delta_y = gradient->p2.y - gradient->p1.y;
+    length_sq = delta_x * delta_x + delta_y * delta_y;
+
+    /* A zero-length gradient (p1 == p2) gets zero scale factors. */
+    if (!length_sq)
+    {
+        scale_x = 0;
+        scale_y = 0;
+    }
+    else
+    {
+        scale_x = (delta_x << 32) / length_sq;
+        scale_y = (delta_y << 32) / length_sq;
+    }
+
+    offset = (-scale_x * gradient->p1.x
+              -scale_y * gradient->p1.y) >> 16;
+
+    /* n = 0: (x, y); n = 1: right edge of the first row; n = 2: first column of the last row. */
+    for (n = 0; n < 3; n++)
+    {
+        corner.vector[0] = pixman_int_to_fixed ((n % 2) * (width  - 1) + x);
+        corner.vector[1] = pixman_int_to_fixed ((n / 2) * (height - 1) + y);
+        corner.vector[2] = pixman_fixed_1;
+
+        if (pict->common.transform &&
+            !PictureTransformPoint3d (pict->common.transform, &corner))
+        {
+            return SOURCE_IMAGE_CLASS_UNKNOWN;
+        }
+
+        position[n] = ((scale_x * corner.vector[0] + scale_y * corner.vector[1]) >> 16) + offset;
+    }
+
+    if (position[2] == position[0])
+        pict->class = SOURCE_IMAGE_CLASS_HORIZONTAL;
+    else if (position[1] == position[0])
+        pict->class = SOURCE_IMAGE_CLASS_VERTICAL;
+
+    return pict->class;
+}
+
+/*
+ * Non-negative remainder of a divided by b, for b > 0: the result is always
+ * in [0, b - 1], including when a is negative.
+ *
+ * The negative branch computes b - 1 - ((-(a + 1)) % b).  The earlier form,
+ * b - (-a) % b, returned b instead of 0 whenever a was a negative multiple
+ * of b, and expanded `-a` without parentheses.  Negating a + 1 rather than a
+ * also avoids overflow for the most negative value of a.
+ *
+ * Both arguments are expanded more than once, so they must not have side
+ * effects.
+ */
+#define mod(a,b)       ((b) == 1 ? 0 : (a) >= 0 ? (a) % (b) : (b) - 1 - (-((a) + 1)) % (b))
+
+/*
+ * Not referenced in this part of the file; presumably the element count of
+ * temporary scanline buffers -- TODO confirm against its users.
+ */
+#define SCANLINE_BUFFER_LENGTH 2048
+
+/*
+ * Signature shared by the fbFetch_* functions below.
+ *   bits    - start of the source scanline
+ *   x       - index of the first pixel to fetch
+ *   width   - number of pixels to fetch
+ *   buffer  - receives `width` 32-bit pixels
+ *   indexed - palette; of the fetchers visible here only fbFetch_c8 reads it
+ */
+typedef FASTCALL void (*fetchProc)(const uint32_t *bits, int x, int width, uint32_t *buffer, pixman_indexed_t * indexed);
+
+/*
+ * All of the fetch functions
+ */
+
+/*
+ * Copy `width` 32-bit pixels, starting at pixel `x` of `bits`, into `buffer`
+ * unchanged.  `indexed` is not used.
+ */
+static FASTCALL void
+fbFetch_a8r8g8b8 (const uint32_t *bits, int x, int width, uint32_t *buffer, pixman_indexed_t * indexed)
+{
+    const uint32_t *first = bits + x;
+
+    MEMCPY_WRAPPED(buffer, first, width*sizeof(uint32_t));
+}
+
+/*
+ * Fetch `width` x8r8g8b8 pixels starting at pixel `x`: the colour bits are
+ * kept and the top byte is forced to 0xff.  `indexed` is not used.
+ */
+static FASTCALL void
+fbFetch_x8r8g8b8 (const uint32_t *bits, int x, int width, uint32_t *buffer, pixman_indexed_t * indexed)
+{
+    const uint32_t *src = bits + x;
+    int i;
+
+    for (i = 0; i < width; ++i)
+        WRITE(buffer + i, READ(src + i) | 0xff000000);
+}
+
+/*
+ * Fetch `width` a8b8g8r8 pixels starting at pixel `x`.  Alpha (bits 24-31)
+ * and green (bits 8-15) stay in place; the bytes at bits 16-23 and 0-7 are
+ * swapped.  `indexed` is not used.
+ */
+static FASTCALL void
+fbFetch_a8b8g8r8 (const uint32_t *bits, int x, int width, uint32_t *buffer, pixman_indexed_t * indexed)
+{
+    int i;
+
+    for (i = 0; i < width; ++i) {
+        const uint32_t *src = bits + x + i;
+        uint32_t alpha_green = READ(src) & 0xff00ff00;
+        uint32_t blue = (READ(src) >> 16) & 0xff;
+        uint32_t red = (READ(src) & 0xff) << 16;
+
+        WRITE(buffer + i, (alpha_green | blue | red));
+    }
+}
+
+/*
+ * Fetch `width` x8b8g8r8 pixels starting at pixel `x`.  Green stays in
+ * place, the bytes at bits 16-23 and 0-7 are swapped, and the top byte is
+ * forced to 0xff.  `indexed` is not used.
+ */
+static FASTCALL void
+fbFetch_x8b8g8r8 (const uint32_t *bits, int x, int width, uint32_t *buffer, pixman_indexed_t * indexed)
+{
+    int i;
+
+    for (i = 0; i < width; ++i) {
+        const uint32_t *src = bits + x + i;
+        uint32_t green = READ(src) & 0x0000ff00;
+        uint32_t blue = (READ(src) >> 16) & 0xff;
+        uint32_t red = (READ(src) & 0xff) << 16;
+
+        WRITE(buffer + i, 0xff000000 | (green | blue | red));
+    }
+}
+
+/*
+ * Fetch `width` 24-bit r8g8b8 pixels starting at pixel `x` (three bytes per
+ * pixel).  Each pixel is read with Fetch24 and the top byte of the result is
+ * set to 0xff.  `indexed` is not used.
+ */
+static FASTCALL void
+fbFetch_r8g8b8 (const uint32_t *bits, int x, int width, uint32_t *buffer, pixman_indexed_t * indexed)
+{
+    const uint8_t *src = (const uint8_t *)bits + 3*x;
+    int i;
+
+    for (i = 0; i < width; ++i, src += 3) {
+        uint32_t rgb = Fetch24(src) | 0xff000000;
+
+        WRITE(buffer + i, rgb);
+    }
+}
+
+/*
+ * Fetch `width` 24-bit b8g8r8 pixels starting at pixel `x` (three bytes per
+ * pixel) into `buffer`.  The three bytes are read one at a time and placed
+ * according to IMAGE_BYTE_ORDER; the top byte of each result is 0xff.
+ * `indexed` is not used.
+ */
+static FASTCALL void
+fbFetch_b8g8r8 (const uint32_t *bits, int x, int width, uint32_t *buffer, pixman_indexed_t * indexed)
+{
+    const uint8_t *pixel = (const uint8_t *)bits + 3*x;
+    const uint8_t *end = pixel + 3*width;
+    while (pixel < end) {
+        uint32_t b = 0xff000000;
+#if IMAGE_BYTE_ORDER == MSBFirst
+        b |= (READ(pixel++));
+        b |= (READ(pixel++) << 8);
+        b |= (READ(pixel++) << 16);
+#else
+        b |= (READ(pixel++) << 16);
+        b |= (READ(pixel++) << 8);
+        b |= (READ(pixel++));
+#endif
+        /* Store the assembled pixel; without this the loop produced no output. */
+        WRITE(buffer++, b);
+    }
+}
+
+/*
+ * Fetch `width` 16-bit r5g6b5 pixels starting at pixel `x`.  Each channel is
+ * moved to the top of its 8-bit slot and its high bits are copied into the
+ * vacated low bits; the top byte of the result is 0xff.  `indexed` is not
+ * used.
+ */
+static FASTCALL void
+fbFetch_r5g6b5 (const uint32_t *bits, int x, int width, uint32_t *buffer, pixman_indexed_t * indexed)
+{
+    const uint16_t *src = (const uint16_t *)bits + x;
+    int i;
+
+    for (i = 0; i < width; ++i) {
+        uint32_t p = READ(src + i);
+        uint32_t rgb;
+
+        rgb  = (p << 8) & 0xf80000;
+        rgb |= (p << 5) & 0xfc00;
+        rgb |= (p << 3) & 0xf8;
+        /* replicate the top 3 bits of red and blue, then the top 2 of green */
+        rgb |= (rgb >> 5) & 0x70007;
+        rgb |= (rgb >> 6) & 0x300;
+        WRITE(buffer + i, 0xff000000 | rgb);
+    }
+}
+
+/*
+ * Fetch `width` 16-bit b5g6r5 pixels starting at pixel `x`.  Blue is taken
+ * from bits 11-15, green from bits 5-10 and red from bits 0-4; each is
+ * widened to 8 bits by repeating its high bits.  The top byte of the result
+ * is 0xff.  `indexed` is not used.
+ */
+static FASTCALL void
+fbFetch_b5g6r5 (const uint32_t *bits, int x, int width, uint32_t *buffer, pixman_indexed_t * indexed)
+{
+    const uint16_t *src = (const uint16_t *)bits + x;
+    int i;
+
+    for (i = 0; i < width; ++i) {
+        uint32_t p = READ(src + i);
+        uint32_t red   = ((p & 0x001c) | ((p & 0x001f) << 5)) << 14;
+        uint32_t green = ((p & 0x07e0) | ((p & 0x0600) >> 6)) << 5;
+        uint32_t blue  = ((p & 0xf800) | ((p & 0xe000) >> 5)) >> 8;
+
+        WRITE(buffer + i, (0xff000000 | red | green | blue));
+    }
+}
+
+/*
+ * Fetch `width` 16-bit a1r5g5b5 pixels starting at pixel `x`.  The single
+ * alpha bit (bit 15) becomes 0x00 or 0xff; each 5-bit colour channel is
+ * widened to 8 bits by repeating its top three bits.  `indexed` is not used.
+ */
+static FASTCALL void
+fbFetch_a1r5g5b5 (const uint32_t *bits, int x, int width, uint32_t *buffer, pixman_indexed_t * indexed)
+{
+    const uint16_t *src = (const uint16_t *)bits + x;
+    int i;
+
+    for (i = 0; i < width; ++i) {
+        uint32_t p = READ(src + i);
+        uint32_t alpha, red, green, blue;
+
+        /* 0 - 1 wraps to all ones, so a set alpha bit yields 0xff */
+        alpha = (uint32_t) ((uint8_t) (0 - ((p & 0x8000) >> 15))) << 24;
+        red   = ((p & 0x7c00) | ((p & 0x7000) >> 5)) << 9;
+        green = ((p & 0x03e0) | ((p & 0x0380) >> 5)) << 6;
+        blue  = ((p & 0x001c) | ((p & 0x001f) << 5)) >> 2;
+        WRITE(buffer + i, (alpha | red | green | blue));
+    }
+}
+
+/*
+ * Fetch `width` 16-bit x1r5g5b5 pixels starting at pixel `x`.  Bit 15 is
+ * ignored; each 5-bit colour channel is widened to 8 bits by repeating its
+ * top three bits, and the top byte of the result is 0xff.  `indexed` is not
+ * used.
+ */
+static FASTCALL void
+fbFetch_x1r5g5b5 (const uint32_t *bits, int x, int width, uint32_t *buffer, pixman_indexed_t * indexed)
+{
+    const uint16_t *src = (const uint16_t *)bits + x;
+    int i;
+
+    for (i = 0; i < width; ++i) {
+        uint32_t p = READ(src + i);
+        uint32_t red   = ((p & 0x7c00) | ((p & 0x7000) >> 5)) << 9;
+        uint32_t green = ((p & 0x03e0) | ((p & 0x0380) >> 5)) << 6;
+        uint32_t blue  = ((p & 0x001c) | ((p & 0x001f) << 5)) >> 2;
+
+        WRITE(buffer + i, (0xff000000 | red | green | blue));
+    }
+}
+
+/*
+ * Fetch `width` 16-bit a1b5g5r5 pixels starting at pixel `x`.  The alpha bit
+ * (bit 15) becomes 0x00 or 0xff; blue comes from bits 10-14, green from bits
+ * 5-9 and red from bits 0-4, each widened to 8 bits by repeating its top
+ * three bits.  `indexed` is not used.
+ */
+static FASTCALL void
+fbFetch_a1b5g5r5 (const uint32_t *bits, int x, int width, uint32_t *buffer, pixman_indexed_t * indexed)
+{
+    const uint16_t *src = (const uint16_t *)bits + x;
+    int i;
+
+    for (i = 0; i < width; ++i) {
+        uint32_t p = READ(src + i);
+        uint32_t alpha, red, green, blue;
+
+        alpha = (uint32_t) ((uint8_t) (0 - ((p & 0x8000) >> 15))) << 24;
+        red   = ((p & 0x001c) | ((p & 0x001f) << 5)) << 14;
+        green = ((p & 0x03e0) | ((p & 0x0380) >> 5)) << 6;
+        blue  = ((p & 0x7c00) | ((p & 0x7000) >> 5)) >> 7;
+        WRITE(buffer + i, (alpha | red | green | blue));
+    }
+}
+
+/*
+ * Fetch `width` 16-bit x1b5g5r5 pixels starting at pixel `x`.  Bit 15 is
+ * ignored; blue comes from bits 10-14, green from bits 5-9 and red from bits
+ * 0-4, each widened to 8 bits.  The top byte of the result is 0xff.
+ * `indexed` is not used.
+ */
+static FASTCALL void
+fbFetch_x1b5g5r5 (const uint32_t *bits, int x, int width, uint32_t *buffer, pixman_indexed_t * indexed)
+{
+    const uint16_t *src = (const uint16_t *)bits + x;
+    int i;
+
+    for (i = 0; i < width; ++i) {
+        uint32_t p = READ(src + i);
+        uint32_t red   = ((p & 0x001c) | ((p & 0x001f) << 5)) << 14;
+        uint32_t green = ((p & 0x03e0) | ((p & 0x0380) >> 5)) << 6;
+        uint32_t blue  = ((p & 0x7c00) | ((p & 0x7000) >> 5)) >> 7;
+
+        WRITE(buffer + i, (0xff000000 | red | green | blue));
+    }
+}
+
+/*
+ * Fetch `width` 16-bit a4r4g4b4 pixels starting at pixel `x`.  Every 4-bit
+ * channel is widened to 8 bits by duplicating the nibble.  `indexed` is not
+ * used.
+ */
+static FASTCALL void
+fbFetch_a4r4g4b4 (const uint32_t *bits, int x, int width, uint32_t *buffer, pixman_indexed_t * indexed)
+{
+    const uint16_t *src = (const uint16_t *)bits + x;
+    int i;
+
+    for (i = 0; i < width; ++i) {
+        uint32_t p = READ(src + i);
+        uint32_t alpha = ((p & 0xf000) | ((p & 0xf000) >> 4)) << 16;
+        uint32_t red   = ((p & 0x0f00) | ((p & 0x0f00) >> 4)) << 12;
+        uint32_t green = ((p & 0x00f0) | ((p & 0x00f0) >> 4)) << 8;
+        uint32_t blue  = ((p & 0x000f) | ((p & 0x000f) << 4));
+
+        WRITE(buffer + i, (alpha | red | green | blue));
+    }
+}
+
+/*
+ * Fetch `width` 16-bit x4r4g4b4 pixels starting at pixel `x`.  The top
+ * nibble is ignored; each 4-bit colour channel is widened to 8 bits by
+ * duplicating the nibble, and the top byte of the result is 0xff.
+ * `indexed` is not used.
+ */
+static FASTCALL void
+fbFetch_x4r4g4b4 (const uint32_t *bits, int x, int width, uint32_t *buffer, pixman_indexed_t * indexed)
+{
+    const uint16_t *src = (const uint16_t *)bits + x;
+    int i;
+
+    for (i = 0; i < width; ++i) {
+        uint32_t p = READ(src + i);
+        uint32_t red   = ((p & 0x0f00) | ((p & 0x0f00) >> 4)) << 12;
+        uint32_t green = ((p & 0x00f0) | ((p & 0x00f0) >> 4)) << 8;
+        uint32_t blue  = ((p & 0x000f) | ((p & 0x000f) << 4));
+
+        WRITE(buffer + i, (0xff000000 | red | green | blue));
+    }
+}
+
+/*
+ * Fetch `width` 16-bit a4b4g4r4 pixels starting at pixel `x`.  Alpha comes
+ * from bits 12-15, blue from bits 8-11, green from bits 4-7 and red from
+ * bits 0-3; each nibble is duplicated to fill 8 bits.  `indexed` is not
+ * used.
+ */
+static FASTCALL void
+fbFetch_a4b4g4r4 (const uint32_t *bits, int x, int width, uint32_t *buffer, pixman_indexed_t * indexed)
+{
+    const uint16_t *src = (const uint16_t *)bits + x;
+    int i;
+
+    for (i = 0; i < width; ++i) {
+        uint32_t p = READ(src + i);
+        uint32_t alpha = ((p & 0xf000) | ((p & 0xf000) >> 4)) << 16;
+        uint32_t red   = ((p & 0x000f) | ((p & 0x000f) << 4)) << 16;
+        uint32_t green = ((p & 0x00f0) | ((p & 0x00f0) >> 4)) << 8;
+        uint32_t blue  = ((p & 0x0f00) | ((p & 0x0f00) >> 4)) >> 4;
+
+        WRITE(buffer + i, (alpha | red | green | blue));
+    }
+}
+
+/*
+ * Fetch `width` 16-bit x4b4g4r4 pixels starting at pixel `x`.  The top
+ * nibble is ignored; blue comes from bits 8-11, green from bits 4-7 and red
+ * from bits 0-3, each duplicated to fill 8 bits.  The top byte of the result
+ * is 0xff.  `indexed` is not used.
+ */
+static FASTCALL void
+fbFetch_x4b4g4r4 (const uint32_t *bits, int x, int width, uint32_t *buffer, pixman_indexed_t * indexed)
+{
+    const uint16_t *src = (const uint16_t *)bits + x;
+    int i;
+
+    for (i = 0; i < width; ++i) {
+        uint32_t p = READ(src + i);
+        uint32_t red   = ((p & 0x000f) | ((p & 0x000f) << 4)) << 16;
+        uint32_t green = ((p & 0x00f0) | ((p & 0x00f0) >> 4)) << 8;
+        uint32_t blue  = ((p & 0x0f00) | ((p & 0x0f00) >> 4)) >> 4;
+
+        WRITE(buffer + i, (0xff000000 | red | green | blue));
+    }
+}
+
+/*
+ * Fetch `width` 8-bit alpha-only pixels starting at pixel `x`.  Each byte is
+ * placed in bits 24-31 of the output; the colour bits are zero.  `indexed`
+ * is not used.
+ */
+static FASTCALL void
+fbFetch_a8 (const uint32_t *bits, int x, int width, uint32_t *buffer, pixman_indexed_t * indexed)
+{
+    const uint8_t *pixel = (const uint8_t *)bits + x;
+    const uint8_t *end = pixel + width;
+    while (pixel < end) {
+        /*
+         * Widen before shifting: a uint8_t operand is promoted to int, and
+         * shifting a value >= 0x80 left by 24 would overflow a signed int.
+         */
+        uint32_t alpha = READ(pixel++);
+
+        WRITE(buffer++, alpha << 24);
+    }
+}
+
+/*
+ * Fetch `width` 8-bit r3g3b2 pixels starting at pixel `x`.  Red (bits 5-7)
+ * and green (bits 2-4) are widened from 3 to 8 bits by repeating the field;
+ * blue (bits 0-1) is repeated four times.  The top byte of the result is
+ * 0xff.  `indexed` is not used.
+ */
+static FASTCALL void
+fbFetch_r3g3b2 (const uint32_t *bits, int x, int width, uint32_t *buffer, pixman_indexed_t * indexed)
+{
+    const uint8_t *src = (const uint8_t *)bits + x;
+    int i;
+
+    for (i = 0; i < width; ++i) {
+        uint32_t p = READ(src + i);
+        uint32_t red, green, blue;
+
+        red   = ((p & 0xe0) | ((p & 0xe0) >> 3) | ((p & 0xc0) >> 6)) << 16;
+        green = ((p & 0x1c) | ((p & 0x18) >> 3) | ((p & 0x1c) << 3)) << 8;
+        blue  = (p & 0x03) | ((p & 0x03) << 2) | ((p & 0x03) << 4) | ((p & 0x03) << 6);
+        WRITE(buffer + i, (0xff000000 | red | green | blue));
+    }
+}
+
+/*
+ * Fetch `width` 8-bit b2g3r3 pixels starting at pixel `x`.  Blue is bits
+ * 6-7, green bits 3-5 and red bits 0-2 of each source byte.  Each field is
+ * first moved to the top of an 8-bit value and then repeated downwards to
+ * fill the remaining bits, so a field of all ones maps to 0xff and all zeros
+ * to 0x00.  The top byte of the result is 0xff.  `indexed` is not used.
+ *
+ * The previous green and red expansions used the wrong shifts: green was not
+ * moved to the top of its byte, and red's `(p & 0x06) << 6` spilled into bit
+ * 8 while leaving bit 6 clear.
+ */
+static FASTCALL void
+fbFetch_b2g3r3 (const uint32_t *bits, int x, int width, uint32_t *buffer, pixman_indexed_t * indexed)
+{
+    const uint8_t *pixel = (const uint8_t *)bits + x;
+    const uint8_t *end = pixel + width;
+    while (pixel < end) {
+        uint32_t  p = READ(pixel++);
+        uint32_t  r,g,b;
+
+        /* bb -> bbbbbbbb */
+        b = p & 0xc0;
+        b |= b >> 2;
+        b |= b >> 4;
+
+        /* ggg -> ggggggGG (GG = top two bits of the field) */
+        g = (p & 0x38) << 2;
+        g |= (g >> 3) | (g >> 6);
+        g <<= 8;
+
+        /* rrr -> rrrrrrRR */
+        r = (p & 0x07) << 5;
+        r |= (r >> 3) | (r >> 6);
+        r <<= 16;
+
+        WRITE(buffer++, (0xff000000 | r | g | b));
+    }
+}
+
+/*
+ * Fetch `width` 8-bit a2r2g2b2 pixels starting at pixel `x`.  Multiplying a
+ * 2-bit field by 0x55 repeats it four times, widening it to 8 bits; the
+ * shifts then move each widened field to its output byte.  `indexed` is not
+ * used.
+ */
+static FASTCALL void
+fbFetch_a2r2g2b2 (const uint32_t *bits, int x, int width, uint32_t *buffer, pixman_indexed_t * indexed)
+{
+    const uint8_t *src = (const uint8_t *)bits + x;
+    int i;
+
+    for (i = 0; i < width; ++i) {
+        uint32_t p = READ(src + i);
+        uint32_t alpha = ((p & 0xc0) * 0x55) << 18;
+        uint32_t red   = ((p & 0x30) * 0x55) << 12;
+        uint32_t green = ((p & 0x0c) * 0x55) << 6;
+        uint32_t blue  = ((p & 0x03) * 0x55);
+
+        WRITE(buffer + i, alpha|red|green|blue);
+    }
+}
+
+/*
+ * Fetch `width` 8-bit a2b2g2r2 pixels starting at pixel `x`.  Alpha is bits
+ * 6-7, blue bits 4-5, green bits 2-3 and red bits 0-1.  Multiplying a 2-bit
+ * field by 0x55 repeats it four times, widening it to 8 bits; the shifts
+ * then move each widened field to its output byte.  `indexed` is not used.
+ */
+static FASTCALL void
+fbFetch_a2b2g2r2 (const uint32_t *bits, int x, int width, uint32_t *buffer, pixman_indexed_t * indexed)
+{
+    const uint8_t *pixel = (const uint8_t *)bits + x;
+    const uint8_t *end = pixel + width;
+    while (pixel < end) {
+        uint32_t  p = READ(pixel++);
+        uint32_t   a,r,g,b;
+
+        a = ((p & 0xc0) * 0x55) << 18;
+        /*
+         * The widened blue field sits in bits 4-11, so it has to come down
+         * by 4 to land in bits 0-7.  The earlier shift of 6 dropped its two
+         * low bits and capped blue at 0x3f.
+         */
+        b = ((p & 0x30) * 0x55) >> 4;
+        g = ((p & 0x0c) * 0x55) << 6;
+        r = ((p & 0x03) * 0x55) << 16;
+        WRITE(buffer++, a|r|g|b);
+    }
+}
+
+/*
+ * Fetch `width` 8-bit palette-indexed pixels starting at pixel `x`: each
+ * source byte is used as an index into indexed->rgba[] and the entry found
+ * there is stored in `buffer`.
+ */
+static FASTCALL void
+fbFetch_c8 (const uint32_t *bits, int x, int width, uint32_t *buffer, pixman_indexed_t * indexed)
+{
+    const uint8_t *src = (const uint8_t *)bits + x;
+    int i;
+
+    for (i = 0; i < width; ++i) {
+        uint32_t entry = READ(src + i);
+
+        WRITE(buffer + i, indexed->rgba[entry]);
+    }
+}
+
+/*
+ * Fetch `width` 8-bit x4a4 pixels starting at pixel `x`.  Only the low
+ * nibble of each byte is used; it is duplicated to form an 8-bit alpha value
+ * placed in bits 24-31 of the output.  `indexed` is not used.
+ */
+static FASTCALL void
+fbFetch_x4a4 (const uint32_t *bits, int x, int width, uint32_t *buffer, pixman_indexed_t * indexed)
+{
+    const uint8_t *pixel = (const uint8_t *)bits + x;
+    const uint8_t *end = pixel + width;
+    while (pixel < end) {
+        /*
+         * Held in a uint32_t so the shift by 24 is done on an unsigned
+         * value; a uint8_t would be promoted to int and overflow for
+         * nibbles >= 8.
+         */
+        uint32_t p = READ(pixel++) & 0xf;
+        WRITE(buffer++, (p | (p << 4)) << 24);
+    }
+}
+
+/*
+ * Fetch8 is kept exactly as it was for any remaining users.
+ *
+ * Fetch4(l,o) reads 4bpp pixel number 'o' of line 'l'.  Two pixels share
+ * a byte, so the byte index is o >> 1 and the low bit of 'o' selects the
+ * nibble; which nibble comes first depends on IMAGE_BYTE_ORDER.
+ */
+#define Fetch8(l,o)    (((uint8_t *) (l))[(o) >> 2])
+#if IMAGE_BYTE_ORDER == MSBFirst
+#define Fetch4(l,o)    ((o) & 1 ? ((uint8_t *) (l))[(o) >> 1] & 0xf : ((uint8_t *) (l))[(o) >> 1] >> 4)
+#else
+#define Fetch4(l,o)    ((o) & 1 ? ((uint8_t *) (l))[(o) >> 1] >> 4 : ((uint8_t *) (l))[(o) >> 1] & 0xf)
+#endif
+
+/*
+ * Expand a run of 4bpp a4 pixels (read with Fetch4 at pixel index x + n)
+ * into 32-bit words carrying only alpha; the nibble is replicated into
+ * both halves of the top byte.  'indexed' is not used.
+ */
+static FASTCALL void
+fbFetch_a4 (const uint32_t *bits, int x, int width, uint32_t *buffer, pixman_indexed_t * indexed)
+{
+    int n = 0;
+
+    while (n < width) {
+        const uint32_t nibble = Fetch4(bits, n + x);
+
+        WRITE(buffer + n, (nibble | (nibble << 4)) << 24);
+        n++;
+    }
+}
+
+/*
+ * Expand a run of 4bpp r1g2b1 pixels (read with Fetch4) into 32-bit
+ * words.  One-bit channels become 0x00/0xff, the 2-bit green channel is
+ * widened with 0x55, and alpha is forced to 0xff.  'indexed' is not used.
+ */
+static FASTCALL void
+fbFetch_r1g2b1 (const uint32_t *bits, int x, int width, uint32_t *buffer, pixman_indexed_t * indexed)
+{
+    int n = 0;
+
+    while (n < width) {
+        const uint32_t nibble = Fetch4(bits, n + x);
+        const uint32_t red   = ((nibble & 0x8) * 0xff) << 13;
+        const uint32_t green = ((nibble & 0x6) * 0x55) << 7;
+        const uint32_t blue  = ((nibble & 0x1) * 0xff);
+
+        WRITE(buffer + n, 0xff000000 | red | green | blue);
+        n++;
+    }
+}
+
+/*
+ * Expand a run of 4bpp b1g2r1 pixels (read with Fetch4) into 32-bit
+ * words.  One-bit channels become 0x00/0xff, the 2-bit green channel is
+ * widened with 0x55, and alpha is forced to 0xff.  'indexed' is not used.
+ */
+static FASTCALL void
+fbFetch_b1g2r1 (const uint32_t *bits, int x, int width, uint32_t *buffer, pixman_indexed_t * indexed)
+{
+    int n = 0;
+
+    while (n < width) {
+        const uint32_t nibble = Fetch4(bits, n + x);
+        const uint32_t blue  = ((nibble & 0x8) * 0xff) >> 3;
+        const uint32_t green = ((nibble & 0x6) * 0x55) << 7;
+        const uint32_t red   = ((nibble & 0x1) * 0xff) << 16;
+
+        WRITE(buffer + n, 0xff000000 | red | green | blue);
+        n++;
+    }
+}
+
+/*
+ * Expand a run of 4bpp a1r1g1b1 pixels (read with Fetch4) into 32-bit
+ * words; every one-bit channel becomes 0x00 or 0xff in its byte.
+ * 'indexed' is not used.
+ */
+static FASTCALL void
+fbFetch_a1r1g1b1 (const uint32_t *bits, int x, int width, uint32_t *buffer, pixman_indexed_t * indexed)
+{
+    int n = 0;
+
+    while (n < width) {
+        const uint32_t nibble = Fetch4(bits, n + x);
+        const uint32_t alpha = ((nibble & 0x8) * 0xff) << 21;
+        const uint32_t red   = ((nibble & 0x4) * 0xff) << 14;
+        const uint32_t green = ((nibble & 0x2) * 0xff) << 7;
+        const uint32_t blue  = ((nibble & 0x1) * 0xff);
+
+        WRITE(buffer + n, alpha | red | green | blue);
+        n++;
+    }
+}
+
+/*
+ * Expand a run of 4bpp a1b1g1r1 pixels (read with Fetch4) into 32-bit
+ * words packed a:r:g:b; every one-bit channel becomes 0x00 or 0xff in
+ * its byte.  'indexed' is not used.
+ */
+static FASTCALL void
+fbFetch_a1b1g1r1 (const uint32_t *bits, int x, int width, uint32_t *buffer, pixman_indexed_t * indexed)
+{
+    int i;
+    for (i = 0; i < width; ++i) {
+        uint32_t  p = Fetch4(bits, i + x);
+        uint32_t  a,r,g,b;
+
+        a = ((p & 0x8) * 0xff) << 21;
+        /* Blue is bit 2: (4 * 0xff) must come down by 2 to give 0xff in
+         * the low byte; a shift of 3 would leave only 0x7f. */
+        b = ((p & 0x4) * 0xff) >> 2;
+        g = ((p & 0x2) * 0xff) << 7;
+        r = ((p & 0x1) * 0xff) << 16;
+        WRITE(buffer++, a|r|g|b);
+    }
+}
+
+/*
+ * Translate a run of 4bpp pixels (read with Fetch4) through the
+ * indexed->rgba table and write the table entries to 'buffer'.
+ */
+static FASTCALL void
+fbFetch_c4 (const uint32_t *bits, int x, int width, uint32_t *buffer, pixman_indexed_t * indexed)
+{
+    int n = 0;
+
+    while (n < width) {
+        const uint32_t entry = Fetch4(bits, n + x);
+
+        WRITE(buffer + n, indexed->rgba[entry]);
+        n++;
+    }
+}
+
+
+/*
+ * Expand a run of 1bpp alpha pixels into 32-bit words.  Bit (x + n) is
+ * picked out of the 32-bit word that contains it (bit numbering follows
+ * BITMAP_BIT_ORDER); a set bit yields alpha 0xff, a clear bit yields 0.
+ * 'indexed' is not used.
+ */
+static FASTCALL void
+fbFetch_a1 (const uint32_t *bits, int x, int width, uint32_t *buffer, pixman_indexed_t * indexed)
+{
+    int n;
+
+    for (n = 0; n < width; n++) {
+        const uint32_t word = ((uint32_t *)bits)[(n + x) >> 5];
+        uint32_t bit;
+#if BITMAP_BIT_ORDER == MSBFirst
+        bit = word >> (0x1f - ((n+x) & 0x1f));
+#else
+        bit = word >> ((n+x) & 0x1f);
+#endif
+        bit &= 1;
+        WRITE(buffer + n, bit ? 0xff000000 : 0);
+    }
+}
+
+/*
+ * Translate a run of 1bpp pixels through indexed->rgba.  Bit (x + n) is
+ * picked out of the 32-bit word that contains it (bit numbering follows
+ * BITMAP_BIT_ORDER) and used as the table index (0 or 1).
+ */
+static FASTCALL void
+fbFetch_g1 (const uint32_t *bits, int x, int width, uint32_t *buffer, pixman_indexed_t * indexed)
+{
+    int n;
+
+    for (n = 0; n < width; n++) {
+        const uint32_t word = ((uint32_t *)bits)[(n+x) >> 5];
+        uint32_t bit;
+#if BITMAP_BIT_ORDER == MSBFirst
+        bit = word >> (0x1f - ((n+x) & 0x1f));
+#else
+        bit = word >> ((n+x) & 0x1f);
+#endif
+        bit &= 1;
+        WRITE(buffer + n, indexed->rgba[bit]);
+    }
+}
+
+/*
+ * Select the scanline fetch routine that matches pict->format.
+ * Returns NULL when the format has no entry in the switch below.
+ */
+static fetchProc fetchProcForPicture (bits_image_t * pict)
+{
+    fetchProc proc = NULL;
+
+    switch(pict->format) {
+        /* 32bpp formats */
+    case PIXMAN_a8r8g8b8: proc = fbFetch_a8r8g8b8; break;
+    case PIXMAN_x8r8g8b8: proc = fbFetch_x8r8g8b8; break;
+    case PIXMAN_a8b8g8r8: proc = fbFetch_a8b8g8r8; break;
+    case PIXMAN_x8b8g8r8: proc = fbFetch_x8b8g8r8; break;
+
+        /* 24bpp formats */
+    case PIXMAN_r8g8b8: proc = fbFetch_r8g8b8; break;
+    case PIXMAN_b8g8r8: proc = fbFetch_b8g8r8; break;
+
+        /* 16bpp formats */
+    case PIXMAN_r5g6b5: proc = fbFetch_r5g6b5; break;
+    case PIXMAN_b5g6r5: proc = fbFetch_b5g6r5; break;
+
+    case PIXMAN_a1r5g5b5: proc = fbFetch_a1r5g5b5; break;
+    case PIXMAN_x1r5g5b5: proc = fbFetch_x1r5g5b5; break;
+    case PIXMAN_a1b5g5r5: proc = fbFetch_a1b5g5r5; break;
+    case PIXMAN_x1b5g5r5: proc = fbFetch_x1b5g5r5; break;
+    case PIXMAN_a4r4g4b4: proc = fbFetch_a4r4g4b4; break;
+    case PIXMAN_x4r4g4b4: proc = fbFetch_x4r4g4b4; break;
+    case PIXMAN_a4b4g4r4: proc = fbFetch_a4b4g4r4; break;
+    case PIXMAN_x4b4g4r4: proc = fbFetch_x4b4g4r4; break;
+
+        /* 8bpp formats */
+    case PIXMAN_a8: proc = fbFetch_a8; break;
+    case PIXMAN_r3g3b2: proc = fbFetch_r3g3b2; break;
+    case PIXMAN_b2g3r3: proc = fbFetch_b2g3r3; break;
+    case PIXMAN_a2r2g2b2: proc = fbFetch_a2r2g2b2; break;
+    case PIXMAN_a2b2g2r2: proc = fbFetch_a2b2g2r2; break;
+    case PIXMAN_c8: proc = fbFetch_c8; break;
+    case PIXMAN_g8: proc = fbFetch_c8; break;
+    case PIXMAN_x4a4: proc = fbFetch_x4a4; break;
+
+        /* 4bpp formats */
+    case PIXMAN_a4: proc = fbFetch_a4; break;
+    case PIXMAN_r1g2b1: proc = fbFetch_r1g2b1; break;
+    case PIXMAN_b1g2r1: proc = fbFetch_b1g2r1; break;
+    case PIXMAN_a1r1g1b1: proc = fbFetch_a1r1g1b1; break;
+    case PIXMAN_a1b1g1r1: proc = fbFetch_a1b1g1r1; break;
+    case PIXMAN_c4: proc = fbFetch_c4; break;
+    case PIXMAN_g4: proc = fbFetch_c4; break;
+
+        /* 1bpp formats */
+    case PIXMAN_a1: proc = fbFetch_a1; break;
+    case PIXMAN_g1: proc = fbFetch_g1; break;
+    }
+
+    return proc;
+}
+
+/*
+ * Pixel-wise fetching
+ *
+ * A fetchPixelProc reads the single pixel at index 'offset' of the line
+ * starting at 'bits' and returns it as one 32-bit value.  'indexed' is
+ * only consulted by the palette-based fetchers.
+ */
+
+typedef FASTCALL uint32_t (*fetchPixelProc)(const uint32_t *bits, int offset, pixman_indexed_t * indexed);
+
+/* Return the 32-bit word at index 'offset' unchanged.  'indexed' is not used. */
+static FASTCALL uint32_t
+fbFetchPixel_a8r8g8b8 (const uint32_t *bits, int offset, pixman_indexed_t * indexed)
+{
+    uint32_t *src = (uint32_t *)bits + offset;
+
+    return READ(src);
+}
+
+/* Return the 32-bit word at index 'offset' with the top byte forced to 0xff. */
+static FASTCALL uint32_t
+fbFetchPixel_x8r8g8b8 (const uint32_t *bits, int offset, pixman_indexed_t * indexed)
+{
+    uint32_t *src = (uint32_t *)bits + offset;
+    const uint32_t rgb = READ(src);
+
+    return rgb | 0xff000000;
+}
+
+/*
+ * Read the 32-bit word at index 'offset' and swap its byte 0 and byte 2,
+ * keeping the top byte and byte 1 in place.  'indexed' is not used.
+ */
+static FASTCALL uint32_t
+fbFetchPixel_a8b8g8r8 (const uint32_t *bits, int offset, pixman_indexed_t * indexed)
+{
+    const uint32_t abgr = READ((uint32_t *)bits + offset);
+    const uint32_t kept = abgr & 0xff00ff00;
+    const uint32_t low  = (abgr >> 16) & 0xff;
+    const uint32_t high = (abgr & 0xff) << 16;
+
+    return kept | low | high;
+}
+
+/*
+ * Read the 32-bit word at index 'offset', swap its byte 0 and byte 2,
+ * keep byte 1 and force the top byte to 0xff.  'indexed' is not used.
+ */
+static FASTCALL uint32_t
+fbFetchPixel_x8b8g8r8 (const uint32_t *bits, int offset, pixman_indexed_t * indexed)
+{
+    const uint32_t xbgr = READ((uint32_t *)bits + offset);
+    const uint32_t mid  = xbgr & 0x0000ff00;
+    const uint32_t low  = (xbgr >> 16) & 0xff;
+    const uint32_t high = (xbgr & 0xff) << 16;
+
+    return 0xff000000 | low | mid | high;
+}
+
+/*
+ * Assemble the 3-byte pixel at index 'offset' into a 32-bit value with
+ * the top byte forced to 0xff.  Which end of the triple lands in bits
+ * 16-23 depends on IMAGE_BYTE_ORDER.  'indexed' is not used.
+ */
+static FASTCALL uint32_t
+fbFetchPixel_r8g8b8 (const uint32_t *bits, int offset, pixman_indexed_t * indexed)
+{
+    uint8_t *src = ((uint8_t *) bits) + (offset*3);
+    uint32_t red, green, blue;
+
+#if IMAGE_BYTE_ORDER == MSBFirst
+    red  = READ(src + 0);
+    blue = READ(src + 2);
+#else
+    red  = READ(src + 2);
+    blue = READ(src + 0);
+#endif
+    green = READ(src + 1);
+
+    return 0xff000000 | (red << 16) | (green << 8) | blue;
+}
+
+/*
+ * Assemble the 3-byte pixel at index 'offset' into a 32-bit value with
+ * the top byte forced to 0xff, using the opposite byte assignment from
+ * fbFetchPixel_r8g8b8 for each IMAGE_BYTE_ORDER.  'indexed' is not used.
+ */
+static FASTCALL uint32_t
+fbFetchPixel_b8g8r8 (const uint32_t *bits, int offset, pixman_indexed_t * indexed)
+{
+    uint8_t *src = ((uint8_t *) bits) + (offset*3);
+    uint32_t red, green, blue;
+
+#if IMAGE_BYTE_ORDER == MSBFirst
+    red  = READ(src + 2);
+    blue = READ(src + 0);
+#else
+    red  = READ(src + 0);
+    blue = READ(src + 2);
+#endif
+    green = READ(src + 1);
+
+    return 0xff000000 | (red << 16) | (green << 8) | blue;
+}
+
+/*
+ * Expand the 16-bit r5g6b5 pixel at index 'offset' to 32 bits.  Each
+ * channel is widened to 8 bits by repeating its top bits in the low
+ * bits; the top byte is forced to 0xff.  'indexed' is not used.
+ */
+static FASTCALL uint32_t
+fbFetchPixel_r5g6b5 (const uint32_t *bits, int offset, pixman_indexed_t * indexed)
+{
+    const uint32_t p = READ((uint16_t *) bits + offset);
+    const uint32_t red   = ((p & 0xf800) | ((p & 0xe000) >> 5)) << 8;
+    const uint32_t green = ((p & 0x07e0) | ((p & 0x0600) >> 6)) << 5;
+    const uint32_t blue  = ((p & 0x001c) | ((p & 0x001f) << 5)) >> 2;
+
+    return 0xff000000 | red | green | blue;
+}
+
+/*
+ * Expand the 16-bit b5g6r5 pixel at index 'offset' to 32 bits.  Each
+ * channel is widened to 8 bits by repeating its top bits in the low
+ * bits; the top byte is forced to 0xff.  'indexed' is not used.
+ */
+static FASTCALL uint32_t
+fbFetchPixel_b5g6r5 (const uint32_t *bits, int offset, pixman_indexed_t * indexed)
+{
+    const uint32_t p = READ((uint16_t *) bits + offset);
+    const uint32_t blue  = ((p & 0xf800) | ((p & 0xe000) >> 5)) >> 8;
+    const uint32_t green = ((p & 0x07e0) | ((p & 0x0600) >> 6)) << 5;
+    const uint32_t red   = ((p & 0x001c) | ((p & 0x001f) << 5)) << 14;
+
+    return 0xff000000 | red | green | blue;
+}
+
+/*
+ * Expand the 16-bit a1r5g5b5 pixel at index 'offset' to 32 bits.  The
+ * single alpha bit becomes 0x00 or 0xff; the 5-bit channels are widened
+ * by repeating their top bits.  'indexed' is not used.
+ */
+static FASTCALL uint32_t
+fbFetchPixel_a1r5g5b5 (const uint32_t *bits, int offset, pixman_indexed_t * indexed)
+{
+    const uint32_t p = READ((uint16_t *) bits + offset);
+    const uint32_t alpha = (uint32_t) ((uint8_t) (0 - ((p & 0x8000) >> 15))) << 24;
+    const uint32_t red   = ((p & 0x7c00) | ((p & 0x7000) >> 5)) << 9;
+    const uint32_t green = ((p & 0x03e0) | ((p & 0x0380) >> 5)) << 6;
+    const uint32_t blue  = ((p & 0x001c) | ((p & 0x001f) << 5)) >> 2;
+
+    return alpha | red | green | blue;
+}
+
+/*
+ * Expand the 16-bit x1r5g5b5 pixel at index 'offset' to 32 bits.  The
+ * 5-bit channels are widened by repeating their top bits; the top byte
+ * is forced to 0xff.  'indexed' is not used.
+ */
+static FASTCALL uint32_t
+fbFetchPixel_x1r5g5b5 (const uint32_t *bits, int offset, pixman_indexed_t * indexed)
+{
+    const uint32_t p = READ((uint16_t *) bits + offset);
+    const uint32_t red   = ((p & 0x7c00) | ((p & 0x7000) >> 5)) << 9;
+    const uint32_t green = ((p & 0x03e0) | ((p & 0x0380) >> 5)) << 6;
+    const uint32_t blue  = ((p & 0x001c) | ((p & 0x001f) << 5)) >> 2;
+
+    return 0xff000000 | red | green | blue;
+}
+
+/*
+ * Expand the 16-bit a1b5g5r5 pixel at index 'offset' to 32 bits.  The
+ * single alpha bit becomes 0x00 or 0xff; the 5-bit channels are widened
+ * by repeating their top bits.  'indexed' is not used.
+ */
+static FASTCALL uint32_t
+fbFetchPixel_a1b5g5r5 (const uint32_t *bits, int offset, pixman_indexed_t * indexed)
+{
+    const uint32_t p = READ((uint16_t *) bits + offset);
+    const uint32_t alpha = (uint32_t) ((uint8_t) (0 - ((p & 0x8000) >> 15))) << 24;
+    const uint32_t blue  = ((p & 0x7c00) | ((p & 0x7000) >> 5)) >> 7;
+    const uint32_t green = ((p & 0x03e0) | ((p & 0x0380) >> 5)) << 6;
+    const uint32_t red   = ((p & 0x001c) | ((p & 0x001f) << 5)) << 14;
+
+    return alpha | red | green | blue;
+}
+
+/*
+ * Expand the 16-bit x1b5g5r5 pixel at index 'offset' to 32 bits.  The
+ * 5-bit channels are widened by repeating their top bits; the top byte
+ * is forced to 0xff.  'indexed' is not used.
+ */
+static FASTCALL uint32_t
+fbFetchPixel_x1b5g5r5 (const uint32_t *bits, int offset, pixman_indexed_t * indexed)
+{
+    const uint32_t p = READ((uint16_t *) bits + offset);
+    const uint32_t blue  = ((p & 0x7c00) | ((p & 0x7000) >> 5)) >> 7;
+    const uint32_t green = ((p & 0x03e0) | ((p & 0x0380) >> 5)) << 6;
+    const uint32_t red   = ((p & 0x001c) | ((p & 0x001f) << 5)) << 14;
+
+    return 0xff000000 | red | green | blue;
+}
+
+/*
+ * Expand the 16-bit a4r4g4b4 pixel at index 'offset' to 32 bits by
+ * duplicating each 4-bit channel into both nibbles of its byte.
+ * 'indexed' is not used.
+ */
+static FASTCALL uint32_t
+fbFetchPixel_a4r4g4b4 (const uint32_t *bits, int offset, pixman_indexed_t * indexed)
+{
+    const uint32_t p = READ((uint16_t *) bits + offset);
+    const uint32_t alpha = ((p & 0xf000) | ((p & 0xf000) >> 4)) << 16;
+    const uint32_t red   = ((p & 0x0f00) | ((p & 0x0f00) >> 4)) << 12;
+    const uint32_t green = ((p & 0x00f0) | ((p & 0x00f0) >> 4)) << 8;
+    const uint32_t blue  = ((p & 0x000f) | ((p & 0x000f) << 4));
+
+    return alpha | red | green | blue;
+}
+
+/*
+ * Expand the 16-bit x4r4g4b4 pixel at index 'offset' to 32 bits by
+ * duplicating each 4-bit colour channel into both nibbles of its byte;
+ * the top byte is forced to 0xff.  'indexed' is not used.
+ */
+static FASTCALL uint32_t
+fbFetchPixel_x4r4g4b4 (const uint32_t *bits, int offset, pixman_indexed_t * indexed)
+{
+    const uint32_t p = READ((uint16_t *) bits + offset);
+    const uint32_t red   = ((p & 0x0f00) | ((p & 0x0f00) >> 4)) << 12;
+    const uint32_t green = ((p & 0x00f0) | ((p & 0x00f0) >> 4)) << 8;
+    const uint32_t blue  = ((p & 0x000f) | ((p & 0x000f) << 4));
+
+    return 0xff000000 | red | green | blue;
+}
+
+/*
+ * Expand the 16-bit a4b4g4r4 pixel at index 'offset' to 32 bits by
+ * duplicating each 4-bit channel into both nibbles of its byte.
+ * 'indexed' is not used.
+ */
+static FASTCALL uint32_t
+fbFetchPixel_a4b4g4r4 (const uint32_t *bits, int offset, pixman_indexed_t * indexed)
+{
+    const uint32_t p = READ((uint16_t *) bits + offset);
+    const uint32_t alpha = ((p & 0xf000) | ((p & 0xf000) >> 4)) << 16;
+    const uint32_t blue  = ((p & 0x0f00) | ((p & 0x0f00) >> 4)) >> 4;
+    const uint32_t green = ((p & 0x00f0) | ((p & 0x00f0) >> 4)) << 8;
+    const uint32_t red   = ((p & 0x000f) | ((p & 0x000f) << 4)) << 16;
+
+    return alpha | red | green | blue;
+}
+
+/*
+ * Expand the 16-bit x4b4g4r4 pixel at index 'offset' to 32 bits by
+ * duplicating each 4-bit colour channel into both nibbles of its byte;
+ * the top byte is forced to 0xff.  'indexed' is not used.
+ */
+static FASTCALL uint32_t
+fbFetchPixel_x4b4g4r4 (const uint32_t *bits, int offset, pixman_indexed_t * indexed)
+{
+    const uint32_t p = READ((uint16_t *) bits + offset);
+    const uint32_t blue  = ((p & 0x0f00) | ((p & 0x0f00) >> 4)) >> 4;
+    const uint32_t green = ((p & 0x00f0) | ((p & 0x00f0) >> 4)) << 8;
+    const uint32_t red   = ((p & 0x000f) | ((p & 0x000f) << 4)) << 16;
+
+    return 0xff000000 | red | green | blue;
+}
+
+/* Return the byte at index 'offset' moved into the top byte of the result. */
+static FASTCALL uint32_t
+fbFetchPixel_a8 (const uint32_t *bits, int offset, pixman_indexed_t * indexed)
+{
+    const uint32_t alpha = READ((uint8_t *) bits + offset);
+
+    return alpha << 24;
+}
+
+/*
+ * Expand the 8-bit r3g3b2 pixel at index 'offset' to 32 bits.  Each
+ * channel is widened to 8 bits by repeating its bits; the top byte is
+ * forced to 0xff.  'indexed' is not used.
+ */
+static FASTCALL uint32_t
+fbFetchPixel_r3g3b2 (const uint32_t *bits, int offset, pixman_indexed_t * indexed)
+{
+    const uint32_t p = READ((uint8_t *) bits + offset);
+    const uint32_t red = ((p & 0xe0) |
+                          ((p & 0xe0) >> 3) |
+                          ((p & 0xc0) >> 6)) << 16;
+    const uint32_t green = ((p & 0x1c) |
+                            ((p & 0x18) >> 3) |
+                            ((p & 0x1c) << 3)) << 8;
+    const uint32_t blue = ((p & 0x03) |
+                           ((p & 0x03) << 2) |
+                           ((p & 0x03) << 4) |
+                           ((p & 0x03) << 6));
+
+    return 0xff000000 | red | green | blue;
+}
+
+/*
+ * Expand the 8-bit b2g3r3 pixel at index 'offset' to 32 bits.  Blue is
+ * bits 6-7, green bits 3-5 and red bits 0-2 of the byte; each channel is
+ * widened to 8 bits by repeating its bits from the top down, so a fully
+ * set channel yields 0xff.  The top byte is forced to 0xff.
+ * 'indexed' is not used.
+ */
+static FASTCALL uint32_t
+fbFetchPixel_b2g3r3 (const uint32_t *bits, int offset, pixman_indexed_t * indexed)
+{
+    uint32_t   pixel = READ((uint8_t *) bits + offset);
+    uint32_t  r,g,b;
+
+    b = (((pixel & 0xc0)     ) |
+        ((pixel & 0xc0) >> 2) |
+        ((pixel & 0xc0) >> 4) |
+        ((pixel & 0xc0) >> 6));
+    /* green: 3 bits at 3-5 -> g2g1g0 g2g1g0 g2g1 in bits 7..0 */
+    g = (((pixel & 0x38) << 2) |
+        ((pixel & 0x38) >> 1) |
+        ((pixel & 0x30) >> 4)) << 8;
+    /* red: 3 bits at 0-2 -> r2r1r0 r2r1r0 r2r1 in bits 7..0; every term
+     * stays inside the byte so nothing leaks into the alpha field */
+    r = (((pixel & 0x07) << 5) |
+        ((pixel & 0x07) << 2) |
+        ((pixel & 0x06) >> 1)) << 16;
+    return (0xff000000 | r | g | b);
+}
+
+/*
+ * Expand the 8-bit a2r2g2b2 pixel at index 'offset' to 32 bits; each
+ * 2-bit channel is widened to 8 bits by multiplying with 0x55.
+ * 'indexed' is not used.
+ */
+static FASTCALL uint32_t
+fbFetchPixel_a2r2g2b2 (const uint32_t *bits, int offset, pixman_indexed_t * indexed)
+{
+    const uint32_t p = READ((uint8_t *) bits + offset);
+    const uint32_t alpha = ((p & 0xc0) * 0x55) << 18;
+    const uint32_t red   = ((p & 0x30) * 0x55) << 12;
+    const uint32_t green = ((p & 0x0c) * 0x55) << 6;
+    const uint32_t blue  = ((p & 0x03) * 0x55);
+
+    return alpha | red | green | blue;
+}
+
+/*
+ * Expand the 8-bit a2b2g2r2 pixel at index 'offset' to a 32-bit value
+ * packed a:r:g:b; each 2-bit channel is widened to 8 bits by multiplying
+ * with 0x55.  'indexed' is not used.
+ */
+static FASTCALL uint32_t
+fbFetchPixel_a2b2g2r2 (const uint32_t *bits, int offset, pixman_indexed_t * indexed)
+{
+    uint32_t   pixel = READ((uint8_t *) bits + offset);
+    uint32_t   a,r,g,b;
+
+    a = ((pixel & 0xc0) * 0x55) << 18;
+    /* The blue field is at bits 4-5, so the widened value occupies bits
+     * 4-11; shift down by 4 (not 6) to land it in bits 0-7. */
+    b = ((pixel & 0x30) * 0x55) >> 4;
+    g = ((pixel & 0x0c) * 0x55) << 6;
+    r = ((pixel & 0x03) * 0x55) << 16;
+    return a|r|g|b;
+}
+
+/* Use the byte at index 'offset' as an index into indexed->rgba. */
+static FASTCALL uint32_t
+fbFetchPixel_c8 (const uint32_t *bits, int offset, pixman_indexed_t * indexed)
+{
+    const uint32_t entry = READ((uint8_t *) bits + offset);
+
+    return indexed->rgba[entry];
+}
+
+/*
+ * Take the low nibble of the byte at index 'offset', duplicate it into
+ * both halves of a byte and return that byte in bits 24-31.
+ * 'indexed' is not used.
+ */
+static FASTCALL uint32_t
+fbFetchPixel_x4a4 (const uint32_t *bits, int offset, pixman_indexed_t * indexed)
+{
+    const uint32_t p = READ((uint8_t *) bits + offset);
+    const uint32_t nibble = p & 0xf;
+
+    return (nibble | (nibble << 4)) << 24;
+}
+
+/*
+ * Fetch8 is kept exactly as it was for any remaining users.
+ *
+ * Fetch4(l,o) reads 4bpp pixel number 'o' of line 'l'.  Two pixels share
+ * a byte, so the byte index is o >> 1 and the low bit of 'o' selects the
+ * nibble; which nibble comes first depends on IMAGE_BYTE_ORDER.  The
+ * #undef lets this definition replace any earlier Fetch4 cleanly.
+ */
+#define Fetch8(l,o)    (((uint8_t *) (l))[(o) >> 2])
+#undef Fetch4
+#if IMAGE_BYTE_ORDER == MSBFirst
+#define Fetch4(l,o)    ((o) & 1 ? ((uint8_t *) (l))[(o) >> 1] & 0xf : ((uint8_t *) (l))[(o) >> 1] >> 4)
+#else
+#define Fetch4(l,o)    ((o) & 1 ? ((uint8_t *) (l))[(o) >> 1] >> 4 : ((uint8_t *) (l))[(o) >> 1] & 0xf)
+#endif
+
+/*
+ * Read the 4-bit pixel at 'offset' with Fetch4, duplicate it into both
+ * halves of a byte and return that byte in bits 24-31.
+ * 'indexed' is not used.
+ */
+static FASTCALL uint32_t
+fbFetchPixel_a4 (const uint32_t *bits, int offset, pixman_indexed_t * indexed)
+{
+    const uint32_t nibble = Fetch4(bits, offset);
+
+    return (nibble | (nibble << 4)) << 24;
+}
+
+/*
+ * Expand the 4-bit r1g2b1 pixel at 'offset' (read with Fetch4) to 32
+ * bits.  One-bit channels become 0x00/0xff, green is widened with 0x55
+ * and the top byte is forced to 0xff.  'indexed' is not used.
+ */
+static FASTCALL uint32_t
+fbFetchPixel_r1g2b1 (const uint32_t *bits, int offset, pixman_indexed_t * indexed)
+{
+    const uint32_t nibble = Fetch4(bits, offset);
+    const uint32_t red   = ((nibble & 0x8) * 0xff) << 13;
+    const uint32_t green = ((nibble & 0x6) * 0x55) << 7;
+    const uint32_t blue  = ((nibble & 0x1) * 0xff);
+
+    return 0xff000000 | red | green | blue;
+}
+
+/*
+ * Expand the 4-bit b1g2r1 pixel at 'offset' (read with Fetch4) to 32
+ * bits.  One-bit channels become 0x00/0xff, green is widened with 0x55
+ * and the top byte is forced to 0xff.  'indexed' is not used.
+ */
+static FASTCALL uint32_t
+fbFetchPixel_b1g2r1 (const uint32_t *bits, int offset, pixman_indexed_t * indexed)
+{
+    const uint32_t nibble = Fetch4(bits, offset);
+    const uint32_t blue  = ((nibble & 0x8) * 0xff) >> 3;
+    const uint32_t green = ((nibble & 0x6) * 0x55) << 7;
+    const uint32_t red   = ((nibble & 0x1) * 0xff) << 16;
+
+    return 0xff000000 | red | green | blue;
+}
+
+/*
+ * Expand the 4-bit a1r1g1b1 pixel at 'offset' (read with Fetch4) to 32
+ * bits; every one-bit channel becomes 0x00 or 0xff in its byte.
+ * 'indexed' is not used.
+ */
+static FASTCALL uint32_t
+fbFetchPixel_a1r1g1b1 (const uint32_t *bits, int offset, pixman_indexed_t * indexed)
+{
+    const uint32_t nibble = Fetch4(bits, offset);
+    const uint32_t alpha = ((nibble & 0x8) * 0xff) << 21;
+    const uint32_t red   = ((nibble & 0x4) * 0xff) << 14;
+    const uint32_t green = ((nibble & 0x2) * 0xff) << 7;
+    const uint32_t blue  = ((nibble & 0x1) * 0xff);
+
+    return alpha | red | green | blue;
+}
+
+/*
+ * Expand the 4-bit a1b1g1r1 pixel at 'offset' (read with Fetch4) to a
+ * 32-bit value packed a:r:g:b; every one-bit channel becomes 0x00 or
+ * 0xff in its byte.  'indexed' is not used.
+ */
+static FASTCALL uint32_t
+fbFetchPixel_a1b1g1r1 (const uint32_t *bits, int offset, pixman_indexed_t * indexed)
+{
+    uint32_t  pixel = Fetch4(bits, offset);
+    uint32_t  a,r,g,b;
+
+    a = ((pixel & 0x8) * 0xff) << 21;
+    /* Blue is bit 2: (4 * 0xff) must come down by 2 to give 0xff in the
+     * low byte; a shift of 3 would leave only 0x7f. */
+    b = ((pixel & 0x4) * 0xff) >> 2;
+    g = ((pixel & 0x2) * 0xff) << 7;
+    r = ((pixel & 0x1) * 0xff) << 16;
+    return a|r|g|b;
+}
+
+/* Use the 4-bit pixel at 'offset' (read with Fetch4) as an index into indexed->rgba. */
+static FASTCALL uint32_t
+fbFetchPixel_c4 (const uint32_t *bits, int offset, pixman_indexed_t * indexed)
+{
+    const uint32_t entry = Fetch4(bits, offset);
+
+    return indexed->rgba[entry];
+}
+
+
+/*
+ * Pick bit 'offset' out of the 32-bit word that contains it (bit
+ * numbering follows BITMAP_BIT_ORDER) and return 0xff000000 when it is
+ * set, 0 otherwise.  'indexed' is not used.
+ */
+static FASTCALL uint32_t
+fbFetchPixel_a1 (const uint32_t *bits, int offset, pixman_indexed_t * indexed)
+{
+    const uint32_t word = ((uint32_t *)bits)[offset >> 5];
+    uint32_t bit;
+
+#if BITMAP_BIT_ORDER == MSBFirst
+    bit = word >> (0x1f - (offset & 0x1f));
+#else
+    bit = word >> (offset & 0x1f);
+#endif
+    bit &= 1;
+
+    return bit ? 0xff000000 : 0;
+}
+
+/*
+ * Pick bit 'offset' out of the 32-bit word that contains it (bit
+ * numbering follows BITMAP_BIT_ORDER) and use it as the index (0 or 1)
+ * into indexed->rgba.
+ */
+static FASTCALL uint32_t
+fbFetchPixel_g1 (const uint32_t *bits, int offset, pixman_indexed_t * indexed)
+{
+    const uint32_t word = ((uint32_t *)bits)[offset >> 5];
+    uint32_t bit;
+
+#if BITMAP_BIT_ORDER == MSBFirst
+    bit = word >> (0x1f - (offset & 0x1f));
+#else
+    bit = word >> (offset & 0x1f);
+#endif
+    bit &= 1;
+
+    return indexed->rgba[bit];
+}
+
+/*
+ * Select the single-pixel fetch routine that matches pict->format.
+ * Returns NULL when the format has no entry in the switch below.
+ */
+static fetchPixelProc fetchPixelProcForPicture (bits_image_t * pict)
+{
+    fetchPixelProc proc = NULL;
+
+    switch(pict->format) {
+        /* 32bpp formats */
+    case PIXMAN_a8r8g8b8: proc = fbFetchPixel_a8r8g8b8; break;
+    case PIXMAN_x8r8g8b8: proc = fbFetchPixel_x8r8g8b8; break;
+    case PIXMAN_a8b8g8r8: proc = fbFetchPixel_a8b8g8r8; break;
+    case PIXMAN_x8b8g8r8: proc = fbFetchPixel_x8b8g8r8; break;
+
+        /* 24bpp formats */
+    case PIXMAN_r8g8b8: proc = fbFetchPixel_r8g8b8; break;
+    case PIXMAN_b8g8r8: proc = fbFetchPixel_b8g8r8; break;
+
+        /* 16bpp formats */
+    case PIXMAN_r5g6b5: proc = fbFetchPixel_r5g6b5; break;
+    case PIXMAN_b5g6r5: proc = fbFetchPixel_b5g6r5; break;
+
+    case PIXMAN_a1r5g5b5: proc = fbFetchPixel_a1r5g5b5; break;
+    case PIXMAN_x1r5g5b5: proc = fbFetchPixel_x1r5g5b5; break;
+    case PIXMAN_a1b5g5r5: proc = fbFetchPixel_a1b5g5r5; break;
+    case PIXMAN_x1b5g5r5: proc = fbFetchPixel_x1b5g5r5; break;
+    case PIXMAN_a4r4g4b4: proc = fbFetchPixel_a4r4g4b4; break;
+    case PIXMAN_x4r4g4b4: proc = fbFetchPixel_x4r4g4b4; break;
+    case PIXMAN_a4b4g4r4: proc = fbFetchPixel_a4b4g4r4; break;
+    case PIXMAN_x4b4g4r4: proc = fbFetchPixel_x4b4g4r4; break;
+
+        /* 8bpp formats */
+    case PIXMAN_a8: proc = fbFetchPixel_a8; break;
+    case PIXMAN_r3g3b2: proc = fbFetchPixel_r3g3b2; break;
+    case PIXMAN_b2g3r3: proc = fbFetchPixel_b2g3r3; break;
+    case PIXMAN_a2r2g2b2: proc = fbFetchPixel_a2r2g2b2; break;
+    case PIXMAN_a2b2g2r2: proc = fbFetchPixel_a2b2g2r2; break;
+    case PIXMAN_c8: proc = fbFetchPixel_c8; break;
+    case PIXMAN_g8: proc = fbFetchPixel_c8; break;
+    case PIXMAN_x4a4: proc = fbFetchPixel_x4a4; break;
+
+        /* 4bpp formats */
+    case PIXMAN_a4: proc = fbFetchPixel_a4; break;
+    case PIXMAN_r1g2b1: proc = fbFetchPixel_r1g2b1; break;
+    case PIXMAN_b1g2r1: proc = fbFetchPixel_b1g2r1; break;
+    case PIXMAN_a1r1g1b1: proc = fbFetchPixel_a1r1g1b1; break;
+    case PIXMAN_a1b1g1r1: proc = fbFetchPixel_a1b1g1r1; break;
+    case PIXMAN_c4: proc = fbFetchPixel_c4; break;
+    case PIXMAN_g4: proc = fbFetchPixel_c4; break;
+
+        /* 1bpp formats */
+    case PIXMAN_a1: proc = fbFetchPixel_a1; break;
+    case PIXMAN_g1: proc = fbFetchPixel_g1; break;
+    }
+
+    return proc;
+}
+
+
+
+/*
+ * All the store functions
+ *
+ * A storeProc converts 'width' 32-bit values from 'values' into the
+ * destination format and writes them to the line at 'bits', starting at
+ * pixel x.  'indexed' is only consulted by the palette-based stores.
+ */
+
+typedef FASTCALL void (*storeProc) (uint32_t *bits, const uint32_t *values, int x, int width, pixman_indexed_t * indexed);
+
+/*
+ * Splita(v) / Split(v) expand to a declaration of the locals a, r, g, b
+ * (or just r, g, b) holding the four (three) bytes of v, top byte first.
+ * They must appear where a declaration is allowed, and v is evaluated
+ * once per declared variable.
+ */
+#define Splita(v)      uint32_t        a = ((v) >> 24), r = ((v) >> 16) & 0xff, g = ((v) >> 8) & 0xff, b = (v) & 0xff
+#define Split(v)       uint32_t        r = ((v) >> 16) & 0xff, g = ((v) >> 8) & 0xff, b = (v) & 0xff
+
+/*
+ * Copy 'width' 32-bit values unchanged to the destination line, starting
+ * at word x, via MEMCPY_WRAPPED.  'indexed' is not used.
+ */
+static FASTCALL void
+fbStore_a8r8g8b8 (uint32_t *bits, const uint32_t *values, int x, int width, pixman_indexed_t * indexed)
+{
+    uint32_t *dst = ((uint32_t *)bits) + x;
+
+    MEMCPY_WRAPPED(dst, values, width*sizeof(uint32_t));
+}
+
+/*
+ * Store 'width' values as 32-bit words starting at word x, with the top
+ * byte of each cleared.  'indexed' is not used.
+ */
+static FASTCALL void
+fbStore_x8r8g8b8 (uint32_t *bits, const uint32_t *values, int x, int width, pixman_indexed_t * indexed)
+{
+    uint32_t *dst = (uint32_t *)bits + x;
+    int n = 0;
+
+    while (n < width) {
+        WRITE(dst + n, READ(values + n) & 0xffffff);
+        n++;
+    }
+}
+
+/*
+ * Store 'width' values as 32-bit words starting at word x, swapping
+ * byte 0 and byte 2 of each and keeping the top byte and byte 1.
+ * 'indexed' is not used.
+ */
+static FASTCALL void
+fbStore_a8b8g8r8 (uint32_t *bits, const uint32_t *values, int x, int width, pixman_indexed_t * indexed)
+{
+    uint32_t *dst = (uint32_t *)bits + x;
+    int n = 0;
+
+    while (n < width) {
+        const uint32_t argb = READ(values + n);
+        const uint32_t kept = argb & 0xff00ff00;
+        const uint32_t low  = (argb >> 16) & 0xff;
+        const uint32_t high = (argb & 0xff) << 16;
+
+        WRITE(dst + n, kept | low | high);
+        n++;
+    }
+}
+
+/*
+ * Store 'width' values as 32-bit words starting at word x, swapping
+ * byte 0 and byte 2 of each, keeping byte 1 and clearing the top byte.
+ * 'indexed' is not used.
+ */
+static FASTCALL void
+fbStore_x8b8g8r8 (uint32_t *bits, const uint32_t *values, int x, int width, pixman_indexed_t * indexed)
+{
+    uint32_t *dst = (uint32_t *)bits + x;
+    int n = 0;
+
+    while (n < width) {
+        const uint32_t argb = READ(values + n);
+        const uint32_t mid  = argb & 0x0000ff00;
+        const uint32_t low  = (argb >> 16) & 0xff;
+        const uint32_t high = (argb & 0xff) << 16;
+
+        WRITE(dst + n, mid | low | high);
+        n++;
+    }
+}
+
+/*
+ * Store 'width' values as 3-byte pixels starting at byte 3*x, handing
+ * each value to the Store24 macro.  'indexed' is not used.
+ */
+static FASTCALL void
+fbStore_r8g8b8 (uint32_t *bits, const uint32_t *values, int x, int width, pixman_indexed_t * indexed)
+{
+    uint8_t *dst = ((uint8_t *) bits) + 3*x;
+    int n;
+
+    for (n = 0; n < width; n++, dst += 3) {
+        Store24(dst, READ(values + n));
+    }
+}
+
+/*
+ * Store 'width' values as 3-byte pixels starting at byte 3*x.  The
+ * Red/Green/Blue macros extract the channels; the order in which they
+ * are written depends on IMAGE_BYTE_ORDER.  'indexed' is not used.
+ */
+static FASTCALL void
+fbStore_b8g8r8 (uint32_t *bits, const uint32_t *values, int x, int width, pixman_indexed_t * indexed)
+{
+    uint8_t *dst = ((uint8_t *) bits) + 3*x;
+    int n;
+
+    for (n = 0; n < width; n++, dst += 3) {
+        const uint32_t argb = READ(values + n);
+#if IMAGE_BYTE_ORDER == MSBFirst
+        WRITE(dst + 0, Blue(argb));
+        WRITE(dst + 1, Green(argb));
+        WRITE(dst + 2, Red(argb));
+#else
+        WRITE(dst + 0, Red(argb));
+        WRITE(dst + 1, Green(argb));
+        WRITE(dst + 2, Blue(argb));
+#endif
+    }
+}
+
+/*
+ * Pack 'width' values into 16-bit r5g6b5 pixels starting at pixel x,
+ * keeping the top 5/6/5 bits of red/green/blue.  'indexed' is not used.
+ */
+static FASTCALL void
+fbStore_r5g6b5 (uint32_t *bits, const uint32_t *values, int x, int width, pixman_indexed_t * indexed)
+{
+    uint16_t *dst = ((uint16_t *) bits) + x;
+    int n;
+
+    for (n = 0; n < width; n++) {
+        const uint32_t argb = READ(values + n);
+        const uint32_t blue  = (argb >> 3) & 0x001f;
+        const uint32_t green = (argb >> 5) & 0x07e0;
+        const uint32_t red   = (argb >> 8) & 0xf800;
+
+        WRITE(dst + n, blue | green | red);
+    }
+}
+
+/*
+ * Pack 'width' values into 16-bit b5g6r5 pixels starting at pixel x,
+ * keeping the top 5/6/5 bits of blue/green/red.  'indexed' is not used.
+ */
+static FASTCALL void
+fbStore_b5g6r5 (uint32_t *bits, const uint32_t *values, int x, int width, pixman_indexed_t * indexed)
+{
+    uint16_t *dst = ((uint16_t *) bits) + x;
+    int n;
+
+    for (n = 0; n < width; n++) {
+        const uint32_t argb = READ(values + n);
+        Split(argb);
+
+        WRITE(dst + n, ((b << 8) & 0xf800) |
+                       ((g << 3) & 0x07e0) |
+                       ((r >> 3)         ));
+    }
+}
+
+/*
+ * Pack 'width' values into 16-bit a1r5g5b5 pixels starting at pixel x,
+ * keeping the top bit of alpha and the top 5 bits of each colour.
+ * 'indexed' is not used.
+ */
+static FASTCALL void
+fbStore_a1r5g5b5 (uint32_t *bits, const uint32_t *values, int x, int width, pixman_indexed_t * indexed)
+{
+    uint16_t *dst = ((uint16_t *) bits) + x;
+    int n;
+
+    for (n = 0; n < width; n++) {
+        const uint32_t argb = READ(values + n);
+        Splita(argb);
+
+        WRITE(dst + n, ((a << 8) & 0x8000) |
+                       ((r << 7) & 0x7c00) |
+                       ((g << 2) & 0x03e0) |
+                       ((b >> 3)         ));
+    }
+}
+
+/*
+ * Pack 'width' values into 16-bit x1r5g5b5 pixels starting at pixel x,
+ * keeping the top 5 bits of each colour; bit 15 is left clear.
+ * 'indexed' is not used.
+ */
+static FASTCALL void
+fbStore_x1r5g5b5 (uint32_t *bits, const uint32_t *values, int x, int width, pixman_indexed_t * indexed)
+{
+    uint16_t *dst = ((uint16_t *) bits) + x;
+    int n;
+
+    for (n = 0; n < width; n++) {
+        const uint32_t argb = READ(values + n);
+        Split(argb);
+
+        WRITE(dst + n, ((r << 7) & 0x7c00) |
+                       ((g << 2) & 0x03e0) |
+                       ((b >> 3)         ));
+    }
+}
+
+/*
+ * Pack 'width' values into 16-bit a1b5g5r5 pixels starting at pixel x,
+ * keeping the top bit of alpha and the top 5 bits of each colour.
+ * 'indexed' is not used.
+ */
+static FASTCALL void
+fbStore_a1b5g5r5 (uint32_t *bits, const uint32_t *values, int x, int width, pixman_indexed_t * indexed)
+{
+    uint16_t *dst = ((uint16_t *) bits) + x;
+    int n;
+
+    for (n = 0; n < width; n++) {
+        const uint32_t argb = READ(values + n);
+        Splita(argb);
+
+        WRITE(dst + n, ((a << 8) & 0x8000) |
+                       ((b << 7) & 0x7c00) |
+                       ((g << 2) & 0x03e0) |
+                       ((r >> 3)         ));
+    }
+}
+
+/*
+ * Pack 'width' values into 16-bit x1b5g5r5 pixels starting at pixel x,
+ * keeping the top 5 bits of each colour; bit 15 is left clear.
+ * 'indexed' is not used.
+ */
+static FASTCALL void
+fbStore_x1b5g5r5 (uint32_t *bits, const uint32_t *values, int x, int width, pixman_indexed_t * indexed)
+{
+    uint16_t *dst = ((uint16_t *) bits) + x;
+    int n;
+
+    for (n = 0; n < width; n++) {
+        const uint32_t argb = READ(values + n);
+        Split(argb);
+
+        WRITE(dst + n, ((b << 7) & 0x7c00) |
+                       ((g << 2) & 0x03e0) |
+                       ((r >> 3)         ));
+    }
+}
+
+/*
+ * Pack 'width' values into 16-bit a4r4g4b4 pixels starting at pixel x,
+ * keeping the top 4 bits of every channel.  'indexed' is not used.
+ */
+static FASTCALL void
+fbStore_a4r4g4b4 (uint32_t *bits, const uint32_t *values, int x, int width, pixman_indexed_t * indexed)
+{
+    uint16_t *dst = ((uint16_t *) bits) + x;
+    int n;
+
+    for (n = 0; n < width; n++) {
+        const uint32_t argb = READ(values + n);
+        Splita(argb);
+
+        WRITE(dst + n, ((a << 8) & 0xf000) |
+                       ((r << 4) & 0x0f00) |
+                       ((g     ) & 0x00f0) |
+                       ((b >> 4)         ));
+    }
+}
+
+/*
+ * Pack 'width' values into 16-bit x4r4g4b4 pixels starting at pixel x,
+ * keeping the top 4 bits of each colour; the top nibble is left clear.
+ * 'indexed' is not used.
+ */
+static FASTCALL void
+fbStore_x4r4g4b4 (uint32_t *bits, const uint32_t *values, int x, int width, pixman_indexed_t * indexed)
+{
+    uint16_t *dst = ((uint16_t *) bits) + x;
+    int n;
+
+    for (n = 0; n < width; n++) {
+        const uint32_t argb = READ(values + n);
+        Split(argb);
+
+        WRITE(dst + n, ((r << 4) & 0x0f00) |
+                       ((g     ) & 0x00f0) |
+                       ((b >> 4)         ));
+    }
+}
+
+/*
+ * Pack 'width' values into 16-bit a4b4g4r4 pixels starting at pixel x,
+ * keeping the top 4 bits of every channel.  'indexed' is not used.
+ */
+static FASTCALL void
+fbStore_a4b4g4r4 (uint32_t *bits, const uint32_t *values, int x, int width, pixman_indexed_t * indexed)
+{
+    uint16_t *dst = ((uint16_t *) bits) + x;
+    int n;
+
+    for (n = 0; n < width; n++) {
+        const uint32_t argb = READ(values + n);
+        Splita(argb);
+
+        WRITE(dst + n, ((a << 8) & 0xf000) |
+                       ((b << 4) & 0x0f00) |
+                       ((g     ) & 0x00f0) |
+                       ((r >> 4)         ));
+    }
+}
+
+/*
+ * Pack 'width' values into 16-bit x4b4g4r4 pixels starting at pixel x,
+ * keeping the top 4 bits of each colour; the top nibble is left clear.
+ * 'indexed' is not used.
+ */
+static FASTCALL void
+fbStore_x4b4g4r4 (uint32_t *bits, const uint32_t *values, int x, int width, pixman_indexed_t * indexed)
+{
+    uint16_t *dst = ((uint16_t *) bits) + x;
+    int n;
+
+    for (n = 0; n < width; n++) {
+        const uint32_t argb = READ(values + n);
+        Split(argb);
+
+        WRITE(dst + n, ((b << 4) & 0x0f00) |
+                       ((g     ) & 0x00f0) |
+                       ((r >> 4)         ));
+    }
+}
+
+/*
+ * Store the top byte of each of 'width' values as one byte per pixel,
+ * starting at byte x.  'indexed' is not used.
+ */
+static FASTCALL void
+fbStore_a8 (uint32_t *bits, const uint32_t *values, int x, int width, pixman_indexed_t * indexed)
+{
+    uint8_t *dst = ((uint8_t *) bits) + x;
+    int n = 0;
+
+    while (n < width) {
+        WRITE(dst + n, READ(values + n) >> 24);
+        n++;
+    }
+}
+
+/*
+ * Pack 'width' values into 8-bit r3g3b2 pixels starting at byte x,
+ * keeping the top 3/3/2 bits of red/green/blue.  'indexed' is not used.
+ */
+static FASTCALL void
+fbStore_r3g3b2 (uint32_t *bits, const uint32_t *values, int x, int width, pixman_indexed_t * indexed)
+{
+    uint8_t *dst = ((uint8_t *) bits) + x;
+    int n;
+
+    for (n = 0; n < width; n++) {
+        const uint32_t argb = READ(values + n);
+        Split(argb);
+
+        WRITE(dst + n, ((r     ) & 0xe0) |
+                       ((g >> 3) & 0x1c) |
+                       ((b >> 6)       ));
+    }
+}
+
+/*
+ * Pack 'width' values into 8-bit b2g3r3 pixels starting at byte x.
+ * The byte layout is blue in bits 6-7, green in bits 3-5 and red in
+ * bits 0-2, so the top 2/3/3 bits of blue/green/red are kept.
+ * 'indexed' is not used.
+ */
+static FASTCALL void
+fbStore_b2g3r3 (uint32_t *bits, const uint32_t *values, int x, int width, pixman_indexed_t * indexed)
+{
+    int i;
+    uint8_t   *pixel = ((uint8_t *) bits) + x;
+    for (i = 0; i < width; ++i) {
+        Split(READ(values + i));
+        /* Masks and shifts follow the 2:3:3 field widths; using the
+         * 3:3:2 ones here would misplace green and truncate red. */
+        WRITE(pixel++, ((b     ) & 0xc0) |
+             ((g >> 2) & 0x38) |
+             ((r >> 5)       ));
+    }
+}
+
+/*
+ * Pack 'width' values into 8-bit a2r2g2b2 pixels starting at byte x,
+ * keeping the top 2 bits of every channel.  'indexed' is not used.
+ */
+static FASTCALL void
+fbStore_a2r2g2b2 (uint32_t *bits, const uint32_t *values, int x, int width, pixman_indexed_t * indexed)
+{
+    uint8_t *dst = ((uint8_t *) bits) + x;
+    int n;
+
+    for (n = 0; n < width; n++) {
+        const uint32_t argb = READ(values + n);
+        Splita(argb);
+
+        WRITE(dst + n, ((a     ) & 0xc0) |
+                       ((r >> 2) & 0x30) |
+                       ((g >> 4) & 0x0c) |
+                       ((b >> 6)       ));
+    }
+}
+
+/*
+ * Convert each of 'width' values with miIndexToEnt24(indexed, value)
+ * and store the result as one byte per pixel, starting at byte x.
+ */
+static FASTCALL void
+fbStore_c8 (uint32_t *bits, const uint32_t *values, int x, int width, pixman_indexed_t * indexed)
+{
+    uint8_t *dst = ((uint8_t *) bits) + x;
+    int n = 0;
+
+    while (n < width) {
+        WRITE(dst + n, miIndexToEnt24(indexed,READ(values + n)));
+        n++;
+    }
+}
+
+/*
+ * Store the top 4 bits of each of 'width' values in the low nibble of
+ * one byte per pixel, starting at byte x.  'indexed' is not used.
+ */
+static FASTCALL void
+fbStore_x4a4 (uint32_t *bits, const uint32_t *values, int x, int width, pixman_indexed_t * indexed)
+{
+    uint8_t *dst = ((uint8_t *) bits) + x;
+    int n = 0;
+
+    while (n < width) {
+        WRITE(dst + n, READ(values + n) >> 28);
+        n++;
+    }
+}
+
+/*
+ * Store8(l,o,v) writes v to byte number (o >> 3) of line l.
+ *
+ * Store4(l,o,v) writes the 4-bit value v as 4bpp pixel number 'o' of
+ * line l.  Two pixels share a byte, so the byte index is o >> 1 (passed
+ * to Store8 as 4 * o) and the low bit of 'o' selects the nibble; the
+ * other nibble of the byte is read back and preserved.  Which nibble
+ * comes first depends on IMAGE_BYTE_ORDER.
+ */
+#define Store8(l,o,v)  (((uint8_t *) (l))[(o) >> 3] = (v))
+#if IMAGE_BYTE_ORDER == MSBFirst
+#define Store4(l,o,v)  Store8(l, 4 * (o), ((o) & 1 ?                           \
+                                  (((uint8_t *) (l))[(o) >> 1] & 0xf0) | (v) :         \
+                                  (((uint8_t *) (l))[(o) >> 1] & 0x0f) | ((v) << 4)))
+#else
+#define Store4(l,o,v)  Store8(l, 4 * (o), ((o) & 1 ?                          \
+                                  (((uint8_t *) (l))[(o) >> 1] & 0x0f) | ((v) << 4) : \
+                                  (((uint8_t *) (l))[(o) >> 1] & 0xf0) | (v)))
+#endif
+
+/*
+ * Store the top 4 bits of each of 'width' values as a 4bpp pixel at
+ * index x + n, using Store4.  'indexed' is not used.
+ */
+static FASTCALL void
+fbStore_a4 (uint32_t *bits, const uint32_t *values, int x, int width, pixman_indexed_t * indexed)
+{
+    int n = 0;
+
+    while (n < width) {
+        Store4(bits, n + x, READ(values + n)>>28);
+        n++;
+    }
+}
+
+/*
+ * Pack 'width' values into 4bpp r1g2b1 pixels (top 1/2/1 bits of
+ * red/green/blue) and write them at index x + n with Store4.
+ * 'indexed' is not used.
+ */
+static FASTCALL void
+fbStore_r1g2b1 (uint32_t *bits, const uint32_t *values, int x, int width, pixman_indexed_t * indexed)
+{
+    int n;
+
+    for (n = 0; n < width; n++) {
+        const uint32_t argb = READ(values + n);
+        Split(argb);
+        const uint32_t nibble = ((r >> 4) & 0x8) |
+                                ((g >> 5) & 0x6) |
+                                ((b >> 7)      );
+
+        Store4(bits, n + x, nibble);
+    }
+}
+
+/*
+ * Pack 'width' values into 4bpp b1g2r1 pixels (top 1/2/1 bits of
+ * blue/green/red) and write them at index x + n with Store4.
+ * 'indexed' is not used.
+ */
+static FASTCALL void
+fbStore_b1g2r1 (uint32_t *bits, const uint32_t *values, int x, int width, pixman_indexed_t * indexed)
+{
+    int n;
+
+    for (n = 0; n < width; n++) {
+        const uint32_t argb = READ(values + n);
+        Split(argb);
+        const uint32_t nibble = ((b >> 4) & 0x8) |
+                                ((g >> 5) & 0x6) |
+                                ((r >> 7)      );
+
+        Store4(bits, n + x, nibble);
+    }
+}
+
+/*
+ * Pack 'width' values into 4bpp a1r1g1b1 pixels (top bit of each
+ * channel) and write them at index x + n with Store4.
+ * 'indexed' is not used.
+ */
+static FASTCALL void
+fbStore_a1r1g1b1 (uint32_t *bits, const uint32_t *values, int x, int width, pixman_indexed_t * indexed)
+{
+    int n;
+
+    for (n = 0; n < width; n++) {
+        const uint32_t argb = READ(values + n);
+        Splita(argb);
+        const uint32_t nibble = ((a >> 4) & 0x8) |
+                                ((r >> 5) & 0x4) |
+                                ((g >> 6) & 0x2) |
+                                ((b >> 7)      );
+
+        Store4(bits, n + x, nibble);
+    }
+}
+
+/*
+ * Pack 'width' values into 4bpp a1b1g1r1 pixels (top bit of each
+ * channel) and write them at index x + n with Store4.
+ * 'indexed' is not used.
+ */
+static FASTCALL void
+fbStore_a1b1g1r1 (uint32_t *bits, const uint32_t *values, int x, int width, pixman_indexed_t * indexed)
+{
+    int n;
+
+    for (n = 0; n < width; n++) {
+        const uint32_t argb = READ(values + n);
+        Splita(argb);
+        const uint32_t nibble = ((a >> 4) & 0x8) |
+                                ((b >> 5) & 0x4) |
+                                ((g >> 6) & 0x2) |
+                                ((r >> 7)      );
+
+        Store4(bits, n + x, nibble);
+    }
+}
+
+/*
+ * Convert each of 'width' values with miIndexToEnt24(indexed, value)
+ * and write the result as a 4bpp pixel at index x + n with Store4.
+ */
+static FASTCALL void
+fbStore_c4 (uint32_t *bits, const uint32_t *values, int x, int width, pixman_indexed_t * indexed)
+{
+    int n = 0;
+
+    while (n < width) {
+        const uint32_t entry = miIndexToEnt24(indexed, READ(values + n));
+
+        Store4(bits, n + x, entry);
+        n++;
+    }
+}
+
+/*
+ * 1bpp ("stipple") helpers.  A stipple unit is 1 << LOG2_BITMAP_PAD = 32
+ * bits wide; FB_STIP_MASK extracts a bit position inside one unit.
+ */
+#define LOG2_BITMAP_PAD 5
+#define FB_STIP_SHIFT  LOG2_BITMAP_PAD
+#define FB_STIP_UNIT   (1 << FB_STIP_SHIFT)
+#define FB_STIP_MASK   (FB_STIP_UNIT - 1)
+#define FB_STIP_ALLONES        ((uint32_t) -1)
+
+/*
+ * "Left"/"right" are screen directions: which machine shift they map to
+ * depends on BITMAP_BIT_ORDER.
+ */
+#if BITMAP_BIT_ORDER == LSBFirst
+#define FbScrLeft(x,n) ((x) >> (n))
+#define FbScrRight(x,n)        ((x) << (n))
+#else
+#define FbScrLeft(x,n) ((x) << (n))
+#define FbScrRight(x,n)        ((x) >> (n))
+#endif
+#define FbStipLeft(x,n)        FbScrLeft(x,n)
+#define FbStipRight(x,n) FbScrRight(x,n)
+/*
+ * FbStipMask(x,w): mask with w consecutive bits set, starting at screen
+ * position x inside one unit (the callers here use w == 1 with x already
+ * reduced to 0..31).
+ */
+#define FbStipMask(x,w)        (FbStipRight(FB_STIP_ALLONES,(x) & FB_STIP_MASK) & \
+                        FbStipLeft(FB_STIP_ALLONES,(FB_STIP_UNIT - ((x)+(w))) & FB_STIP_MASK))
+
+/*
+ * Store 'width' values as 1bpp pixels starting at bit x.  For each value
+ * the destination bit (selected with FbStipMask) is set when bit 31 of
+ * the value is set and cleared otherwise; the other bits of the 32-bit
+ * destination word are preserved.  'indexed' is not used.
+ */
+static FASTCALL void
+fbStore_a1 (uint32_t *bits, const uint32_t *values, int x, int width, pixman_indexed_t * indexed)
+{
+    int n;
+
+    for (n = 0; n < width; n++) {
+        uint32_t *word = ((uint32_t *) bits) + ((n+x) >> 5);
+        const uint32_t bit = FbStipMask((n+x) & 0x1f, 1);
+        uint32_t updated = READ(word) & ~bit;
+
+        if (READ(values + n) & 0x80000000)
+            updated |= bit;
+        WRITE(word, updated);
+    }
+}
+
+/*
+ * Store 'width' pixels into a 1-bit destination starting at bit offset x.
+ * The destination bit is set when miIndexToEntY24(indexed, value) is
+ * non-zero, cleared otherwise; other bits of the 32-bit word are preserved.
+ */
+static FASTCALL void
+fbStore_g1 (uint32_t *bits, const uint32_t *values, int x, int width, pixman_indexed_t * indexed)
+{
+    int n;
+
+    for (n = 0; n < width; n++)
+    {
+        int       pos     = n + x;
+        uint32_t *word    = bits + (pos >> 5);
+        uint32_t  bitMask = FbStipMask(pos & 0x1f, 1);
+        uint32_t  newBit  = miIndexToEntY24(indexed,READ(values + n)) ? bitMask : 0;
+
+        /* read-modify-write: replace only the addressed bit */
+        WRITE(word, (READ(word) & ~bitMask) | newBit);
+    }
+}
+
+
+/*
+ * Return the scanline store routine for the pixel format of 'pict', or NULL
+ * when the format has no entry here.  g8 shares fbStore_c8 and g4 shares
+ * fbStore_c4.
+ */
+static storeProc storeProcForPicture (bits_image_t * pict)
+{
+    storeProc proc = NULL;
+
+    switch(pict->format) {
+    case PIXMAN_a8r8g8b8: proc = fbStore_a8r8g8b8; break;
+    case PIXMAN_x8r8g8b8: proc = fbStore_x8r8g8b8; break;
+    case PIXMAN_a8b8g8r8: proc = fbStore_a8b8g8r8; break;
+    case PIXMAN_x8b8g8r8: proc = fbStore_x8b8g8r8; break;
+
+        /* 24bpp formats */
+    case PIXMAN_r8g8b8: proc = fbStore_r8g8b8; break;
+    case PIXMAN_b8g8r8: proc = fbStore_b8g8r8; break;
+
+        /* 16bpp formats */
+    case PIXMAN_r5g6b5: proc = fbStore_r5g6b5; break;
+    case PIXMAN_b5g6r5: proc = fbStore_b5g6r5; break;
+
+    case PIXMAN_a1r5g5b5: proc = fbStore_a1r5g5b5; break;
+    case PIXMAN_x1r5g5b5: proc = fbStore_x1r5g5b5; break;
+    case PIXMAN_a1b5g5r5: proc = fbStore_a1b5g5r5; break;
+    case PIXMAN_x1b5g5r5: proc = fbStore_x1b5g5r5; break;
+    case PIXMAN_a4r4g4b4: proc = fbStore_a4r4g4b4; break;
+    case PIXMAN_x4r4g4b4: proc = fbStore_x4r4g4b4; break;
+    case PIXMAN_a4b4g4r4: proc = fbStore_a4b4g4r4; break;
+    case PIXMAN_x4b4g4r4: proc = fbStore_x4b4g4r4; break;
+
+        /* 8bpp formats */
+    case PIXMAN_a8: proc = fbStore_a8; break;
+    case PIXMAN_r3g3b2: proc = fbStore_r3g3b2; break;
+    case PIXMAN_b2g3r3: proc = fbStore_b2g3r3; break;
+    case PIXMAN_a2r2g2b2: proc = fbStore_a2r2g2b2; break;
+    case PIXMAN_c8: proc = fbStore_c8; break;
+    case PIXMAN_g8: proc = fbStore_c8; break;
+    case PIXMAN_x4a4: proc = fbStore_x4a4; break;
+
+        /* 4bpp formats */
+    case PIXMAN_a4: proc = fbStore_a4; break;
+    case PIXMAN_r1g2b1: proc = fbStore_r1g2b1; break;
+    case PIXMAN_b1g2r1: proc = fbStore_b1g2r1; break;
+    case PIXMAN_a1r1g1b1: proc = fbStore_a1r1g1b1; break;
+    case PIXMAN_a1b1g1r1: proc = fbStore_a1b1g1r1; break;
+    case PIXMAN_c4: proc = fbStore_c4; break;
+    case PIXMAN_g4: proc = fbStore_c4; break;
+
+        /* 1bpp formats */
+    case PIXMAN_a1: proc = fbStore_a1; break;
+    case PIXMAN_g1: proc = fbStore_g1; break;
+
+    default:
+        break;
+    }
+
+    return proc;
+}
+
+
+/*
+ * Combine src and mask (unified alpha): every source pixel is passed
+ * through FbByteMul() with the top byte of the matching mask pixel and
+ * written back to src in place.
+ */
+static FASTCALL void
+fbCombineMaskU (uint32_t *src, const uint32_t *mask, int width)
+{
+    int n;
+
+    for (n = 0; n < width; n++)
+    {
+        uint32_t maskAlpha = READ(mask + n) >> 24;
+        uint32_t srcPix    = READ(src + n);
+
+        FbByteMul(srcPix, maskAlpha);
+        WRITE(src + n, srcPix);
+    }
+}
+
+/*
+ * All of the composing functions
+ */
+
+/*
+ * CLEAR: zero 'width' 32-bit destination pixels via MEMSET_WRAPPED.
+ * 'src' is ignored.
+ */
+static FASTCALL void
+fbCombineClear (uint32_t *dest, const uint32_t *src, int width)
+{
+    MEMSET_WRAPPED(dest, 0, width*sizeof(uint32_t));
+}
+
+/*
+ * SRC: copy 'width' 32-bit pixels from src to dest via MEMCPY_WRAPPED.
+ */
+static FASTCALL void
+fbCombineSrcU (uint32_t *dest, const uint32_t *src, int width)
+{
+    MEMCPY_WRAPPED(dest, src, width*sizeof(uint32_t));
+}
+
+
+/*
+ * OVER, unified alpha.  Per pixel: FbByteMulAdd(d, Alpha(~s), s), result
+ * stored to dest.  (Presumably dest * (1 - src alpha) + src; the macros are
+ * defined elsewhere.)
+ */
+static FASTCALL void
+fbCombineOverU (uint32_t *dest, const uint32_t *src, int width)
+{
+    const uint32_t *sp = src;
+    uint32_t       *dp = dest;
+    int             left;
+
+    for (left = width; left > 0; --left, ++sp, ++dp)
+    {
+        uint32_t srcPix  = READ(sp);
+        uint32_t dstPix  = READ(dp);
+        uint32_t invSrcA = Alpha(~srcPix);
+
+        FbByteMulAdd(dstPix, invSrcA, srcPix);
+        WRITE(dp, dstPix);
+    }
+}
+
+/*
+ * OVER_REVERSE, unified alpha.  Per pixel: FbByteMulAdd(s, Alpha(~dest), d),
+ * result stored to dest.  The destination is read twice, as in the
+ * straightforward formulation.
+ */
+static FASTCALL void
+fbCombineOverReverseU (uint32_t *dest, const uint32_t *src, int width)
+{
+    const uint32_t *sp = src;
+    uint32_t       *dp = dest;
+    int             left;
+
+    for (left = width; left > 0; --left, ++sp, ++dp)
+    {
+        uint32_t srcPix  = READ(sp);
+        uint32_t dstPix  = READ(dp);
+        uint32_t invDstA = Alpha(~READ(dp));
+
+        FbByteMulAdd(srcPix, invDstA, dstPix);
+        WRITE(dp, srcPix);
+    }
+}
+
+/*
+ * IN, unified alpha.  Per pixel: FbByteMul(s, Alpha(dest)), result stored
+ * to dest.
+ */
+static FASTCALL void
+fbCombineInU (uint32_t *dest, const uint32_t *src, int width)
+{
+    const uint32_t *sp = src;
+    uint32_t       *dp = dest;
+    int             left;
+
+    for (left = width; left > 0; --left, ++sp, ++dp)
+    {
+        uint32_t srcPix = READ(sp);
+        uint32_t dstA   = Alpha(READ(dp));
+
+        FbByteMul(srcPix, dstA);
+        WRITE(dp, srcPix);
+    }
+}
+
+/*
+ * IN_REVERSE, unified alpha.  Per pixel: FbByteMul(d, Alpha(src)), result
+ * stored to dest.
+ */
+static FASTCALL void
+fbCombineInReverseU (uint32_t *dest, const uint32_t *src, int width)
+{
+    const uint32_t *sp = src;
+    uint32_t       *dp = dest;
+    int             left;
+
+    for (left = width; left > 0; --left, ++sp, ++dp)
+    {
+        uint32_t dstPix = READ(dp);
+        uint32_t srcA   = Alpha(READ(sp));
+
+        FbByteMul(dstPix, srcA);
+        WRITE(dp, dstPix);
+    }
+}
+
+/*
+ * OUT, unified alpha.  Per pixel: FbByteMul(s, Alpha(~dest)), result stored
+ * to dest.
+ */
+static FASTCALL void
+fbCombineOutU (uint32_t *dest, const uint32_t *src, int width)
+{
+    const uint32_t *sp = src;
+    uint32_t       *dp = dest;
+    int             left;
+
+    for (left = width; left > 0; --left, ++sp, ++dp)
+    {
+        uint32_t srcPix  = READ(sp);
+        uint32_t invDstA = Alpha(~READ(dp));
+
+        FbByteMul(srcPix, invDstA);
+        WRITE(dp, srcPix);
+    }
+}
+
+/*
+ * OUT_REVERSE, unified alpha.  Per pixel: FbByteMul(d, Alpha(~src)), result
+ * stored to dest.
+ */
+static FASTCALL void
+fbCombineOutReverseU (uint32_t *dest, const uint32_t *src, int width)
+{
+    const uint32_t *sp = src;
+    uint32_t       *dp = dest;
+    int             left;
+
+    for (left = width; left > 0; --left, ++sp, ++dp)
+    {
+        uint32_t dstPix  = READ(dp);
+        uint32_t invSrcA = Alpha(~READ(sp));
+
+        FbByteMul(dstPix, invSrcA);
+        WRITE(dp, dstPix);
+    }
+}
+
+/*
+ * ATOP, unified alpha.  Per pixel:
+ * FbByteAddMul(s, Alpha(d), d, Alpha(~s)), result stored to dest.
+ */
+static FASTCALL void
+fbCombineAtopU (uint32_t *dest, const uint32_t *src, int width)
+{
+    int n;
+
+    for (n = 0; n < width; n++)
+    {
+        uint32_t srcPix  = READ(src + n);
+        uint32_t dstPix  = READ(dest + n);
+        uint32_t dstA    = Alpha(dstPix);
+        uint32_t invSrcA = Alpha(~srcPix);
+
+        FbByteAddMul(srcPix, dstA, dstPix, invSrcA);
+        WRITE(dest + n, srcPix);
+    }
+}
+
+/*
+ * ATOP_REVERSE, unified alpha.  Per pixel:
+ * FbByteAddMul(s, Alpha(~d), d, Alpha(s)), result stored to dest.
+ */
+static FASTCALL void
+fbCombineAtopReverseU (uint32_t *dest, const uint32_t *src, int width)
+{
+    int n;
+
+    for (n = 0; n < width; n++)
+    {
+        uint32_t srcPix  = READ(src + n);
+        uint32_t dstPix  = READ(dest + n);
+        uint32_t srcA    = Alpha(srcPix);
+        uint32_t invDstA = Alpha(~dstPix);
+
+        FbByteAddMul(srcPix, invDstA, dstPix, srcA);
+        WRITE(dest + n, srcPix);
+    }
+}
+
+/*
+ * XOR, unified alpha.  Per pixel:
+ * FbByteAddMul(s, Alpha(~d), d, Alpha(~s)), result stored to dest.
+ */
+static FASTCALL void
+fbCombineXorU (uint32_t *dest, const uint32_t *src, int width)
+{
+    int n;
+
+    for (n = 0; n < width; n++)
+    {
+        uint32_t srcPix  = READ(src + n);
+        uint32_t dstPix  = READ(dest + n);
+        uint32_t invSrcA = Alpha(~srcPix);
+        uint32_t invDstA = Alpha(~dstPix);
+
+        FbByteAddMul(srcPix, invDstA, dstPix, invSrcA);
+        WRITE(dest + n, srcPix);
+    }
+}
+
+/*
+ * ADD, unified alpha.  Per pixel: FbByteAdd(d, s), result stored to dest.
+ */
+static FASTCALL void
+fbCombineAddU (uint32_t *dest, const uint32_t *src, int width)
+{
+    const uint32_t *sp = src;
+    uint32_t       *dp = dest;
+    int             left;
+
+    for (left = width; left > 0; --left, ++sp, ++dp)
+    {
+        uint32_t srcPix = READ(sp);
+        uint32_t dstPix = READ(dp);
+
+        FbByteAdd(dstPix, srcPix);
+        WRITE(dp, dstPix);
+    }
+}
+
+/*
+ * SATURATE, unified alpha.  Per pixel: when the source alpha exceeds the
+ * inverted destination alpha, the source is first scaled with
+ * FbByteMul(s, FbIntDiv(da, sa)); the (possibly scaled) source is then
+ * added to the destination with FbByteAdd().
+ */
+static FASTCALL void
+fbCombineSaturateU (uint32_t *dest, const uint32_t *src, int width)
+{
+    int n;
+
+    for (n = 0; n < width; n++)
+    {
+        uint32_t srcPix  = READ(src + n);
+        uint32_t dstPix  = READ(dest + n);
+        uint16_t srcA    = srcPix >> 24;
+        uint16_t invDstA = ~dstPix >> 24;
+
+        /* srcA > invDstA implies srcA != 0, so the division is safe */
+        if (srcA > invDstA)
+        {
+            srcA = FbIntDiv(invDstA, srcA);
+            FbByteMul(srcPix, srcA);
+        }
+        FbByteAdd(dstPix, srcPix);
+        WRITE(dest + n, dstPix);
+    }
+}
+
+/*
+ * All of the disjoint composing functions
+ *
+ * The four entries in the first column indicate what source contributions
+ * come from each of the four areas of the picture -- areas covered by neither
+ * A nor B, areas covered only by A, areas covered only by B and finally
+ * areas covered by both A and B.
+ *
+ *              Disjoint                            Conjoint
+ *              Fa                Fb                Fa              Fb
+ * (0,0,0,0)    0                 0                 0               0
+ * (0,A,0,A)    1                 0                 1               0
+ * (0,0,B,B)    0                 1                 0               1
+ * (0,A,B,A)    1                 min((1-a)/b,1)    1               max(1-a/b,0)
+ * (0,A,B,B)    min((1-b)/a,1)    1                 max(1-b/a,0)    1
+ * (0,0,0,A)    max(1-(1-b)/a,0)  0                 min(1,b/a)      0
+ * (0,0,0,B)    0                 max(1-(1-a)/b,0)  0               min(a/b,1)
+ * (0,A,0,0)    min(1,(1-b)/a)    0                 max(1-b/a,0)    0
+ * (0,0,B,0)    0                 min(1,(1-a)/b)    0               max(1-a/b,0)
+ * (0,0,B,A)    max(1-(1-b)/a,0)  min(1,(1-a)/b)    min(1,b/a)      max(1-a/b,0)
+ * (0,A,0,B)    min(1,(1-b)/a)    max(1-(1-a)/b,0)  max(1-b/a,0)    min(1,a/b)
+ * (0,A,B,0)    min(1,(1-b)/a)    min(1,(1-a)/b)    max(1-b/a,0)    max(1-a/b,0)
+ */
+
+/*
+ * Bit flags choosing how the factors Fa and Fb are computed by the
+ * disjoint/conjoint "General" routines: bits 0-1 (CombineA) select Fa,
+ * bits 2-3 (CombineB) select Fb.  In those routines A is the source and
+ * B is the destination.
+ */
+#define CombineAOut 1
+#define CombineAIn  2
+#define CombineBOut 4
+#define CombineBIn  8
+
+/* Operator selectors built from the flags above. */
+#define CombineClear   0
+#define CombineA       (CombineAOut|CombineAIn)
+#define CombineB       (CombineBOut|CombineBIn)
+#define CombineAOver   (CombineAOut|CombineBOut|CombineAIn)
+#define CombineBOver   (CombineAOut|CombineBOut|CombineBIn)
+#define CombineAAtop   (CombineBOut|CombineAIn)
+#define CombineBAtop   (CombineAOut|CombineBIn)
+#define CombineXor     (CombineAOut|CombineBOut)
+
+/*
+ * Disjoint: portion covered by a but not b, i.e. min (1, (1-b) / a) in
+ * 8-bit fixed point (0xff == 1).
+ */
+static INLINE uint8_t
+fbCombineDisjointOutPart (uint8_t a, uint8_t b)
+{
+    uint8_t inv_b = ~b;    /* 1 - b */
+
+    /* 1 - b >= a  ->  (1-b)/a >= 1, clamp to 1; otherwise (1-b)/a.
+     * a == 0 always takes the clamp branch, so no division by zero. */
+    return (inv_b >= a) ? 0xff : FbIntDiv(inv_b,a);
+}
+
+/*
+ * Disjoint: portion covered by both a and b, i.e.
+ * max (1-(1-b)/a, 0) = 1 - min (1, (1-b)/a) in 8-bit fixed point.
+ */
+static INLINE uint8_t
+fbCombineDisjointInPart (uint8_t a, uint8_t b)
+{
+    uint8_t inv_b = ~b;    /* 1 - b */
+
+    /* 1 - b >= a  ->  (1-b)/a >= 1, result 1 - 1 = 0;
+     * otherwise 1 - (1-b)/a. */
+    return (inv_b >= a) ? 0 : ~FbIntDiv(inv_b,a);
+}
+
+/*
+ * Generic disjoint combiner, unified alpha.  'combine' is a mask of the
+ * Combine* flags: (combine & CombineA) selects the source factor Fa and
+ * (combine & CombineB) the destination factor Fb.  Each of the four 8-bit
+ * channels is then produced by FbGen(s, d, shift, Fa, Fb, ...) and the
+ * result is written to dest.  (FbGen is defined elsewhere; presumably it
+ * computes s*Fa + d*Fb for the channel at 'shift'.)
+ */
+static FASTCALL void
+fbCombineDisjointGeneralU (uint32_t *dest, const uint32_t *src, int width, uint8_t combine)
+{
+    int i;
+    for (i = 0; i < width; ++i) {
+        uint32_t s = READ(src + i);
+        uint32_t d = READ(dest + i);
+        uint32_t m,n,o,p;
+        uint16_t Fa, Fb, t, u, v;    /* t, u, v: scratch for FbGen */
+        uint8_t sa = s >> 24;
+        uint8_t da = d >> 24;
+       
+        /* source factor */
+        switch (combine & CombineA) {
+        default:
+            Fa = 0;
+            break;
+        case CombineAOut:
+            Fa = fbCombineDisjointOutPart (sa, da);
+            break;
+        case CombineAIn:
+            Fa = fbCombineDisjointInPart (sa, da);
+            break;
+        case CombineA:
+            Fa = 0xff;
+            break;
+        }
+       
+        /* destination factor: same helpers with the roles swapped */
+        switch (combine & CombineB) {
+        default:
+            Fb = 0;
+            break;
+        case CombineBOut:
+            Fb = fbCombineDisjointOutPart (da, sa);
+            break;
+        case CombineBIn:
+            Fb = fbCombineDisjointInPart (da, sa);
+            break;
+        case CombineB:
+            Fb = 0xff;
+            break;
+        }
+        /* one FbGen per channel, at bit offsets 0, 8, 16 and 24 */
+        m = FbGen (s,d,0,Fa,Fb,t, u, v);
+        n = FbGen (s,d,8,Fa,Fb,t, u, v);
+        o = FbGen (s,d,16,Fa,Fb,t, u, v);
+        p = FbGen (s,d,24,Fa,Fb,t, u, v);
+        s = m|n|o|p;
+        WRITE(dest + i, s);
+    }
+}
+
+/*
+ * Disjoint OVER, unified alpha.  Source alpha 0 leaves dest untouched and
+ * source alpha 0xff copies the source.  Otherwise the destination is scaled
+ * by fbCombineDisjointOutPart(dest alpha, src alpha) and the source is added
+ * with FbByteMulAdd().
+ */
+static FASTCALL void
+fbCombineDisjointOverU (uint32_t *dest, const uint32_t *src, int width)
+{
+    int n;
+
+    for (n = 0; n < width; n++)
+    {
+        uint32_t srcPix = READ(src + n);
+        uint16_t factor = srcPix >> 24;
+
+        if (factor == 0x00)
+            continue;
+
+        if (factor != 0xff)
+        {
+            uint32_t dstPix = READ(dest + n);
+
+            factor = fbCombineDisjointOutPart (dstPix >> 24, factor);
+            FbByteMulAdd(dstPix, factor, srcPix);
+            srcPix = dstPix;
+        }
+        WRITE(dest + n, srcPix);
+    }
+}
+
+/* Disjoint IN (unified): fbCombineDisjointGeneralU with CombineAIn. */
+static FASTCALL void
+fbCombineDisjointInU (uint32_t *dest, const uint32_t *src, int width)
+{
+    fbCombineDisjointGeneralU (dest, src, width, CombineAIn);
+}
+
+/* Disjoint IN_REVERSE (unified): fbCombineDisjointGeneralU with CombineBIn. */
+static FASTCALL void
+fbCombineDisjointInReverseU (uint32_t *dest, const uint32_t *src, int width)
+{
+    fbCombineDisjointGeneralU (dest, src, width, CombineBIn);
+}
+
+/* Disjoint OUT (unified): fbCombineDisjointGeneralU with CombineAOut. */
+static FASTCALL void
+fbCombineDisjointOutU (uint32_t *dest, const uint32_t *src, int width)
+{
+    fbCombineDisjointGeneralU (dest, src, width, CombineAOut);
+}
+
+/* Disjoint OUT_REVERSE (unified): fbCombineDisjointGeneralU with CombineBOut. */
+static FASTCALL void
+fbCombineDisjointOutReverseU (uint32_t *dest, const uint32_t *src, int width)
+{
+    fbCombineDisjointGeneralU (dest, src, width, CombineBOut);
+}
+
+/* Disjoint ATOP (unified): fbCombineDisjointGeneralU with CombineAAtop. */
+static FASTCALL void
+fbCombineDisjointAtopU (uint32_t *dest, const uint32_t *src, int width)
+{
+    fbCombineDisjointGeneralU (dest, src, width, CombineAAtop);
+}
+
+/* Disjoint ATOP_REVERSE (unified): fbCombineDisjointGeneralU with CombineBAtop. */
+static FASTCALL void
+fbCombineDisjointAtopReverseU (uint32_t *dest, const uint32_t *src, int width)
+{
+    fbCombineDisjointGeneralU (dest, src, width, CombineBAtop);
+}
+
+/* Disjoint XOR (unified): fbCombineDisjointGeneralU with CombineXor. */
+static FASTCALL void
+fbCombineDisjointXorU (uint32_t *dest, const uint32_t *src, int width)
+{
+    fbCombineDisjointGeneralU (dest, src, width, CombineXor);
+}
+
+/*
+ * Conjoint: portion covered by a but not b, i.e.
+ * max (1-b/a, 0) = 1 - min (b/a, 1) in 8-bit fixed point (0xff == 1).
+ */
+static INLINE uint8_t
+fbCombineConjointOutPart (uint8_t a, uint8_t b)
+{
+    /* b >= a  ->  b/a >= 1, result 0; otherwise 1 - b/a.
+     * a == 0 always takes the first branch, so no division by zero. */
+    return (b >= a) ? 0x00 : ~FbIntDiv(b,a);
+}
+
+/*
+ * Conjoint: portion covered by both a and b, i.e. min (1, b/a) in 8-bit
+ * fixed point (0xff == 1).
+ */
+static INLINE uint8_t
+fbCombineConjointInPart (uint8_t a, uint8_t b)
+{
+    /* b >= a  ->  b/a >= 1, clamp to 1; otherwise b/a.
+     * a == 0 always takes the clamp branch, so no division by zero. */
+    return (b >= a) ? 0xff : FbIntDiv(b,a);
+}
+
+/*
+ * Generic conjoint combiner, unified alpha.  'combine' is a mask of the
+ * Combine* flags: (combine & CombineA) selects the source factor Fa and
+ * (combine & CombineB) the destination factor Fb.  Each of the four 8-bit
+ * channels is then produced by FbGen(s, d, shift, Fa, Fb, ...) and the
+ * result is written to dest.  (FbGen is defined elsewhere; presumably it
+ * computes s*Fa + d*Fb for the channel at 'shift'.)
+ */
+static FASTCALL void
+fbCombineConjointGeneralU (uint32_t *dest, const uint32_t *src, int width, uint8_t combine)
+{
+    int i;
+    for (i = 0; i < width; ++i) {
+        uint32_t  s = READ(src + i);
+        uint32_t d = READ(dest + i);
+        uint32_t  m,n,o,p;
+        uint16_t  Fa, Fb, t, u, v;    /* t, u, v: scratch for FbGen */
+        uint8_t sa = s >> 24;
+        uint8_t da = d >> 24;
+       
+        /* source factor */
+        switch (combine & CombineA) {
+        default:
+            Fa = 0;
+            break;
+        case CombineAOut:
+            Fa = fbCombineConjointOutPart (sa, da);
+            break;
+        case CombineAIn:
+            Fa = fbCombineConjointInPart (sa, da);
+            break;
+        case CombineA:
+            Fa = 0xff;
+            break;
+        }
+       
+        /* destination factor: same helpers with the roles swapped */
+        switch (combine & CombineB) {
+        default:
+            Fb = 0;
+            break;
+        case CombineBOut:
+            Fb = fbCombineConjointOutPart (da, sa);
+            break;
+        case CombineBIn:
+            Fb = fbCombineConjointInPart (da, sa);
+            break;
+        case CombineB:
+            Fb = 0xff;
+            break;
+        }
+        /* one FbGen per channel, at bit offsets 0, 8, 16 and 24 */
+        m = FbGen (s,d,0,Fa,Fb,t, u, v);
+        n = FbGen (s,d,8,Fa,Fb,t, u, v);
+        o = FbGen (s,d,16,Fa,Fb,t, u, v);
+        p = FbGen (s,d,24,Fa,Fb,t, u, v);
+        s = m|n|o|p;
+        WRITE(dest + i, s);
+    }
+}
+
+/* Conjoint OVER (unified): fbCombineConjointGeneralU with CombineAOver. */
+static FASTCALL void
+fbCombineConjointOverU (uint32_t *dest, const uint32_t *src, int width)
+{
+    fbCombineConjointGeneralU (dest, src, width, CombineAOver);
+}
+
+
+/* Conjoint OVER_REVERSE (unified): fbCombineConjointGeneralU with CombineBOver. */
+static FASTCALL void
+fbCombineConjointOverReverseU (uint32_t *dest, const uint32_t *src, int width)
+{
+    fbCombineConjointGeneralU (dest, src, width, CombineBOver);
+}
+
+
+/* Conjoint IN (unified): fbCombineConjointGeneralU with CombineAIn. */
+static FASTCALL void
+fbCombineConjointInU (uint32_t *dest, const uint32_t *src, int width)
+{
+    fbCombineConjointGeneralU (dest, src, width, CombineAIn);
+}
+
+
+/* Conjoint IN_REVERSE (unified): fbCombineConjointGeneralU with CombineBIn. */
+static FASTCALL void
+fbCombineConjointInReverseU (uint32_t *dest, const uint32_t *src, int width)
+{
+    fbCombineConjointGeneralU (dest, src, width, CombineBIn);
+}
+
+/* Conjoint OUT (unified): fbCombineConjointGeneralU with CombineAOut. */
+static FASTCALL void
+fbCombineConjointOutU (uint32_t *dest, const uint32_t *src, int width)
+{
+    fbCombineConjointGeneralU (dest, src, width, CombineAOut);
+}
+
+/* Conjoint OUT_REVERSE (unified): fbCombineConjointGeneralU with CombineBOut. */
+static FASTCALL void
+fbCombineConjointOutReverseU (uint32_t *dest, const uint32_t *src, int width)
+{
+    fbCombineConjointGeneralU (dest, src, width, CombineBOut);
+}
+
+/* Conjoint ATOP (unified): fbCombineConjointGeneralU with CombineAAtop. */
+static FASTCALL void
+fbCombineConjointAtopU (uint32_t *dest, const uint32_t *src, int width)
+{
+    fbCombineConjointGeneralU (dest, src, width, CombineAAtop);
+}
+
+/* Conjoint ATOP_REVERSE (unified): fbCombineConjointGeneralU with CombineBAtop. */
+static FASTCALL void
+fbCombineConjointAtopReverseU (uint32_t *dest, const uint32_t *src, int width)
+{
+    fbCombineConjointGeneralU (dest, src, width, CombineBAtop);
+}
+
+/* Conjoint XOR (unified): fbCombineConjointGeneralU with CombineXor. */
+static FASTCALL void
+fbCombineConjointXorU (uint32_t *dest, const uint32_t *src, int width)
+{
+    fbCombineConjointGeneralU (dest, src, width, CombineXor);
+}
+
+/*
+ * Table of unified-alpha combiners.  Layout: entries 0-13 are the plain
+ * operators, 14-15 are NULL padding, 16-27 the disjoint operators, 28-31
+ * NULL padding, 32-43 the conjoint operators.  Presumably indexed by the
+ * compositing operator code -- confirm against the caller.  A NULL entry
+ * means no combiner is provided for that slot.
+ */
+static CombineFuncU fbCombineFuncU[] = {
+    fbCombineClear,
+    fbCombineSrcU,
+    NULL, /* CombineDst */
+    fbCombineOverU,
+    fbCombineOverReverseU,
+    fbCombineInU,
+    fbCombineInReverseU,
+    fbCombineOutU,
+    fbCombineOutReverseU,
+    fbCombineAtopU,
+    fbCombineAtopReverseU,
+    fbCombineXorU,
+    fbCombineAddU,
+    fbCombineSaturateU,
+    NULL,
+    NULL,
+    /* disjoint block starts at index 16 */
+    fbCombineClear,
+    fbCombineSrcU,
+    NULL, /* CombineDst */
+    fbCombineDisjointOverU,
+    fbCombineSaturateU, /* DisjointOverReverse */
+    fbCombineDisjointInU,
+    fbCombineDisjointInReverseU,
+    fbCombineDisjointOutU,
+    fbCombineDisjointOutReverseU,
+    fbCombineDisjointAtopU,
+    fbCombineDisjointAtopReverseU,
+    fbCombineDisjointXorU,
+    NULL,
+    NULL,
+    NULL,
+    NULL,
+    /* conjoint block starts at index 32 */
+    fbCombineClear,
+    fbCombineSrcU,
+    NULL, /* CombineDst */
+    fbCombineConjointOverU,
+    fbCombineConjointOverReverseU,
+    fbCombineConjointInU,
+    fbCombineConjointInReverseU,
+    fbCombineConjointOutU,
+    fbCombineConjointOutReverseU,
+    fbCombineConjointAtopU,
+    fbCombineConjointAtopReverseU,
+    fbCombineConjointXorU,
+};
+
+/*
+ * Component-alpha mask step that updates BOTH operands in place:
+ *   - mask == 0:          *src is set to 0, *mask is left alone;
+ *   - mask == 0xffffffff: *src is left alone, *mask becomes the source
+ *                         alpha byte replicated into all four channels;
+ *   - otherwise:          *src = FbByteMulC(src, mask) and
+ *                         *mask = FbByteMul(mask, source alpha).
+ */
+static INLINE void
+fbCombineMaskC (uint32_t *src, uint32_t *mask)
+{
+    uint32_t maskPix = *mask;
+    uint32_t srcPix;
+    uint16_t srcAlpha;
+
+    if (maskPix == 0)
+    {
+        WRITE(src, 0);
+        return;
+    }
+
+    srcPix = READ(src);
+    if (maskPix != 0xffffffff)
+    {
+        /* alpha is taken before the source is multiplied */
+        srcAlpha = srcPix >> 24;
+        FbByteMulC(srcPix, maskPix);
+        WRITE(src, srcPix);
+        FbByteMul(maskPix, srcAlpha);
+        WRITE(mask, maskPix);
+        return;
+    }
+
+    /* full mask: replicate the source alpha into every channel */
+    srcPix = srcPix >> 24;
+    srcPix |= srcPix << 8;
+    srcPix |= srcPix << 16;
+    WRITE(mask, srcPix);
+}
+
+/*
+ * Component-alpha mask step that updates only the source value:
+ * *src becomes 0 for a zero mask, is untouched for an all-ones mask and is
+ * FbByteMulC(src, mask) otherwise.  *mask is never written.
+ */
+static INLINE void
+fbCombineMaskValueC (uint32_t *src, const uint32_t *mask)
+{
+    uint32_t maskPix = READ(mask);
+    uint32_t srcPix;
+
+    if (maskPix == 0)
+    {
+        WRITE(src, 0);
+    }
+    else if (maskPix != 0xffffffff)
+    {
+        srcPix = READ(src);
+        FbByteMulC(srcPix, maskPix);
+        WRITE(src,srcPix);
+    }
+}
+
+/*
+ * Component-alpha mask step that updates only the mask: *mask is multiplied
+ * by the source alpha.
+ *   - mask == 0 or source alpha == 0xff: *mask is left unchanged;
+ *   - mask == 0xffffffff: *mask becomes the source alpha replicated into
+ *     all four channels;
+ *   - otherwise: *mask = FbByteMul(mask, source alpha).
+ * *src is never written.
+ */
+static INLINE void
+fbCombineMaskAlphaC (const uint32_t *src, uint32_t *mask)
+{
+    uint32_t a = READ(mask);
+    uint32_t   x;
+
+    if (!a)
+        return;
+
+    x = READ(src) >> 24;    /* x is now the 8-bit source alpha */
+    if (x == 0xff)
+        return;
+    if (a == 0xffffffff)
+    {
+        /* x already holds the alpha byte; shifting it by 24 again would
+         * zero it and wipe the mask, so only replicate it. */
+        x |= x << 8;
+        x |= x << 16;
+        WRITE(mask, x);
+        return;
+    }
+
+    FbByteMul(a, x);
+    WRITE(mask, a);
+}
+
+/*
+ * CLEAR, component alpha: zero 'width' 32-bit destination pixels via
+ * MEMSET_WRAPPED.  'src' and 'mask' are ignored.
+ */
+static FASTCALL void
+fbCombineClearC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
+{
+    MEMSET_WRAPPED(dest, 0, width*sizeof(uint32_t));
+}
+
+/*
+ * SRC, component alpha: each destination pixel becomes the source pixel
+ * masked by fbCombineMaskValueC().  src and mask are only read; the masking
+ * is done on local copies.
+ */
+static FASTCALL void
+fbCombineSrcC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i) {
+        uint32_t s = READ(src + i);
+        uint32_t m = READ(mask + i);
+
+        fbCombineMaskValueC (&s, &m);
+
+        /* store to pixel i; writing to 'dest' alone would overwrite
+         * pixel 0 on every iteration and leave the rest untouched */
+        WRITE(dest + i, s);
+    }
+}
+
+/*
+ * OVER, component alpha.  After fbCombineMaskC(), with a = ~mask:
+ * a == 0xffffffff leaves dest untouched, a == 0 stores the masked source,
+ * otherwise dest = FbByteMulAddC(dest, a, source).
+ */
+static FASTCALL void
+fbCombineOverC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
+{
+    int n;
+
+    for (n = 0; n < width; n++)
+    {
+        uint32_t srcPix  = READ(src + n);
+        uint32_t maskPix = READ(mask + n);
+        uint32_t invMask;
+
+        fbCombineMaskC (&srcPix, &maskPix);
+
+        invMask = ~maskPix;
+        if (invMask == 0xffffffff)
+            continue;
+
+        if (invMask)
+        {
+            uint32_t dstPix = READ(dest + n);
+
+            FbByteMulAddC(dstPix, invMask, srcPix);
+            srcPix = dstPix;
+        }
+        WRITE(dest + n, srcPix);
+    }
+}
+
+/*
+ * OVER_REVERSE, component alpha.  With a = inverted destination alpha:
+ * a == 0 leaves dest untouched, a == 0xff stores the masked source,
+ * otherwise dest = FbByteMulAdd(masked source, a, dest).
+ */
+static FASTCALL void
+fbCombineOverReverseC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
+{
+    int n;
+
+    for (n = 0; n < width; n++)
+    {
+        uint32_t dstPix  = READ(dest + n);
+        uint32_t invDstA = ~dstPix >> 24;
+        uint32_t srcPix;
+        uint32_t maskPix;
+
+        if (!invDstA)
+            continue;
+
+        srcPix  = READ(src + n);
+        maskPix = READ(mask + n);
+        fbCombineMaskValueC (&srcPix, &maskPix);
+
+        if (invDstA != 0xff)
+        {
+            FbByteMulAdd(srcPix, invDstA, dstPix);
+        }
+        WRITE(dest + n, srcPix);
+    }
+}
+
+/*
+ * IN, component alpha.  dest becomes 0 when the destination alpha is 0;
+ * otherwise the masked source, passed through FbByteMul() with the
+ * destination alpha unless that alpha is 0xff.
+ */
+static FASTCALL void
+fbCombineInC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
+{
+    int n;
+
+    for (n = 0; n < width; n++)
+    {
+        uint32_t dstPix = READ(dest + n);
+        uint16_t dstA   = dstPix >> 24;
+        uint32_t result = 0;
+
+        if (dstA)
+        {
+            uint32_t maskPix = READ(mask + n);
+
+            result = READ(src + n);
+            fbCombineMaskValueC (&result, &maskPix);
+            if (dstA != 0xff)
+            {
+                FbByteMul(result, dstA);
+            }
+        }
+        WRITE(dest + n, result);
+    }
+}
+
+/*
+ * IN_REVERSE, component alpha.  After fbCombineMaskAlphaC(), with a = mask:
+ * a == 0xffffffff leaves dest untouched, a == 0 stores 0, otherwise
+ * dest = FbByteMulC(dest, a).
+ */
+static FASTCALL void
+fbCombineInReverseC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
+{
+    int n;
+
+    for (n = 0; n < width; n++)
+    {
+        uint32_t srcPix  = READ(src + n);
+        uint32_t maskPix = READ(mask + n);
+        uint32_t dstPix  = 0;
+
+        fbCombineMaskAlphaC (&srcPix, &maskPix);
+
+        if (maskPix == 0xffffffff)
+            continue;
+
+        if (maskPix)
+        {
+            dstPix = READ(dest + n);
+            FbByteMulC(dstPix, maskPix);
+        }
+        WRITE(dest + n, dstPix);
+    }
+}
+
+/*
+ * OUT, component alpha.  dest becomes 0 when the inverted destination alpha
+ * is 0; otherwise the masked source, passed through FbByteMul() with the
+ * inverted destination alpha unless that value is 0xff.
+ */
+static FASTCALL void
+fbCombineOutC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
+{
+    int n;
+
+    for (n = 0; n < width; n++)
+    {
+        uint32_t dstPix  = READ(dest + n);
+        uint16_t invDstA = ~dstPix >> 24;
+        uint32_t result  = 0;
+
+        if (invDstA)
+        {
+            uint32_t maskPix = READ(mask + n);
+
+            result = READ(src + n);
+            fbCombineMaskValueC (&result, &maskPix);
+
+            if (invDstA != 0xff)
+            {
+                FbByteMul(result, invDstA);
+            }
+        }
+        WRITE(dest + n, result);
+    }
+}
+
+/*
+ * OUT_REVERSE, component alpha.  After fbCombineMaskAlphaC(), with
+ * a = ~mask: a == 0xffffffff leaves dest untouched, a == 0 stores 0,
+ * otherwise dest = FbByteMulC(dest, a).
+ */
+static FASTCALL void
+fbCombineOutReverseC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
+{
+    int n;
+
+    for (n = 0; n < width; n++)
+    {
+        uint32_t srcPix  = READ(src + n);
+        uint32_t maskPix = READ(mask + n);
+        uint32_t invMask;
+        uint32_t dstPix  = 0;
+
+        fbCombineMaskAlphaC (&srcPix, &maskPix);
+
+        invMask = ~maskPix;
+        if (invMask == 0xffffffff)
+            continue;
+
+        if (invMask)
+        {
+            dstPix = READ(dest + n);
+            FbByteMulC(dstPix, invMask);
+        }
+        WRITE(dest + n, dstPix);
+    }
+}
+
+/*
+ * ATOP, component alpha.  After fbCombineMaskC():
+ * dest = FbByteAddMulC(dest, ~mask, source, destination alpha).
+ */
+static FASTCALL void
+fbCombineAtopC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
+{
+    int n;
+
+    for (n = 0; n < width; n++)
+    {
+        uint32_t dstPix  = READ(dest + n);
+        uint32_t srcPix  = READ(src + n);
+        uint32_t maskPix = READ(mask + n);
+        uint16_t dstA    = dstPix >> 24;
+        uint32_t invMask;
+
+        fbCombineMaskC (&srcPix, &maskPix);
+        invMask = ~maskPix;
+
+        FbByteAddMulC(dstPix, invMask, srcPix, dstA);
+        WRITE(dest + n, dstPix);
+    }
+}
+
+/*
+ * ATOP_REVERSE, component alpha.  After fbCombineMaskC():
+ * dest = FbByteAddMulC(dest, mask, source, inverted destination alpha).
+ */
+static FASTCALL void
+fbCombineAtopReverseC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
+{
+    int n;
+
+    for (n = 0; n < width; n++)
+    {
+        uint32_t dstPix  = READ(dest + n);
+        uint32_t srcPix  = READ(src + n);
+        uint32_t maskPix = READ(mask + n);
+        uint16_t invDstA = ~dstPix >> 24;
+        uint32_t dstFactor;
+
+        fbCombineMaskC (&srcPix, &maskPix);
+        dstFactor = maskPix;
+
+        FbByteAddMulC(dstPix, dstFactor, srcPix, invDstA);
+        WRITE(dest + n, dstPix);
+    }
+}
+
+/*
+ * XOR, component alpha.  After fbCombineMaskC():
+ * dest = FbByteAddMulC(dest, ~mask, source, inverted destination alpha).
+ */
+static FASTCALL void
+fbCombineXorC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
+{
+    int n;
+
+    for (n = 0; n < width; n++)
+    {
+        uint32_t dstPix  = READ(dest + n);
+        uint32_t srcPix  = READ(src + n);
+        uint32_t maskPix = READ(mask + n);
+        uint16_t invDstA = ~dstPix >> 24;
+        uint32_t invMask;
+
+        fbCombineMaskC (&srcPix, &maskPix);
+        invMask = ~maskPix;
+
+        FbByteAddMulC(dstPix, invMask, srcPix, invDstA);
+        WRITE(dest + n, dstPix);
+    }
+}
+
+/*
+ * ADD, component alpha: the source is masked with fbCombineMaskValueC()
+ * and added to the destination with FbByteAdd().
+ */
+static FASTCALL void
+fbCombineAddC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
+{
+    int n;
+
+    for (n = 0; n < width; n++)
+    {
+        uint32_t srcPix  = READ(src + n);
+        uint32_t maskPix = READ(mask + n);
+        uint32_t dstPix  = READ(dest + n);
+
+        fbCombineMaskValueC (&srcPix, &maskPix);
+
+        FbByteAdd(dstPix, srcPix);
+        WRITE(dest + n, dstPix);
+    }
+}
+
+/*
+ * SATURATE, component alpha.  After fbCombineMaskC() the mask holds the
+ * per-channel source alpha.  For each channel: if that alpha fits in the
+ * inverted destination alpha (da) the channels are added with FbAdd();
+ * otherwise the source is scaled by (da << 8) / alpha through FbGen().
+ * NOTE(review): the unified variant uses FbIntDiv(da, sa) for this factor;
+ * confirm that (da << 8) / alpha is the intended scale here.
+ */
+static FASTCALL void
+fbCombineSaturateC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
+{
+    int i;
+    
+    for (i = 0; i < width; ++i) {
+        uint32_t  s, d;
+        uint16_t  sa, sr, sg, sb, da;
+        uint16_t  t, u, v;    /* scratch for FbAdd/FbGen */
+        uint32_t  m,n,o,p;
+       
+        d = READ(dest + i);
+        s = READ(src + i);
+       m = READ(mask + i);
+       
+       fbCombineMaskC (&s, &m);
+       
+        /* per-channel source alpha, taken from the combined mask */
+        sa = (m >> 24);
+        sr = (m >> 16) & 0xff;
+        sg = (m >>  8) & 0xff;
+        sb = (m      ) & 0xff;
+        da = ~d >> 24;
+       
+        /* from here on m is reused as the blue-channel result; in each
+         * else-branch the divisor is > da >= 0, hence non-zero */
+        if (sb <= da)
+            m = FbAdd(s,d,0,t);
+        else
+            m = FbGen (s, d, 0, (da << 8) / sb, 0xff, t, u, v);
+       
+        if (sg <= da)
+            n = FbAdd(s,d,8,t);
+        else
+            n = FbGen (s, d, 8, (da << 8) / sg, 0xff, t, u, v);
+       
+        if (sr <= da)
+            o = FbAdd(s,d,16,t);
+        else
+            o = FbGen (s, d, 16, (da << 8) / sr, 0xff, t, u, v);
+       
+        if (sa <= da)
+            p = FbAdd(s,d,24,t);
+        else
+            p = FbGen (s, d, 24, (da << 8) / sa, 0xff, t, u, v);
+       
+        WRITE(dest + i, m|n|o|p);
+    }
+}
+
+/*
+ * Generic disjoint combiner, component alpha.  After fbCombineMaskC() the
+ * mask holds the per-channel source alpha (sa).  'combine' is a mask of the
+ * Combine* flags: (combine & CombineA) selects the per-channel source
+ * factors packed in Fa, (combine & CombineB) the destination factors packed
+ * in Fb.  Each channel is then produced by FbGen() with the matching bytes
+ * of Fa and Fb, and the result is written to dest.
+ */
+static FASTCALL void
+fbCombineDisjointGeneralC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width, uint8_t combine)
+{
+    int i;
+    
+    for (i = 0; i < width; ++i) {
+        uint32_t  s, d;
+        uint32_t  m,n,o,p;    /* mask first, then per-channel scratch */
+        uint32_t  Fa, Fb;
+        uint16_t  t, u, v;    /* scratch for FbGen */
+        uint32_t  sa;
+        uint8_t   da;
+       
+        s = READ(src + i);
+        m = READ(mask + i);
+        d = READ(dest + i);
+        da = d >> 24;
+       
+       fbCombineMaskC (&s, &m);
+       
+       sa = m;
+       
+        /* source factors, one byte per channel */
+        switch (combine & CombineA) {
+        default:
+            Fa = 0;
+            break;
+        case CombineAOut:
+            m = fbCombineDisjointOutPart ((uint8_t) (sa >> 0), da);
+            n = fbCombineDisjointOutPart ((uint8_t) (sa >> 8), da) << 8;
+            o = fbCombineDisjointOutPart ((uint8_t) (sa >> 16), da) << 16;
+            p = fbCombineDisjointOutPart ((uint8_t) (sa >> 24), da) << 24;
+            Fa = m|n|o|p;
+            break;
+        case CombineAIn:
+            m = fbCombineDisjointInPart ((uint8_t) (sa >> 0), da);
+            n = fbCombineDisjointInPart ((uint8_t) (sa >> 8), da) << 8;
+            o = fbCombineDisjointInPart ((uint8_t) (sa >> 16), da) << 16;
+            p = fbCombineDisjointInPart ((uint8_t) (sa >> 24), da) << 24;
+            Fa = m|n|o|p;
+            break;
+        case CombineA:
+            Fa = 0xffffffff;
+            break;
+        }
+       
+        /* destination factors: same helpers with the roles swapped */
+        switch (combine & CombineB) {
+        default:
+            Fb = 0;
+            break;
+        case CombineBOut:
+            m = fbCombineDisjointOutPart (da, (uint8_t) (sa >> 0));
+            n = fbCombineDisjointOutPart (da, (uint8_t) (sa >> 8)) << 8;
+            o = fbCombineDisjointOutPart (da, (uint8_t) (sa >> 16)) << 16;
+            p = fbCombineDisjointOutPart (da, (uint8_t) (sa >> 24)) << 24;
+            Fb = m|n|o|p;
+            break;
+        case CombineBIn:
+            m = fbCombineDisjointInPart (da, (uint8_t) (sa >> 0));
+            n = fbCombineDisjointInPart (da, (uint8_t) (sa >> 8)) << 8;
+            o = fbCombineDisjointInPart (da, (uint8_t) (sa >> 16)) << 16;
+            p = fbCombineDisjointInPart (da, (uint8_t) (sa >> 24)) << 24;
+            Fb = m|n|o|p;
+            break;
+        case CombineB:
+            Fb = 0xffffffff;
+            break;
+        }
+        /* one FbGen per channel, at bit offsets 0, 8, 16 and 24 */
+        m = FbGen (s,d,0,FbGet8(Fa,0),FbGet8(Fb,0),t, u, v);
+        n = FbGen (s,d,8,FbGet8(Fa,8),FbGet8(Fb,8),t, u, v);
+        o = FbGen (s,d,16,FbGet8(Fa,16),FbGet8(Fb,16),t, u, v);
+        p = FbGen (s,d,24,FbGet8(Fa,24),FbGet8(Fb,24),t, u, v);
+        s = m|n|o|p;
+        WRITE(dest + i, s);
+    }
+}
+
+/* Disjoint OVER (component alpha): fbCombineDisjointGeneralC with CombineAOver. */
+static FASTCALL void
+fbCombineDisjointOverC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
+{
+    fbCombineDisjointGeneralC (dest, src, mask, width, CombineAOver);
+}
+
+/* Disjoint IN (component alpha): fbCombineDisjointGeneralC with CombineAIn. */
+static FASTCALL void
+fbCombineDisjointInC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
+{
+    fbCombineDisjointGeneralC (dest, src, mask, width, CombineAIn);
+}
+
+/* Disjoint IN_REVERSE (component alpha): fbCombineDisjointGeneralC with CombineBIn. */
+static FASTCALL void
+fbCombineDisjointInReverseC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
+{
+    fbCombineDisjointGeneralC (dest, src, mask, width, CombineBIn);
+}
+
+/* Disjoint OUT (component alpha): fbCombineDisjointGeneralC with CombineAOut. */
+static FASTCALL void
+fbCombineDisjointOutC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
+{
+    fbCombineDisjointGeneralC (dest, src, mask, width, CombineAOut);
+}
+
+/* Disjoint OUT_REVERSE (component alpha): fbCombineDisjointGeneralC with CombineBOut. */
+static FASTCALL void
+fbCombineDisjointOutReverseC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
+{
+    fbCombineDisjointGeneralC (dest, src, mask, width, CombineBOut);
+}
+
+/* Disjoint ATOP (component alpha): fbCombineDisjointGeneralC with CombineAAtop. */
+static FASTCALL void
+fbCombineDisjointAtopC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
+{
+    fbCombineDisjointGeneralC (dest, src, mask, width, CombineAAtop);
+}
+
+/* Disjoint ATOP_REVERSE (component alpha): fbCombineDisjointGeneralC with CombineBAtop. */
+static FASTCALL void
+fbCombineDisjointAtopReverseC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
+{
+    fbCombineDisjointGeneralC (dest, src, mask, width, CombineBAtop);
+}
+
+/* Disjoint XOR (component alpha): fbCombineDisjointGeneralC with CombineXor. */
+static FASTCALL void
+fbCombineDisjointXorC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
+{
+    fbCombineDisjointGeneralC (dest, src, mask, width, CombineXor);
+}
+
+/*
+ * Generic conjoint combiner, component alpha.  After fbCombineMaskC() the
+ * mask holds the per-channel source alpha (sa).  'combine' is a mask of the
+ * Combine* flags: (combine & CombineA) selects the per-channel source
+ * factors packed in Fa, (combine & CombineB) the destination factors packed
+ * in Fb.  Each channel is then produced by FbGen() with the matching bytes
+ * of Fa and Fb, and the result is written to dest.
+ */
+static FASTCALL void
+fbCombineConjointGeneralC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width, uint8_t combine)
+{
+    int i;
+    
+    for (i = 0; i < width; ++i) {
+        uint32_t  s, d;
+        uint32_t  m,n,o,p;    /* mask first, then per-channel scratch */
+        uint32_t  Fa, Fb;
+        uint16_t  t, u, v;    /* scratch for FbGen */
+        uint32_t  sa;
+        uint8_t   da;
+       
+        s = READ(src + i);
+        m = READ(mask + i);
+        d = READ(dest + i);
+        da = d >> 24;
+       
+       fbCombineMaskC (&s, &m);
+       
+        sa = m;
+       
+        /* source factors, one byte per channel */
+        switch (combine & CombineA) {
+        default:
+            Fa = 0;
+            break;
+        case CombineAOut:
+            m = fbCombineConjointOutPart ((uint8_t) (sa >> 0), da);
+            n = fbCombineConjointOutPart ((uint8_t) (sa >> 8), da) << 8;
+            o = fbCombineConjointOutPart ((uint8_t) (sa >> 16), da) << 16;
+            p = fbCombineConjointOutPart ((uint8_t) (sa >> 24), da) << 24;
+            Fa = m|n|o|p;
+            break;
+        case CombineAIn:
+            m = fbCombineConjointInPart ((uint8_t) (sa >> 0), da);
+            n = fbCombineConjointInPart ((uint8_t) (sa >> 8), da) << 8;
+            o = fbCombineConjointInPart ((uint8_t) (sa >> 16), da) << 16;
+            p = fbCombineConjointInPart ((uint8_t) (sa >> 24), da) << 24;
+            Fa = m|n|o|p;
+            break;
+        case CombineA:
+            Fa = 0xffffffff;
+            break;
+        }
+       
+        /* destination factors: same helpers with the roles swapped */
+        switch (combine & CombineB) {
+        default:
+            Fb = 0;
+            break;
+        case CombineBOut:
+            m = fbCombineConjointOutPart (da, (uint8_t) (sa >> 0));
+            n = fbCombineConjointOutPart (da, (uint8_t) (sa >> 8)) << 8;
+            o = fbCombineConjointOutPart (da, (uint8_t) (sa >> 16)) << 16;
+            p = fbCombineConjointOutPart (da, (uint8_t) (sa >> 24)) << 24;
+            Fb = m|n|o|p;
+            break;
+        case CombineBIn:
+            m = fbCombineConjointInPart (da, (uint8_t) (sa >> 0));
+            n = fbCombineConjointInPart (da, (uint8_t) (sa >> 8)) << 8;
+            o = fbCombineConjointInPart (da, (uint8_t) (sa >> 16)) << 16;
+            p = fbCombineConjointInPart (da, (uint8_t) (sa >> 24)) << 24;
+            Fb = m|n|o|p;
+            break;
+        case CombineB:
+            Fb = 0xffffffff;
+            break;
+        }
+        /* one FbGen per channel, at bit offsets 0, 8, 16 and 24 */
+        m = FbGen (s,d,0,FbGet8(Fa,0),FbGet8(Fb,0),t, u, v);
+        n = FbGen (s,d,8,FbGet8(Fa,8),FbGet8(Fb,8),t, u, v);
+        o = FbGen (s,d,16,FbGet8(Fa,16),FbGet8(Fb,16),t, u, v);
+        p = FbGen (s,d,24,FbGet8(Fa,24),FbGet8(Fb,24),t, u, v);
+        s = m|n|o|p;
+        WRITE(dest + i, s);
+    }
+}
+
+/*
+ * Conjoint Over entry point of the "C" combiner family: forwards the
+ * scanline to fbCombineConjointGeneralC with the CombineAOver selector.
+ */
+static FASTCALL void
+fbCombineConjointOverC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
+{
+    fbCombineConjointGeneralC (dest, src, mask, width, CombineAOver);
+}
+
+/*
+ * Conjoint OverReverse entry point of the "C" combiner family: forwards the
+ * scanline to fbCombineConjointGeneralC with the CombineBOver selector.
+ */
+static FASTCALL void
+fbCombineConjointOverReverseC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
+{
+    fbCombineConjointGeneralC (dest, src, mask, width, CombineBOver);
+}
+
+/*
+ * Conjoint In entry point of the "C" combiner family: forwards the
+ * scanline to fbCombineConjointGeneralC with the CombineAIn selector.
+ */
+static FASTCALL void
+fbCombineConjointInC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
+{
+    fbCombineConjointGeneralC (dest, src, mask, width, CombineAIn);
+}
+
+/*
+ * Conjoint InReverse entry point of the "C" combiner family: forwards the
+ * scanline to fbCombineConjointGeneralC with the CombineBIn selector.
+ */
+static FASTCALL void
+fbCombineConjointInReverseC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
+{
+    fbCombineConjointGeneralC (dest, src, mask, width, CombineBIn);
+}
+
+/*
+ * Conjoint Out entry point of the "C" combiner family: forwards the
+ * scanline to fbCombineConjointGeneralC with the CombineAOut selector.
+ */
+static FASTCALL void
+fbCombineConjointOutC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
+{
+    fbCombineConjointGeneralC (dest, src, mask, width, CombineAOut);
+}
+
+/*
+ * Conjoint OutReverse entry point of the "C" combiner family: forwards the
+ * scanline to fbCombineConjointGeneralC with the CombineBOut selector.
+ */
+static FASTCALL void
+fbCombineConjointOutReverseC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
+{
+    fbCombineConjointGeneralC (dest, src, mask, width, CombineBOut);
+}
+
+/*
+ * Conjoint Atop entry point of the "C" combiner family: forwards the
+ * scanline to fbCombineConjointGeneralC with the CombineAAtop selector.
+ */
+static FASTCALL void
+fbCombineConjointAtopC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
+{
+    fbCombineConjointGeneralC (dest, src, mask, width, CombineAAtop);
+}
+
+/*
+ * Conjoint AtopReverse entry point of the "C" combiner family: forwards the
+ * scanline to fbCombineConjointGeneralC with the CombineBAtop selector.
+ */
+static FASTCALL void
+fbCombineConjointAtopReverseC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
+{
+    fbCombineConjointGeneralC (dest, src, mask, width, CombineBAtop);
+}
+
+/*
+ * Conjoint Xor entry point of the "C" combiner family: forwards the
+ * scanline to fbCombineConjointGeneralC with the CombineXor selector.
+ */
+static FASTCALL void
+fbCombineConjointXorC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
+{
+    fbCombineConjointGeneralC (dest, src, mask, width, CombineXor);
+}
+
+/*
+ * Dispatch table of the "C" combiner functions.  The position of each entry
+ * is significant: the plain operators occupy 0x00-0x0d, the disjoint
+ * variants 0x10-0x1b and the conjoint variants 0x20-0x2b, with NULL filling
+ * the gaps and the Dest slots.
+ * NOTE(review): the index is presumably the compositing operator code;
+ * callers must cope with the NULL entries -- confirm at the call sites.
+ */
+static CombineFuncC fbCombineFuncC[] = {
+    fbCombineClearC,       /* 0x00 */
+    fbCombineSrcC,
+    NULL, /* Dest */
+    fbCombineOverC,
+    fbCombineOverReverseC,
+    fbCombineInC,
+    fbCombineInReverseC,
+    fbCombineOutC,
+    fbCombineOutReverseC,
+    fbCombineAtopC,
+    fbCombineAtopReverseC,
+    fbCombineXorC,
+    fbCombineAddC,
+    fbCombineSaturateC,    /* 0x0d */
+    NULL,                  /* 0x0e: no entry */
+    NULL,                  /* 0x0f: no entry */
+    fbCombineClearC,       /* 0x10 */
+    fbCombineSrcC,
+    NULL, /* Dest */
+    fbCombineDisjointOverC,
+    /* the Saturate combiner is reused for the DisjointOverReverse slot */
+    fbCombineSaturateC, /* DisjointOverReverse */
+    fbCombineDisjointInC,
+    fbCombineDisjointInReverseC,
+    fbCombineDisjointOutC,
+    fbCombineDisjointOutReverseC,
+    fbCombineDisjointAtopC,
+    fbCombineDisjointAtopReverseC,
+    fbCombineDisjointXorC,  /* 0x1b */
+    NULL,                  /* 0x1c: no entry */
+    NULL,                  /* 0x1d: no entry */
+    NULL,                  /* 0x1e: no entry */
+    NULL,                  /* 0x1f: no entry */
+    fbCombineClearC,       /* 0x20 */
+    fbCombineSrcC,
+    NULL, /* Dest */
+    fbCombineConjointOverC,
+    fbCombineConjointOverReverseC,
+    fbCombineConjointInC,
+    fbCombineConjointInReverseC,
+    fbCombineConjointOutC,
+    fbCombineConjointOutReverseC,
+    fbCombineConjointAtopC,
+    fbCombineConjointAtopReverseC,
+    fbCombineConjointXorC, /* 0x2b */
+};
+
+
+/*
+ * Globally visible bundle of the combiner tables.  The initializer is
+ * positional, so the order below must match the member order of
+ * FbComposeFunctions (declared outside this chunk).
+ */
+FbComposeFunctions composeFunctions = {
+    fbCombineFuncU,
+    fbCombineFuncC,
+    fbCombineMaskU
+};
+
+
+/*
+ * Fill a scanline buffer with a single colour taken from a bits image.
+ *
+ * The colour is obtained by fetching the pixel at offset 0 of the image's
+ * bits with the image's pixel fetcher; x, y, mask and maskBits are accepted
+ * for signature compatibility but are not consulted.  Exactly `width`
+ * entries of `buffer` are written (none when width <= 0).
+ */
+static void fbFetchSolid(bits_image_t * pict, int x, int y, int width, uint32_t *buffer, uint32_t *mask, uint32_t maskBits)
+{
+    fetchPixelProc    fetch_pixel = fetchPixelProcForPicture(pict);
+    pixman_indexed_t *palette = pict->indexed;
+    uint32_t         *first = pict->bits;
+    uint32_t          pixel;
+    int               i;
+
+    /* One lookup is enough: every output pixel gets the same value. */
+    pixel = fetch_pixel(first, 0, palette);
+
+    for (i = 0; i < width; i++)
+        WRITE(buffer + i, pixel);
+
+    fbFinishAccess (pict->pDrawable);
+}
+
+/*
+ * Fetch `width` pixels of row `y`, starting at column `x`, from a bits
+ * image into `buffer`, using the scanline fetcher selected for the image.
+ *
+ * mask and maskBits are accepted for signature compatibility but are not
+ * consulted.  The row start is computed as bits + y * rowstride, with the
+ * stride held in a uint32_t exactly as before.
+ */
+static void fbFetch(bits_image_t * pict, int x, int y, int width, uint32_t *buffer, uint32_t *mask, uint32_t maskBits)
+{
+    fetchProc         fetch_row = fetchProcForPicture(pict);
+    pixman_indexed_t *palette = pict->indexed;
+    uint32_t         *row = pict->bits;
+    uint32_t          stride = pict->rowstride;
+
+    /* advance to the first word of the requested scanline */
+    row += y*stride;
+
+    fetch_row(row, x, width, buffer, palette);
+    fbFinishAccess (pict->pDrawable);
+}
+
+/*
+ * Integer helpers with floor semantics (C's / and % truncate toward zero).
+ *
+ * MOD(a,b): remainder of a modulo b mapped into 0 .. b-1 for positive b,
+ *           also when a is negative (e.g. MOD(-1,5) == 4).
+ * DIV(a,b): quotient rounded toward negative infinity
+ *           (e.g. DIV(-1,2) == -1, DIV(1,-2) == -1).
+ *
+ * Both macros evaluate their arguments more than once, so arguments must
+ * not have side effects.
+ */
+#define MOD(a,b) ((a) < 0 ? ((b) - ((-(a) - 1) % (b))) - 1 : (a) % (b))
+#define DIV(a,b) ((((a) < 0) == ((b) < 0)) ? (a) / (b) :               \
+                 ((a) - (b) + 1 - (((b) < 0) << 1)) / (b))
+
+/*
+ * Cursor over a gradient's colour stops.  It caches the interval
+ * [left_x, right_x) that contained the last requested position together
+ * with the colours at both ends, so that further lookups inside the same
+ * interval only need an interpolation (see _gradient_walker_pixel).
+ */
+typedef struct
+{
+    /* End colours of the cached interval, 8 bits per channel, packed by
+     * _gradient_walker_reset as (alpha << 16) | green  and  (red << 16) | blue. */
+    uint32_t        left_ag;
+    uint32_t        left_rb;
+    uint32_t        right_ag;
+    uint32_t        right_rb;
+    /* Bounds of the cached interval; left_x is inclusive, right_x exclusive
+     * (see GRADIENT_WALKER_NEED_RESET). */
+    int32_t       left_x;
+    int32_t       right_x;
+    /* Rounded (1 << 24) / (right_x - left_x), or 0 when no interpolation is
+     * needed (empty interval or identical end colours). */
+    int32_t       stepper;
+    
+    /* Stop array and count borrowed from the gradient, plus the repeat mode
+     * that decides how positions outside the stops are handled. */
+    gradient_stop_t    *stops;
+    int                      num_stops;
+    unsigned int             spread;
+    
+    /* Non-zero until the first _gradient_walker_reset has filled the cache. */
+    int                  need_reset;
+} GradientWalker;
+
+/*
+ * Prepare a walker for the given gradient and repeat mode.
+ *
+ * The stop array is borrowed from the gradient, not copied.  The cached
+ * interval and colours are set to placeholder values and need_reset is
+ * raised, so the first pixel lookup recomputes them.
+ */
+static void
+_gradient_walker_init (GradientWalker  *walker,
+                      gradient_t      *gradient,
+                      unsigned int     spread)
+{
+    /* what to walk, and how to treat positions outside the stops */
+    walker->stops     = gradient->stops;
+    walker->num_stops = gradient->n_stops;
+    walker->spread    = spread;
+    
+    /* placeholder cache contents; replaced by the first reset */
+    walker->left_x   = 0;
+    walker->right_x  = 0x10000;
+    walker->left_ag  = walker->left_rb  = 0;
+    walker->right_ag = walker->right_rb = 0;
+    walker->stepper  = 0;
+    
+    walker->need_reset = TRUE;
+}
+
+/*
+ * Recompute the walker's cached interval so that it contains `pos`.
+ *
+ * Depending on walker->spread, the pair of stops bracketing `pos` is
+ * located, the interval bounds are stored in left_x/right_x, the end
+ * colours are packed into left_ag/left_rb/right_ag/right_rb, and the
+ * interpolation step `stepper` is derived from the interval width.
+ * need_reset is cleared on exit.
+ *
+ * Positions are handled as fixed-point values in which 0x10000 is one full
+ * period of the gradient (see the 0xFFFF / 0x10000 masks below).
+ *
+ * NOTE(review): every branch indexes stops[] assuming num_stops >= 1;
+ * there is no guard for an empty stop list here -- presumably guaranteed
+ * by whoever creates the gradient, confirm.
+ */
+static void
+_gradient_walker_reset (GradientWalker  *walker,
+                        pixman_fixed_32_32_t     pos)
+{
+    int32_t                  x, left_x, right_x;
+    pixman_color_t          *left_c, *right_c;
+    int                      n, count = walker->num_stops;
+    gradient_stop_t *      stops = walker->stops;
+    
+    static const pixman_color_t   transparent_black = { 0, 0, 0, 0 };
+    
+    switch (walker->spread)
+    {
+    case PIXMAN_REPEAT_NORMAL:
+       /* x is pos reduced to a single period; n is the first stop beyond it */
+       x = (int32_t)pos & 0xFFFF;
+       for (n = 0; n < count; n++)
+           if (x < stops[n].x)
+               break;
+       if (n == 0) {
+           /* before the first stop: wrap to the last stop of the previous period */
+           left_x =  stops[count-1].x - 0x10000;
+           left_c = &stops[count-1].color;
+       } else {
+           left_x =  stops[n-1].x;
+           left_c = &stops[n-1].color;
+       }
+       
+       if (n == count) {
+           /* past the last stop: wrap to the first stop of the next period */
+           right_x =  stops[0].x + 0x10000;
+           right_c = &stops[0].color;
+       } else {
+           right_x =  stops[n].x;
+           right_c = &stops[n].color;
+       }
+       /* translate the interval from the reduced period back to pos's period */
+       left_x  += (pos - x);
+       right_x += (pos - x);
+       break;
+       
+    case PIXMAN_REPEAT_PAD:
+       for (n = 0; n < count; n++)
+           if (pos < stops[n].x)
+               break;
+       
+       /* outside the stops the nearest end colour is used on both sides and
+        * the interval is extended to the int32 limits */
+       if (n == 0) {
+           left_x =  INT32_MIN;
+           left_c = &stops[0].color;
+       } else {
+           left_x =  stops[n-1].x;
+           left_c = &stops[n-1].color;
+       }
+       
+       if (n == count) {
+           right_x =  INT32_MAX;
+           right_c = &stops[n-1].color;
+       } else {
+           right_x =  stops[n].x;
+           right_c = &stops[n].color;
+       }
+       break;
+       
+    case PIXMAN_REPEAT_REFLECT:
+       /* reduce to one period; periods with bit 0x10000 set run backwards */
+       x = (int32_t)pos & 0xFFFF;
+       if ((int32_t)pos & 0x10000)
+           x = 0x10000 - x;
+       for (n = 0; n < count; n++)
+           if (x < stops[n].x)
+               break;
+       
+       if (n == 0) {
+           /* mirror image of the first stop on the other side of 0 */
+           left_x =  -stops[0].x;
+           left_c = &stops[0].color;
+       } else {
+           left_x =  stops[n-1].x;
+           left_c = &stops[n-1].color;
+       }
+       
+       if (n == count) {
+           /* mirror image of the last stop on the other side of 0x10000 */
+           right_x = 0x20000 - stops[n-1].x;
+           right_c = &stops[n-1].color;
+       } else {
+           right_x =  stops[n].x;
+           right_c = &stops[n].color;
+       }
+       
+       if ((int32_t)pos & 0x10000) {
+           /* reversed period: mirror the interval and swap its end colours */
+           pixman_color_t  *tmp_c;
+           int32_t          tmp_x;
+           
+           tmp_x   = 0x10000 - right_x;
+           right_x = 0x10000 - left_x;
+           left_x  = tmp_x;
+           
+           tmp_c   = right_c;
+           right_c = left_c;
+           left_c  = tmp_c;
+           
+           x = 0x10000 - x;
+       }
+       /* translate the interval back to pos's period */
+       left_x  += (pos - x);
+       right_x += (pos - x);
+       break;
+       
+    default:  /* RepeatNone */
+       for (n = 0; n < count; n++)
+           if (pos < stops[n].x)
+               break;
+       
+       /* outside the stops the gradient is transparent black */
+       if (n == 0)
+       {
+           left_x  =  INT32_MIN;
+           right_x =  stops[0].x;
+           left_c  = right_c = (pixman_color_t*) &transparent_black;
+       }
+       else if (n == count)
+       {
+           left_x  = stops[n-1].x;
+           right_x = INT32_MAX;
+           left_c  = right_c = (pixman_color_t*) &transparent_black;
+       }
+       else
+       {
+           left_x  =  stops[n-1].x;
+           right_x =  stops[n].x;
+           left_c  = &stops[n-1].color;
+           right_c = &stops[n].color;
+       }
+    }
+    
+    /* keep the top 8 bits of each 16-bit channel, packed two per word */
+    walker->left_x   = left_x;
+    walker->right_x  = right_x;
+    walker->left_ag  = ((left_c->alpha >> 8) << 16)   | (left_c->green >> 8);
+    walker->left_rb  = ((left_c->red & 0xff00) << 8)  | (left_c->blue >> 8);
+    walker->right_ag = ((right_c->alpha >> 8) << 16)  | (right_c->green >> 8);
+    walker->right_rb = ((right_c->red & 0xff00) << 8) | (right_c->blue >> 8);
+    
+    /* No interpolation for an empty interval or identical end colours.  The
+     * PAD and RepeatNone out-of-range cases above always have identical end
+     * colours, so the width of their INT32_MIN/INT32_MAX intervals is never
+     * computed. */
+    if ( walker->left_x == walker->right_x                ||
+        ( walker->left_ag == walker->right_ag &&
+          walker->left_rb == walker->right_rb )   )
+    {
+       walker->stepper = 0;
+    }
+    else
+    {
+       /* (1 << 24) / width, rounded to nearest */
+       int32_t width = right_x - left_x;
+       walker->stepper = ((1 << 24) + width/2)/width;
+    }
+    
+    walker->need_reset = FALSE;
+}
+
+/*
+ * True when walker w has no valid cached interval yet, or when position x
+ * lies outside its cached interval [left_x, right_x).  Evaluates w more
+ * than once.
+ */
+#define  GRADIENT_WALKER_NEED_RESET(w,x)                               \
+    ( (w)->need_reset || (x) < (w)->left_x || (x) >= (w)->right_x)
+
+
+/*
+ * Return the gradient colour at position x as a 32-bit pixel with alpha in
+ * bits 24-31, red in 16-23, green in 8-15 and blue in 0-7; the colour
+ * channels are multiplied by alpha.
+ *
+ * The walker's cached interval is refreshed here when
+ * GRADIENT_WALKER_NEED_RESET(walker, x) is true, so callers do not have to
+ * reset the walker themselves.
+ */
+static uint32_t
+_gradient_walker_pixel (GradientWalker  *walker,
+                        pixman_fixed_32_32_t     x)
+{
+    int  dist, idist;
+    uint32_t  t1, t2, a, color;
+    
+    if (GRADIENT_WALKER_NEED_RESET (walker, x))
+        _gradient_walker_reset (walker, x);
+    
+    /* stepper is about (1 << 24) / width, so dist is the offset into the
+     * interval scaled to 0..256; with stepper == 0 the left colour is used */
+    dist  = ((int)(x - walker->left_x)*walker->stepper) >> 16;
+    idist = 256 - dist;
+    
+    /* combined INTERPOLATE and premultiply */
+    /* red and blue are blended together, one per 16-bit half of the word */
+    t1 = walker->left_rb*idist + walker->right_rb*dist;
+    t1 = (t1 >> 8) & 0xff00ff;
+    
+    /* same for alpha and green; the results stay in the upper byte of each half */
+    t2  = walker->left_ag*idist + walker->right_ag*dist;
+    t2 &= 0xff00ff00;
+    
+    color = t2 & 0xff000000;
+    a     = t2 >> 24;
+    
+    /* multiply red/blue by alpha and divide by 255 with rounding */
+    t1  = t1*a + 0x800080;
+    t1  = (t1 + ((t1 >> 8) & 0xff00ff)) >> 8;
+    
+    /* same for green; its result is left in bits 8-15, where it is masked out below */
+    t2  = (t2 >> 8)*a + 0x800080;
+    t2  = (t2 + ((t2 >> 8) & 0xff00ff));
+    
+    return (color | (t1 & 0xff00ff) | (t2 & 0xff00));
+}
+
+/*
+ * Generate one scanline of a source image (solid fill or linear, radial or
+ * conical gradient) into `buffer`.
+ *
+ *   pict      the source image; its common.type selects the generator
+ *   x, y      coordinates of the first pixel; the pixel centre (+0.5) is
+ *             used as the sample point
+ *   width     number of pixels to produce
+ *   buffer    destination for the 32-bit pixels
+ *   mask      optional (may be NULL); when given, a gradient pixel is only
+ *             computed and written if the matching mask word has a bit of
+ *             maskBits set, otherwise that buffer entry is left untouched
+ *             (solid fills and SOURCE_IMAGE_CLASS_VERTICAL linear gradients
+ *             ignore the mask and fill every pixel)
+ *
+ * If a transform is set and PictureTransformPoint3d fails, the function
+ * returns without writing anything.
+ */
+static void fbFetchSourcePict(source_image_t * pict, int x, int y, int width, uint32_t *buffer, uint32_t *mask, uint32_t maskBits)
+{
+#if 0
+    SourcePictPtr   pGradient = pict->pSourcePict;
+#endif
+    GradientWalker  walker;
+    uint32_t       *end = buffer + width;
+    gradient_t     *gradient;
+    
+    if (pict->common.type == SOLID)
+    {
+       register uint32_t color = ((solid_fill_t *)pict)->color;
+       
+       while (buffer < end)
+           WRITE(buffer++, color);
+       
+       return;
+    }
+    
+    gradient = (gradient_t *)pict;
+    
+    _gradient_walker_init (&walker, gradient, pict->common.repeat);
+    
+    if (pict->common.type == LINEAR) {
+       pixman_vector_t v, unit;
+       pixman_fixed_32_32_t l;
+       pixman_fixed_48_16_t dx, dy, a, b, off;
+       linear_gradient_t *linear = (linear_gradient_t *)pict;
+       
+        /* reference point is the center of the pixel */
+        v.vector[0] = pixman_int_to_fixed(x) + pixman_fixed_1/2;
+        v.vector[1] = pixman_int_to_fixed(y) + pixman_fixed_1/2;
+        v.vector[2] = pixman_fixed_1;
+        /* unit is the change of the transformed point per destination pixel in x */
+        if (pict->common.transform) {
+            if (!PictureTransformPoint3d (pict->common.transform, &v))
+                return;
+            unit.vector[0] = pict->common.transform->matrix[0][0];
+            unit.vector[1] = pict->common.transform->matrix[1][0];
+            unit.vector[2] = pict->common.transform->matrix[2][0];
+        } else {
+            unit.vector[0] = pixman_fixed_1;
+            unit.vector[1] = 0;
+            unit.vector[2] = 0;
+        }
+       
+        /* The gradient parameter is the projection of the point onto p1->p2:
+         * t = a*x + b*y + off, with a = dx/l, b = dy/l and l = |p2 - p1|^2.
+         * a, b and off are only assigned when l != 0; every path that reads
+         * them is reached only with l != 0. */
+        dx = linear->p2.x - linear->p1.x;
+        dy = linear->p2.y - linear->p1.y;
+        l = dx*dx + dy*dy;
+        if (l != 0) {
+            a = (dx << 32) / l;
+            b = (dy << 32) / l;
+            off = (-a*linear->p1.x - b*linear->p1.y)>>16;
+        }
+        if (l == 0  || (unit.vector[2] == 0 && v.vector[2] == pixman_fixed_1)) {
+            pixman_fixed_48_16_t inc, t;
+            /* affine transformation only */
+            /* t changes by a constant inc per pixel; a degenerate gradient
+             * (p1 == p2) uses t = 0 everywhere */
+            if (l == 0) {
+                t = 0;
+                inc = 0;
+            } else {
+                t = ((a*v.vector[0] + b*v.vector[1]) >> 16) + off;
+                inc = (a * unit.vector[0] + b * unit.vector[1]) >> 16;
+            }
+           
+           if (pict->class == SOURCE_IMAGE_CLASS_VERTICAL)
+           {
+               /* one colour for the whole scanline: compute it once */
+               register uint32_t color;
+               
+               color = _gradient_walker_pixel( &walker, t );
+               while (buffer < end)
+                   WRITE(buffer++, color);
+           }
+           else
+           {
+                if (!mask) {
+                    while (buffer < end)
+                    {
+                        WRITE(buffer, _gradient_walker_pixel (&walker, t));
+                        buffer += 1;
+                        t      += inc;
+                    }
+                } else {
+                    /* masked-out pixels are skipped, but t still advances */
+                    while (buffer < end) {
+                        if (*mask++ & maskBits)
+                        {
+                            WRITE(buffer, _gradient_walker_pixel (&walker, t));
+                        }
+                        buffer += 1;
+                        t      += inc;
+                    }
+                }
+           }
+       }
+       else /* projective transformation */
+       {
+           pixman_fixed_48_16_t t;
+           
+           if (pict->class == SOURCE_IMAGE_CLASS_VERTICAL)
+           {
+               register uint32_t color;
+               
+               /* a zero homogeneous coordinate cannot be divided out; use t = 0 */
+               if (v.vector[2] == 0)
+               {
+                   t = 0;
+               }
+               else
+               {
+                   pixman_fixed_48_16_t x, y;
+                   
+                   x = ((pixman_fixed_48_16_t) v.vector[0] << 16) / v.vector[2];
+                   y = ((pixman_fixed_48_16_t) v.vector[1] << 16) / v.vector[2];
+                   t = ((a * x + b * y) >> 16) + off;
+               }
+               
+               color = _gradient_walker_pixel( &walker, t );
+               while (buffer < end)
+                   WRITE(buffer++, color);
+           }
+           else
+           {
+               /* per-pixel perspective divide; v is stepped by unit every pixel */
+               while (buffer < end)
+               {
+                   if (!mask || *mask++ & maskBits)
+                   {
+                       if (v.vector[2] == 0) {
+                           t = 0;
+                       } else {
+                           pixman_fixed_48_16_t x, y;
+                           x = ((pixman_fixed_48_16_t)v.vector[0] << 16) / v.vector[2];
+                           y = ((pixman_fixed_48_16_t)v.vector[1] << 16) / v.vector[2];
+                           t = ((a*x + b*y) >> 16) + off;
+                       }
+                       WRITE(buffer, _gradient_walker_pixel (&walker, t));
+                   }
+                   ++buffer;
+                   v.vector[0] += unit.vector[0];
+                   v.vector[1] += unit.vector[1];
+                   v.vector[2] += unit.vector[2];
+               }
+            }
+        }
+    } else {
+       
+/*
+ * In the radial gradient problem we are given two circles (c₁,r₁) and
+ * (c₂,r₂) that define the gradient itself. Then, for any point p, we
+ * must compute the value(s) of t within [0.0, 1.0] representing the
+ * circle(s) that would color the point.
+ *
+ * There are potentially two values of t since the point p can be
+ * colored by both sides of the circle, (which happens whenever one
+ * circle is not entirely contained within the other).
+ *
+ * If we solve for a value of t that is outside of [0.0, 1.0] then we
+ * use the extend mode (NONE, REPEAT, REFLECT, or PAD) to map to a
+ * value within [0.0, 1.0].
+ *
+ * Here is an illustration of the problem:
+ *
+ *              p₂
+ *           p  •
+ *           •   ╲
+ *        ·       ╲r₂
+ *  p₁ ·           ╲
+ *  •              θ╲
+ *   ╲             ╌╌•
+ *    ╲r₁        ·   c₂
+ *    θ╲    ·
+ *    ╌╌•
+ *      c₁
+ *
+ * Given (c₁,r₁), (c₂,r₂) and p, we must find an angle θ such that two
+ * points p₁ and p₂ on the two circles are collinear with p. Then, the
+ * desired value of t is the ratio of the length of p₁p to the length
+ * of p₁p₂.
+ *
+ * So, we have six unknown values: (p₁x, p₁y), (p₂x, p₂y), θ and t.
+ * We can also write six equations that constrain the problem:
+ *
+ * Point p₁ is a distance r₁ from c₁ at an angle of θ:
+ *
+ *     1. p₁x = c₁x + r₁·cos θ
+ *     2. p₁y = c₁y + r₁·sin θ
+ *
+ * Point p₂ is a distance r₂ from c₂ at an angle of θ:
+ *
+ *     3. p₂x = c₂x + r₂·cos θ
+ *     4. p₂y = c₂y + r₂·sin θ
+ *
+ * Point p lies at a fraction t along the line segment p₁p₂:
+ *
+ *     5. px = t·p₂x + (1-t)·p₁x
+ *     6. py = t·p₂y + (1-t)·p₁y
+ *
+ * To solve, first substitute 1-4 into 5 and 6:
+ *
+ * px = t·(c₂x + r₂·cos θ) + (1-t)·(c₁x + r₁·cos θ)
+ * py = t·(c₂y + r₂·sin θ) + (1-t)·(c₁y + r₁·sin θ)
+ *
+ * Then solve each for cos θ and sin θ expressed as a function of t:
+ *
+ * cos θ = (-(c₂x - c₁x)·t + (px - c₁x)) / ((r₂-r₁)·t + r₁)
+ * sin θ = (-(c₂y - c₁y)·t + (py - c₁y)) / ((r₂-r₁)·t + r₁)
+ *
+ * To simplify this a bit, we define new variables for several of the
+ * common terms as shown below:
+ *
+ *              p₂
+ *           p  •
+ *           •   ╲
+ *        ·  ┆    ╲r₂
+ *  p₁ ·     ┆     ╲
+ *  •     pdy┆      ╲
+ *   ╲       ┆       •c₂
+ *    ╲r₁    ┆   ·   ┆
+ *     ╲    ·┆       ┆cdy
+ *      •╌╌╌╌┴╌╌╌╌╌╌╌┘
+ *    c₁  pdx   cdx
+ *
+ * cdx = (c₂x - c₁x)
+ * cdy = (c₂y - c₁y)
+ *  dr =  r₂-r₁
+ * pdx =  px - c₁x
+ * pdy =  py - c₁y
+ *
+ * Note that cdx, cdy, and dr do not depend on point p at all, so can
+ * be pre-computed for the entire gradient. The simplified equations
+ * are now:
+ *
+ * cos θ = (-cdx·t + pdx) / (dr·t + r₁)
+ * sin θ = (-cdy·t + pdy) / (dr·t + r₁)
+ *
+ * Finally, to get a single function of t and eliminate the last
+ * unknown θ, we use the identity sin²θ + cos²θ = 1. First, square
+ * each equation, (we knew a quadratic was coming since it must be
+ * possible to obtain two solutions in some cases):
+ *
+ * cos²θ = (cdx²t² - 2·cdx·pdx·t + pdx²) / (dr²·t² + 2·r₁·dr·t + r₁²)
+ * sin²θ = (cdy²t² - 2·cdy·pdy·t + pdy²) / (dr²·t² + 2·r₁·dr·t + r₁²)
+ *
+ * Then add both together, set the result equal to 1, and express as a
+ * standard quadratic equation in t of the form At² + Bt + C = 0
+ *
+ * (cdx² + cdy² - dr²)·t² - 2·(cdx·pdx + cdy·pdy + r₁·dr)·t + (pdx² + pdy² - r₁²) = 0
+ *
+ * In other words:
+ *
+ * A = cdx² + cdy² - dr²
+ * B = -2·(pdx·cdx + pdy·cdy + r₁·dr)
+ * C = pdx² + pdy² - r₁²
+ *
+ * And again, notice that A does not depend on p, so can be
+ * precomputed. From here we just use the quadratic formula to solve
+ * for t:
+ *
+ * t = (-B ± ⎷(B² - 4·A·C)) / (2·A)
+ */
+        /* radial or conical */
+        /* (cx, cy, cz) is the per-pixel step of the sample point (rx, ry, rz);
+         * the defaults describe the untransformed case */
+        pixman_bool_t affine = TRUE;
+        double cx = 1.;
+        double cy = 0.;
+        double cz = 0.;
+       double rx = x + 0.5;
+       double ry = y + 0.5;
+        double rz = 1.;
+       
+        if (pict->common.transform) {
+            pixman_vector_t v;
+            /* reference point is the center of the pixel */
+            v.vector[0] = pixman_int_to_fixed(x) + pixman_fixed_1/2;
+            v.vector[1] = pixman_int_to_fixed(y) + pixman_fixed_1/2;
+            v.vector[2] = pixman_fixed_1;
+            if (!PictureTransformPoint3d (pict->common.transform, &v))
+                return;
+           
+            /* convert the 16.16 fixed-point values to doubles */
+            cx = pict->common.transform->matrix[0][0]/65536.;
+            cy = pict->common.transform->matrix[1][0]/65536.;
+            cz = pict->common.transform->matrix[2][0]/65536.;
+            rx = v.vector[0]/65536.;
+            ry = v.vector[1]/65536.;
+            rz = v.vector[2]/65536.;
+            affine = pict->common.transform->matrix[2][0] == 0 && v.vector[2] == pixman_fixed_1;
+        }
+       
+        if (pict->common.type == RADIAL) {
+           radial_gradient_t *radial = (radial_gradient_t *)pict;
+            /* NOTE(review): both loops below divide by 2 * radial->A without
+             * checking for A == 0 -- presumably excluded when the gradient is
+             * created; confirm. */
+            if (affine) {
+                while (buffer < end) {
+                   if (!mask || *mask++ & maskBits)
+                   {
+                       double pdx, pdy;
+                       double B, C;
+                       double det;
+                       double c1x = radial->c1.x / 65536.0;
+                       double c1y = radial->c1.y / 65536.0;
+                       double r1  = radial->c1.radius / 65536.0;
+                        pixman_fixed_48_16_t t;
+                       
+                       pdx = rx - c1x;
+                       pdy = ry - c1y;
+                       
+                       B = -2 * (  pdx * radial->cdx
+                                   + pdy * radial->cdy
+                                   + r1 * radial->dr);
+                       C = (pdx * pdx + pdy * pdy - r1 * r1);
+                       
+                       /* a negative discriminant (no real solution) is clamped to 0 */
+                        det = (B * B) - (4 * radial->A * C);
+                       if (det < 0.0)
+                           det = 0.0;
+                       
+                       /* the root is chosen by the sign of A; the result is
+                        * converted to 16.16 fixed point */
+                       if (radial->A < 0)
+                           t = (pixman_fixed_48_16_t) ((- B - sqrt(det)) / (2.0 * radial->A) * 65536);
+                       else
+                           t = (pixman_fixed_48_16_t) ((- B + sqrt(det)) / (2.0 * radial->A) * 65536);
+                       
+                       WRITE(buffer, _gradient_walker_pixel (&walker, t));
+                   }
+                   ++buffer;
+                   
+                    rx += cx;
+                    ry += cy;
+                }
+            } else {
+               /* projective */
+                while (buffer < end) {
+                   if (!mask || *mask++ & maskBits)
+                   {
+                       double pdx, pdy;
+                       double B, C;
+                       double det;
+                       double c1x = radial->c1.x / 65536.0;
+                       double c1y = radial->c1.y / 65536.0;
+                       double r1  = radial->c1.radius / 65536.0;
+                        pixman_fixed_48_16_t t;
+                       double x, y;
+                       
+                       /* perspective divide; a zero rz maps to the origin */
+                       if (rz != 0) {
+                           x = rx/rz;
+                           y = ry/rz;
+                       } else {
+                           x = y = 0.;
+                       }
+                       
+                       pdx = x - c1x;
+                       pdy = y - c1y;
+                       
+                       B = -2 * (  pdx * radial->cdx
+                                   + pdy * radial->cdy
+                                   + r1 * radial->dr);
+                       C = (pdx * pdx + pdy * pdy - r1 * r1);
+                       
+                        det = (B * B) - (4 * radial->A * C);
+                       if (det < 0.0)
+                           det = 0.0;
+                       
+                       if (radial->A < 0)
+                           t = (pixman_fixed_48_16_t) ((- B - sqrt(det)) / (2.0 * radial->A) * 65536);
+                       else
+                           t = (pixman_fixed_48_16_t) ((- B + sqrt(det)) / (2.0 * radial->A) * 65536);
+                       
+                       WRITE(buffer, _gradient_walker_pixel (&walker, t));
+                   }
+                   ++buffer;
+                   
+                    rx += cx;
+                    ry += cy;
+                   rz += cz;
+                }
+            }
+        } else /* SourcePictTypeConical */ {
+           conical_gradient_t *conical = (conical_gradient_t *)pict;
+            /* NOTE(review): a is angle / (180 * 65536), i.e. without a factor of
+             * pi, yet it is added to the atan2() result (radians) below --
+             * confirm the intended unit of conical->angle. */
+            double a = conical->angle/(180.*65536);
+            if (affine) {
+                /* make the sample point relative to the centre once, up front */
+                rx -= conical->center.x/65536.;
+                ry -= conical->center.y/65536.;
+               
+                while (buffer < end) {
+                   double angle;
+                   
+                    if (!mask || *mask++ & maskBits)
+                   {
+                        pixman_fixed_48_16_t   t;
+                       
+                       /* t is the angle as a fraction of a full turn, in 16.16 */
+                        angle = atan2(ry, rx) + a;
+                       t     = (pixman_fixed_48_16_t) (angle * (65536. / (2*M_PI)));
+                       
+                       WRITE(buffer, _gradient_walker_pixel (&walker, t));
+                   }
+                   
+                    ++buffer;
+                    rx += cx;
+                    ry += cy;
+                }
+            } else {
+                while (buffer < end) {
+                    double x, y;
+                    double angle;
+                   
+                    if (!mask || *mask++ & maskBits)
+                    {
+                       pixman_fixed_48_16_t  t;
+                       
+                       /* perspective divide; a zero rz maps to the origin */
+                       if (rz != 0) {
+                           x = rx/rz;
+                           y = ry/rz;
+                       } else {
+                           x = y = 0.;
+                       }
+                       x -= conical->center.x/65536.;
+                       y -= conical->center.y/65536.;
+                       angle = atan2(y, x) + a;
+                       t     = (pixman_fixed_48_16_t) (angle * (65536. / (2*M_PI)));
+                       
+                       WRITE(buffer, _gradient_walker_pixel (&walker, t));
+                   }
+                   
+                    ++buffer;
+                    rx += cx;
+                    ry += cy;
+                    rz += cz;
+                }
+            }
+        }
+    }
+}
+
+static void fbFetchTransformed(bits_image_t * pict, int x, int y, int width, uint32_t *buffer, uint32_t *mask, uint32_t maskBits)
+{
+    uint32_t     *bits;
+    uint32_t    stride;
+    fetchPixelProc   fetch;
+    pixman_vector_t    v;
+    pixman_vector_t  unit;
+    int         i;
+    pixman_box16_t box;
+    pixman_indexed_t * indexed = (pixman_indexed_t *) pict->indexed;
+    pixman_bool_t affine = TRUE;
+    
+    fetch = fetchPixelProcForPicture(pict);
+
+    bits = pict->bits;
+    stride = pict->rowstride;
+    
+    /* reference point is the center of the pixel */
+    v.vector[0] = pixman_int_to_fixed(x) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed(y) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
+    
+    /* when using convolution filters one might get here without a transform */
+    if (pict->common.transform)
+    {
+        if (!PictureTransformPoint3d (pict->common.transform, &v))
+       {
+            fbFinishAccess (pict->pDrawable);
+            return;
+        }
+        unit.vector[0] = pict->common.transform->matrix[0][0];
+        unit.vector[1] = pict->common.transform->matrix[1][0];
+        unit.vector[2] = pict->common.transform->matrix[2][0];
+        affine = v.vector[2] == pixman_fixed_1 && unit.vector[2] == 0;
+    }
+    else
+    {
+        unit.vector[0] = pixman_fixed_1;
+        unit.vector[1] = 0;
+        unit.vector[2] = 0;
+    }
+    
+    if (pict->common.filter == PIXMAN_FILTER_NEAREST || pict->common.filter == PIXMAN_FILTER_FAST)
+    {
+        if (pict->common.repeat == PIXMAN_REPEAT_NORMAL) {
+            if (pixman_region_n_rects (pict->common.clip_region) == 1) {
+               for (i = 0; i < width; ++i) {
+                   if (!mask || mask[i] & maskBits)
+                   {
+                       if (!v.vector[2]) {
+                           WRITE(buffer + i, 0);
+                       } else {
+                           if (!affine) {
+                               y = MOD(DIV(v.vector[1],v.vector[2]), pict->height);
+                               x = MOD(DIV(v.vector[0],v.vector[2]), pict->width);
+                           } else {
+                               y = MOD(v.vector[1]>>16, pict->height);
+                               x = MOD(v.vector[0]>>16, pict->width);
+                           }
+                           WRITE(buffer + i, fetch(bits + y * stride, x, indexed));
+                       }
+                   }
+                   
+                    v.vector[0] += unit.vector[0];
+                    v.vector[1] += unit.vector[1];
+                    v.vector[2] += unit.vector[2];
+                }
+            } else {
+                for (i = 0; i < width; ++i) {
+                   if (!mask || mask[i] & maskBits)
+                   {
+                       if (!v.vector[2]) {
+                           WRITE(buffer + i, 0);
+                       } else {
+                           if (!affine) {
+                               y = MOD(DIV(v.vector[1],v.vector[2]), pict->height);
+                               x = MOD(DIV(v.vector[0],v.vector[2]), pict->width);
+                           } else {
+                               y = MOD(v.vector[1]>>16, pict->height);
+                               x = MOD(v.vector[0]>>16, pict->width);
+                           }
+                           if (pixman_region_contains_point (pict->common.clip_region, x, y, &box))
+                               WRITE(buffer + i, fetch(bits + y*stride, x, indexed));
+                           else
+                               WRITE(buffer + i, 0);
+                       }
+                   }
+                   
+                    v.vector[0] += unit.vector[0];
+                    v.vector[1] += unit.vector[1];
+                    v.vector[2] += unit.vector[2];
+                }
+            }
+        } else {
+            if (pixman_region_n_rects(pict->common.clip_region) == 1) {
+                box = pict->common.clip_region->extents;
+                for (i = 0; i < width; ++i) {
+                   if (!mask || mask[i] & maskBits)
+                   {
+                       if (!v.vector[2]) {
+                           WRITE(buffer + i, 0);
+                       } else {
+                           if (!affine) {
+                               y = DIV(v.vector[1],v.vector[2]);
+                               x = DIV(v.vector[0],v.vector[2]);
+                           } else {
+                               y = v.vector[1]>>16;
+                               x = v.vector[0]>>16;
+                           }
+                           WRITE(buffer + i, ((x < box.x1) | (x >= box.x2) | (y < box.y1) | (y >= box.y2)) ?
+                                 0 : fetch(bits + (y)*stride, x, indexed));
+                       }
+                   }
+                    v.vector[0] += unit.vector[0];
+                    v.vector[1] += unit.vector[1];
+                    v.vector[2] += unit.vector[2];
+                }
+            } else {
+                for (i = 0; i < width; ++i) {
+                    if (!mask || mask[i] & maskBits)
+                   {
+                       if (!v.vector[2]) {
+                           WRITE(buffer + i, 0);
+                       } else {
+                           if (!affine) {
+                               y = DIV(v.vector[1],v.vector[2]);
+                               x = DIV(v.vector[0],v.vector[2]);
+                           } else {
+                               y = v.vector[1]>>16;
+                               x = v.vector[0]>>16;
+                           }
+                           if (pixman_region_contains_point (pict->common.clip_region, x, y, &box))
+                               WRITE(buffer + i, fetch(bits + y*stride, x, indexed));
+                           else
+                               WRITE(buffer + i, 0);
+                       }
+                   }
+                    v.vector[0] += unit.vector[0];
+                    v.vector[1] += unit.vector[1];
+                    v.vector[2] += unit.vector[2];
+                }
+            }
+        }
+    } else if (pict->common.filter == PIXMAN_FILTER_BILINEAR   ||
+              pict->common.filter == PIXMAN_FILTER_GOOD        ||
+              pict->common.filter == PIXMAN_FILTER_BEST)
+    {
+        /* adjust vector for maximum contribution at 0.5, 0.5 of each texel. */
+        v.vector[0] -= v.vector[2] / 2;
+        v.vector[1] -= v.vector[2] / 2;
+        unit.vector[0] -= unit.vector[2] / 2;
+        unit.vector[1] -= unit.vector[2] / 2;
+       
+        if (pict->common.repeat == PIXMAN_REPEAT_NORMAL) {
+            if (pixman_region_n_rects(pict->common.clip_region) == 1) {
+                for (i = 0; i < width; ++i) {
+                    if (!mask || mask[i] & maskBits)
+                   {
+                       if (!v.vector[2]) {
+                           WRITE(buffer + i, 0);
+                       } else {
+                           int x1, x2, y1, y2, distx, idistx, disty, idisty;
+                           uint32_t *b;
+                           uint32_t tl, tr, bl, br, r;
+                           uint32_t ft, fb;
+                           
+                           if (!affine) {
+                               pixman_fixed_48_16_t div;
+                               div = ((pixman_fixed_48_16_t)v.vector[0] << 16)/v.vector[2];
+                               x1 = div >> 16;
+                               distx = ((pixman_fixed_t)div >> 8) & 0xff;
+                               div = ((pixman_fixed_48_16_t)v.vector[1] << 16)/v.vector[2];
+                               y1 = div >> 16;
+                               disty = ((pixman_fixed_t)div >> 8) & 0xff;
+                           } else {
+                               x1 = v.vector[0] >> 16;
+                               distx = (v.vector[0] >> 8) & 0xff;
+                               y1 = v.vector[1] >> 16;
+                               disty = (v.vector[1] >> 8) & 0xff;
+                           }
+                           x2 = x1 + 1;
+                           y2 = y1 + 1;
+                           
+                           idistx = 256 - distx;
+                           idisty = 256 - disty;
+                           
+                           x1 = MOD (x1, pict->width);
+                           x2 = MOD (x2, pict->width);
+                           y1 = MOD (y1, pict->height);
+                           y2 = MOD (y2, pict->height);
+                           
+                           b = bits + y1*stride;
+                           
+                           tl = fetch(b, x1, indexed);
+                           tr = fetch(b, x2, indexed);
+                           b = bits + y2*stride;
+                           bl = fetch(b, x1, indexed);
+                           br = fetch(b, x2, indexed);
+                           
+                           ft = FbGet8(tl,0) * idistx + FbGet8(tr,0) * distx;
+                           fb = FbGet8(bl,0) * idistx + FbGet8(br,0) * distx;
+                           r = (((ft * idisty + fb * disty) >> 16) & 0xff);
+                           ft = FbGet8(tl,8) * idistx + FbGet8(tr,8) * distx;
+                           fb = FbGet8(bl,8) * idistx + FbGet8(br,8) * distx;
+                           r |= (((ft * idisty + fb * disty) >> 8) & 0xff00);
+                           ft = FbGet8(tl,16) * idistx + FbGet8(tr,16) * distx;
+                           fb = FbGet8(bl,16) * idistx + FbGet8(br,16) * distx;
+                           r |= (((ft * idisty + fb * disty)) & 0xff0000);
+                           ft = FbGet8(tl,24) * idistx + FbGet8(tr,24) * distx;
+                           fb = FbGet8(bl,24) * idistx + FbGet8(br,24) * distx;
+                           r |= (((ft * idisty + fb * disty) << 8) & 0xff000000);
+                           WRITE(buffer + i, r);
+                       }
+                   }
+                    v.vector[0] += unit.vector[0];
+                    v.vector[1] += unit.vector[1];
+                    v.vector[2] += unit.vector[2];
+                }
+            } else {
+                for (i = 0; i < width; ++i) {
+                   if (!mask || mask[i] & maskBits)
+                   {
+                       if (!v.vector[2]) {
+                           WRITE(buffer + i, 0);
+                       } else {
+                           int x1, x2, y1, y2, distx, idistx, disty, idisty;
+                           uint32_t *b;
+                           uint32_t tl, tr, bl, br, r;
+                           uint32_t ft, fb;
+                           
+                           if (!affine) {
+                               pixman_fixed_48_16_t div;
+                               div = ((pixman_fixed_48_16_t)v.vector[0] << 16)/v.vector[2];
+                               x1 = div >> 16;
+                               distx = ((pixman_fixed_t)div >> 8) & 0xff;
+                               div = ((pixman_fixed_48_16_t)v.vector[1] << 16)/v.vector[2];
+                               y1 = div >> 16;
+                               disty = ((pixman_fixed_t)div >> 8) & 0xff;
+                           } else {
+                               x1 = v.vector[0] >> 16;
+                               distx = (v.vector[0] >> 8) & 0xff;
+                               y1 = v.vector[1] >> 16;
+                               disty = (v.vector[1] >> 8) & 0xff;
+                           }
+                           x2 = x1 + 1;
+                           y2 = y1 + 1;
+                           
+                           idistx = 256 - distx;
+                           idisty = 256 - disty;
+                           
+                           x1 = MOD (x1, pict->width);
+                           x2 = MOD (x2, pict->width);
+                           y1 = MOD (y1, pict->height);
+                           y2 = MOD (y2, pict->height);
+                           
+                           b = bits + y1*stride;
+                           
+                           tl = pixman_region_contains_point(pict->common.clip_region, x1, y1, &box)
+                               ? fetch(b, x1, indexed) : 0;
+                           tr = pixman_region_contains_point(pict->common.clip_region, x2, y1, &box)
+                               ? fetch(b, x2, indexed) : 0;
+                           b = bits + (y2)*stride;
+                           bl = pixman_region_contains_point(pict->common.clip_region, x1, y2, &box)
+                               ? fetch(b, x1, indexed) : 0;
+                           br = pixman_region_contains_point(pict->common.clip_region, x2, y2, &box)
+                               ? fetch(b, x2, indexed) : 0;
+                           
+                           ft = FbGet8(tl,0) * idistx + FbGet8(tr,0) * distx;
+                           fb = FbGet8(bl,0) * idistx + FbGet8(br,0) * distx;
+                           r = (((ft * idisty + fb * disty) >> 16) & 0xff);
+                           ft = FbGet8(tl,8) * idistx + FbGet8(tr,8) * distx;
+                           fb = FbGet8(bl,8) * idistx + FbGet8(br,8) * distx;
+                           r |= (((ft * idisty + fb * disty) >> 8) & 0xff00);
+                           ft = FbGet8(tl,16) * idistx + FbGet8(tr,16) * distx;
+                           fb = FbGet8(bl,16) * idistx + FbGet8(br,16) * distx;
+                           r |= (((ft * idisty + fb * disty)) & 0xff0000);
+                           ft = FbGet8(tl,24) * idistx + FbGet8(tr,24) * distx;
+                           fb = FbGet8(bl,24) * idistx + FbGet8(br,24) * distx;
+                           r |= (((ft * idisty + fb * disty) << 8) & 0xff000000);
+                           WRITE(buffer + i, r);
+                       }
+                   }
+                   
+                    v.vector[0] += unit.vector[0];
+                    v.vector[1] += unit.vector[1];
+                    v.vector[2] += unit.vector[2];
+                }
+            }
+        } else {
+            if (pixman_region_n_rects(pict->common.clip_region) == 1) {
+                box = pict->common.clip_region->extents;
+                for (i = 0; i < width; ++i) {
+                   if (!mask || mask[i] & maskBits)
+                   {
+                       if (!v.vector[2]) {
+                           WRITE(buffer + i, 0);
+                       } else {
+                           int x1, x2, y1, y2, distx, idistx, disty, idisty, x_off;
+                           uint32_t *b;
+                           uint32_t tl, tr, bl, br, r;
+                           pixman_bool_t x1_out, x2_out, y1_out, y2_out;
+                           uint32_t ft, fb;
+                           
+                           if (!affine) {
+                               pixman_fixed_48_16_t div;
+                               div = ((pixman_fixed_48_16_t)v.vector[0] << 16)/v.vector[2];
+                               x1 = div >> 16;
+                               distx = ((pixman_fixed_t)div >> 8) & 0xff;
+                               div = ((pixman_fixed_48_16_t)v.vector[1] << 16)/v.vector[2];
+                               y1 = div >> 16;
+                               disty = ((pixman_fixed_t)div >> 8) & 0xff;
+                           } else {
+                               x1 = v.vector[0] >> 16;
+                               distx = (v.vector[0] >> 8) & 0xff;
+                               y1 = v.vector[1] >> 16;
+                               disty = (v.vector[1] >> 8) & 0xff;
+                           }
+                           x2 = x1 + 1;
+                           y2 = y1 + 1;
+                           
+                           idistx = 256 - distx;
+                           idisty = 256 - disty;
+                           
+                           b = bits + (y1)*stride;
+                           x_off = x1;
+                           
+                           x1_out = (x1 < box.x1) | (x1 >= box.x2);
+                           x2_out = (x2 < box.x1) | (x2 >= box.x2);
+                           y1_out = (y1 < box.y1) | (y1 >= box.y2);
+                           y2_out = (y2 < box.y1) | (y2 >= box.y2);
+                           
+                           tl = x1_out|y1_out ? 0 : fetch(b, x_off, indexed);
+                           tr = x2_out|y1_out ? 0 : fetch(b, x_off + 1, indexed);
+                           b += stride;
+                           bl = x1_out|y2_out ? 0 : fetch(b, x_off, indexed);
+                           br = x2_out|y2_out ? 0 : fetch(b, x_off + 1, indexed);
+                           
+                           ft = FbGet8(tl,0) * idistx + FbGet8(tr,0) * distx;
+                           fb = FbGet8(bl,0) * idistx + FbGet8(br,0) * distx;
+                           r = (((ft * idisty + fb * disty) >> 16) & 0xff);
+                           ft = FbGet8(tl,8) * idistx + FbGet8(tr,8) * distx;
+                           fb = FbGet8(bl,8) * idistx + FbGet8(br,8) * distx;
+                           r |= (((ft * idisty + fb * disty) >> 8) & 0xff00);
+                           ft = FbGet8(tl,16) * idistx + FbGet8(tr,16) * distx;
+                           fb = FbGet8(bl,16) * idistx + FbGet8(br,16) * distx;
+                           r |= (((ft * idisty + fb * disty)) & 0xff0000);
+                           ft = FbGet8(tl,24) * idistx + FbGet8(tr,24) * distx;
+                           fb = FbGet8(bl,24) * idistx + FbGet8(br,24) * distx;
+                           r |= (((ft * idisty + fb * disty) << 8) & 0xff000000);
+                           WRITE(buffer + i, r);
+                       }
+                   }
+                   
+                    v.vector[0] += unit.vector[0];
+                    v.vector[1] += unit.vector[1];
+                    v.vector[2] += unit.vector[2];
+                }
+            } else {
+                for (i = 0; i < width; ++i) {
+                    if (!mask || mask[i] & maskBits)
+                   {
+                       if (!v.vector[2]) {
+                           WRITE(buffer + i, 0);
+                       } else {
+                           int x1, x2, y1, y2, distx, idistx, disty, idisty, x_off;
+                           uint32_t *b;
+                           uint32_t tl, tr, bl, br, r;
+                           uint32_t ft, fb;
+                           
+                           if (!affine) {
+                               pixman_fixed_48_16_t div;
+                               div = ((pixman_fixed_48_16_t)v.vector[0] << 16)/v.vector[2];
+                               x1 = div >> 16;
+                               distx = ((pixman_fixed_t)div >> 8) & 0xff;
+                               div = ((pixman_fixed_48_16_t)v.vector[1] << 16)/v.vector[2];
+                               y1 = div >> 16;
+                               disty = ((pixman_fixed_t)div >> 8) & 0xff;
+                           } else {
+                               x1 = v.vector[0] >> 16;
+                               distx = (v.vector[0] >> 8) & 0xff;
+                               y1 = v.vector[1] >> 16;
+                               disty = (v.vector[1] >> 8) & 0xff;
+                           }
+                           x2 = x1 + 1;
+                           y2 = y1 + 1;
+                           
+                           idistx = 256 - distx;
+                           idisty = 256 - disty;
+                           
+                           b = bits + (y1)*stride;
+                           x_off = x1;
+                           
+                           tl = pixman_region_contains_point(pict->common.clip_region, x1, y1, &box)
+                               ? fetch(b, x_off, indexed) : 0;
+                           tr = pixman_region_contains_point(pict->common.clip_region, x2, y1, &box)
+                               ? fetch(b, x_off + 1, indexed) : 0;
+                           b += stride;
+                           bl = pixman_region_contains_point(pict->common.clip_region, x1, y2, &box)
+                               ? fetch(b, x_off, indexed) : 0;
+                           br = pixman_region_contains_point(pict->common.clip_region, x2, y2, &box)
+                               ? fetch(b, x_off + 1, indexed) : 0;
+                           
+                           ft = FbGet8(tl,0) * idistx + FbGet8(tr,0) * distx;
+                           fb = FbGet8(bl,0) * idistx + FbGet8(br,0) * distx;
+                           r = (((ft * idisty + fb * disty) >> 16) & 0xff);
+                           ft = FbGet8(tl,8) * idistx + FbGet8(tr,8) * distx;
+                           fb = FbGet8(bl,8) * idistx + FbGet8(br,8) * distx;
+                           r |= (((ft * idisty + fb * disty) >> 8) & 0xff00);
+                           ft = FbGet8(tl,16) * idistx + FbGet8(tr,16) * distx;
+                           fb = FbGet8(bl,16) * idistx + FbGet8(br,16) * distx;
+                           r |= (((ft * idisty + fb * disty)) & 0xff0000);
+                           ft = FbGet8(tl,24) * idistx + FbGet8(tr,24) * distx;
+                           fb = FbGet8(bl,24) * idistx + FbGet8(br,24) * distx;
+                           r |= (((ft * idisty + fb * disty) << 8) & 0xff000000);
+                           WRITE(buffer + i, r);
+                       }
+                   }
+                   
+                    v.vector[0] += unit.vector[0];
+                    v.vector[1] += unit.vector[1];
+                    v.vector[2] += unit.vector[2];
+                }
+            }
+        }
+    } else if (pict->common.filter == PIXMAN_FILTER_CONVOLUTION) {
+        pixman_fixed_t *params = pict->common.filter_params;
+        int32_t cwidth = pixman_fixed_to_int(params[0]);
+        int32_t cheight = pixman_fixed_to_int(params[1]);
+        int xoff = (params[0] - pixman_fixed_1) >> 1;
+       int yoff = (params[1] - pixman_fixed_1) >> 1;
+        params += 2;
+        for (i = 0; i < width; ++i) {
+           if (!mask || mask[i] & maskBits)
+           {
+               if (!v.vector[2]) {
+                   WRITE(buffer + i, 0);
+               } else {
+                   int x1, x2, y1, y2, x, y;
+                   int32_t srtot, sgtot, sbtot, satot;
+                   pixman_fixed_t *p = params;
+                   
+                   if (!affine) {
+                       pixman_fixed_48_16_t tmp;
+                       tmp = ((pixman_fixed_48_16_t)v.vector[0] << 16)/v.vector[2] - xoff;
+                       x1 = pixman_fixed_to_int(tmp);
+                       tmp = ((pixman_fixed_48_16_t)v.vector[1] << 16)/v.vector[2] - yoff;
+                       y1 = pixman_fixed_to_int(tmp);
+                   } else {
+                       x1 = pixman_fixed_to_int(v.vector[0] - xoff);
+                       y1 = pixman_fixed_to_int(v.vector[1] - yoff);
+                   }
+                   x2 = x1 + cwidth;
+                   y2 = y1 + cheight;
+                   
+                   srtot = sgtot = sbtot = satot = 0;
+                   
+                   for (y = y1; y < y2; y++) {
+                       int ty = (pict->common.repeat == PIXMAN_REPEAT_NORMAL) ? MOD (y, pict->height) : y;
+                       for (x = x1; x < x2; x++) {
+                           if (*p) {
+                               int tx = (pict->common.repeat == PIXMAN_REPEAT_NORMAL) ? MOD (x, pict->width) : x;
+                               if (pixman_region_contains_point (pict->common.clip_region, tx, ty, &box)) {
+                                   uint32_t *b = bits + (ty)*stride;
+                                   uint32_t c = fetch(b, tx, indexed);
+                                   
+                                   srtot += Red(c) * *p;
+                                   sgtot += Green(c) * *p;
+                                   sbtot += Blue(c) * *p;
+                                   satot += Alpha(c) * *p;
+                               }
+                           }
+                           p++;
+                       }
+                   }
+                   
+                   satot >>= 16;
+                   srtot >>= 16;
+                   sgtot >>= 16;
+                   sbtot >>= 16;
+                   
+                   if (satot < 0) satot = 0; else if (satot > 0xff) satot = 0xff;
+                   if (srtot < 0) srtot = 0; else if (srtot > 0xff) srtot = 0xff;
+                   if (sgtot < 0) sgtot = 0; else if (sgtot > 0xff) sgtot = 0xff;
+                   if (sbtot < 0) sbtot = 0; else if (sbtot > 0xff) sbtot = 0xff;
+                   
+                   WRITE(buffer + i, ((satot << 24) |
+                                      (srtot << 16) |
+                                      (sgtot <<  8) |
+                                      (sbtot       )));
+               }
+           }
+            v.vector[0] += unit.vector[0];
+            v.vector[1] += unit.vector[1];
+            v.vector[2] += unit.vector[2];
+        }
+    }
+    
+    fbFinishAccess (pict->pDrawable);
+}
+
+
+/*
+ * Fetch one scanline of 'pict' into 'buffer' and combine it with the
+ * picture's external alpha map.
+ *
+ * pict      image to read; when pict->common.alpha_map is NULL this is
+ *           exactly fbFetchTransformed().
+ * x, y      start of the scanline in 'pict'; the alpha map is sampled at
+ *           (x - alpha_origin.x, y - alpha_origin.y).
+ * width     number of pixels to fetch.
+ * buffer    receives 'width' 32-bit pixels.
+ * mask      optional; when non-NULL, pixel i is only combined if
+ *           (mask[i] & maskBits) is non-zero.
+ *
+ * For every selected pixel the alpha byte is replaced by the top byte of
+ * the alpha-map pixel and each colour channel c becomes div_255(c * a).
+ *
+ * If the temporary alpha scanline cannot be allocated, the selected
+ * pixels are written as 0 instead of dereferencing a NULL pointer.
+ */
+static void fbFetchExternalAlpha(bits_image_t * pict, int x, int y, int width, uint32_t *buffer, uint32_t *mask, uint32_t maskBits)
+{
+    int i;
+    uint32_t _alpha_buffer[SCANLINE_BUFFER_LENGTH];
+    uint32_t *alpha_buffer = _alpha_buffer;
+
+    if (!pict->common.alpha_map) {
+        fbFetchTransformed (pict, x, y, width, buffer, mask, maskBits);
+        return;
+    }
+
+    /* The on-stack scratch line only holds SCANLINE_BUFFER_LENGTH pixels. */
+    if (width > SCANLINE_BUFFER_LENGTH) {
+        alpha_buffer = malloc(width * sizeof *alpha_buffer);
+        if (!alpha_buffer) {
+            /* Out of memory: emit well-defined (zero) pixels and bail out. */
+            for (i = 0; i < width; ++i) {
+                if (!mask || mask[i] & maskBits)
+                    WRITE(buffer + i, 0);
+            }
+            return;
+        }
+    }
+
+    fbFetchTransformed(pict, x, y, width, buffer, mask, maskBits);
+    fbFetchTransformed((bits_image_t *)pict->common.alpha_map, x - pict->common.alpha_origin.x,
+                      y - pict->common.alpha_origin.y, width, alpha_buffer,
+                      mask, maskBits);
+    for (i = 0; i < width; ++i) {
+        if (!mask || mask[i] & maskBits)
+        {
+            /* Unsigned so that (a << 24) cannot overflow a signed int. */
+            uint32_t a = alpha_buffer[i] >> 24;
+            /* Read the destination pixel once rather than once per channel. */
+            uint32_t p = READ(buffer + i);
+
+            WRITE(buffer + i, (a << 24)
+                  | (div_255(Red(p) * a) << 16)
+                  | (div_255(Green(p) * a) << 8)
+                  | (div_255(Blue(p) * a)));
+        }
+    }
+
+    if (alpha_buffer != _alpha_buffer)
+        free(alpha_buffer);
+}
+
+/*
+ * Write 'width' pixels from 'buffer' into row 'y' of 'pict', starting at
+ * column 'x', using the store routine returned by storeProcForPicture().
+ * The row pointer is advanced in uint32_t units (y * pict->rowstride).
+ */
+static void fbStore(bits_image_t * pict, int x, int y, int width, uint32_t *buffer)
+{
+    storeProc write_line = storeProcForPicture(pict);
+    pixman_indexed_t *palette = (pixman_indexed_t *) pict->indexed;
+    uint32_t row_stride = pict->rowstride;
+    uint32_t *row = pict->bits;
+
+    /* Step to the first word of the destination row. */
+    row += y*row_stride;
+
+    write_line(row, buffer, x, width, palette);
+    fbFinishAccess (pict->pDrawable);
+}
+
+/*
+ * Write 'width' pixels from 'buffer' into row 'y' of 'pict' (starting at
+ * column 'x') and also into the picture's external alpha map.
+ *
+ * Without an alpha map this is exactly fbStore().  Otherwise the same
+ * buffer is passed to the picture's own store routine and to the alpha
+ * map's store routine; the alpha map is addressed at
+ * (x - alpha_origin.x, y - alpha_origin.y).
+ */
+static void fbStoreExternalAlpha(bits_image_t * pict, int x, int y, int width, uint32_t *buffer)
+{
+    pixman_indexed_t *color_palette = (pixman_indexed_t *) pict->indexed;
+    pixman_indexed_t *alpha_palette;
+    storeProc put_color;
+    storeProc put_alpha;
+    uint32_t color_stride, alpha_stride;
+    uint32_t *color_row, *alpha_row;
+
+    if (!pict->common.alpha_map) {
+        fbStore(pict, x, y, width, buffer);
+        return;
+    }
+
+    put_color = storeProcForPicture(pict);
+    put_alpha = storeProcForPicture(pict->common.alpha_map);
+    alpha_palette = pict->common.alpha_map->indexed;
+
+    /* Destination row inside the picture itself. */
+    color_row = pict->bits;
+    color_stride = pict->rowstride;
+    color_row += y*color_stride;
+
+    /* Matching row inside the alpha map, shifted by the alpha origin. */
+    alpha_row = pict->common.alpha_map->bits;
+    alpha_stride = pict->common.alpha_map->rowstride;
+    alpha_row += (y - pict->common.alpha_origin.y)*alpha_stride;
+
+    put_color(color_row, buffer, x, width, color_palette);
+    put_alpha(alpha_row, buffer, x - pict->common.alpha_origin.x, width, alpha_palette);
+
+    /* NOTE(review): the alpha map is reached as pict->common.alpha_map
+     * everywhere else in this function, but as pict->alpha_map here --
+     * confirm which spelling fbFinishAccess should receive. */
+    fbFinishAccess (pict->alpha_map->pDrawable);
+    fbFinishAccess (pict->pDrawable);
+}
+
+/* Scanline writer as called by fbCompositeRect:
+ * (image, x, y, width, buffer).  fbStore and fbStoreExternalAlpha are
+ * cast to this type. */
+typedef void (*scanStoreProc)(image_t * , int , int , int , uint32_t *);
+/* Scanline reader as called by fbCompositeRect:
+ * (image, x, y, width, buffer, mask, maskBits).  The fbFetch* routines
+ * are cast to this type. */
+typedef void (*scanFetchProc)(image_t * , int , int , int , uint32_t * , uint32_t *, uint32_t);
+
+/*
+ * Composite data->height scanlines of data->width pixels from data->src,
+ * optionally through data->mask, onto data->dest using operator data->op.
+ *
+ * scanline_buffer is scratch space carved into consecutive regions of
+ * data->width uint32_t each: source first, destination second and, when
+ * a mask is fetched, mask third.
+ *
+ * The function returns without drawing when no combiner is registered
+ * for data->op in composeFunctions.
+ */
+void
+fbCompositeRect (const FbComposeData *data, uint32_t *scanline_buffer)
+{
+    uint32_t *src_buffer = scanline_buffer;
+    uint32_t *dest_buffer = src_buffer + data->width;
+    int i;
+    scanStoreProc store;
+    scanFetchProc fetchSrc = NULL, fetchMask = NULL, fetchDest = NULL;
+    unsigned int srcClass = SOURCE_IMAGE_CLASS_UNKNOWN;
+    unsigned int maskClass = SOURCE_IMAGE_CLASS_UNKNOWN;
+    uint32_t *bits;
+    uint32_t stride;
+    int xoff, yoff;
+    
+    /* Pick the source fetcher.  CLEAR never reads the source. */
+    if (data->op == PIXMAN_OP_CLEAR)
+        fetchSrc = NULL;
+    else if (IS_SOURCE_IMAGE (data->src))
+    {
+       fetchSrc = (scanFetchProc)fbFetchSourcePict;
+       srcClass = SourcePictureClassify ((source_image_t *)data->src,
+                                         data->xSrc, data->ySrc,
+                                         data->width, data->height);
+    }
+    else
+    {
+       /* NOTE(review): this local shadows the function-level
+        * 'uint32_t *bits' declared above. */
+       bits_image_t *bits = (bits_image_t *)data->src;
+       
+       if (bits->common.alpha_map)
+       {
+           fetchSrc = (scanFetchProc)fbFetchExternalAlpha;
+       }
+       /* A 1x1 image with normal repeat goes through fbFetchSolid and is
+        * classified HORIZONTAL, so the row loop fetches it only once. */
+       else if (bits->common.repeat == PIXMAN_REPEAT_NORMAL &&
+                bits->width == 1 &&
+                bits->height == 1)
+       {
+           fetchSrc = (scanFetchProc)fbFetchSolid;
+           srcClass = SOURCE_IMAGE_CLASS_HORIZONTAL;
+       }
+       else if (!bits->common.transform && bits->common.filter != PIXMAN_FILTER_CONVOLUTION)
+       {
+           fetchSrc = (scanFetchProc)fbFetch;
+       }
+       else
+       {
+           fetchSrc = (scanFetchProc)fbFetchTransformed;
+       }
+    }
+
+    /* Pick the mask fetcher with the same rules; no mask is read for CLEAR. */
+    if (!data->mask || data->op == PIXMAN_OP_CLEAR)
+    {
+       fetchMask = NULL;
+    }
+    else
+    {
+       if (IS_SOURCE_IMAGE (data->mask))
+       {
+           fetchMask = (scanFetchProc)fbFetchSourcePict;
+       }
+       else
+       {
+           bits_image_t *bits = (bits_image_t *)data->mask;
+           
+           if (bits->common.alpha_map)
+           {
+               fetchMask = (scanFetchProc)fbFetchExternalAlpha;
+               /* FIXME: this looks highly suspicious. Why would we
+                * expect the mask to be a source picture here? In fact
+                * we _know_ it's not a source picture.
+                *
+                * That's why it's commented out. This will result in
+                * the classification of "unknown" which should be
+                * correct.
+                */
+#if 0
+               maskClass = SourcePictureClassify (data->mask,
+                                                  data->xMask, data->yMask,
+                                                  data->width, data->height);
+#endif
+           }
+           else if (bits->common.repeat == PIXMAN_REPEAT_NORMAL &&
+                    bits->width == 1 && bits->height == 1)
+           {
+               fetchMask = (scanFetchProc)fbFetchSolid;
+               maskClass = SOURCE_IMAGE_CLASS_HORIZONTAL;
+           }
+           else if (!bits->common.transform && bits->common.filter != PIXMAN_FILTER_CONVOLUTION)
+               fetchMask = (scanFetchProc)fbFetch;
+           else
+               fetchMask = (scanFetchProc)fbFetchTransformed;
+       }
+    }
+    
+    /* Pick destination fetch/store.  CLEAR and SRC never read the
+     * destination, so fetchDest is dropped for them. */
+    if (data->dest->common.alpha_map)
+    {
+       fetchDest = (scanFetchProc)fbFetchExternalAlpha;
+       store = (scanStoreProc)fbStoreExternalAlpha;
+       
+       if (data->op == PIXMAN_OP_CLEAR || data->op == PIXMAN_OP_SRC)
+           fetchDest = NULL;
+    }
+    else
+    {
+       fetchDest = (scanFetchProc)fbFetch;
+       store = (scanStoreProc)fbStore;
+       
+       switch (data->op)
+       {
+       case PIXMAN_OP_CLEAR:
+       case PIXMAN_OP_SRC:
+           fetchDest = NULL;
+           /* fall-through */
+       case PIXMAN_OP_ADD:
+       case PIXMAN_OP_OVER:
+           /* For these operators on a8r8g8b8/x8r8g8b8 destinations the
+            * store routine is dropped; the combiner then writes straight
+            * into the destination rows (see the !store branches below). */
+           switch (data->dest->bits.format) {
+           case PIXMAN_a8r8g8b8:
+           case PIXMAN_x8r8g8b8:
+               store = NULL;
+               break;
+           default:
+               break;
+           }
+           break;
+       }
+    }
+    
+    /* Direct writes need the destination's pixel pointer and stride. */
+    if (!store)
+    {
+#if 0
+       int bpp;
+       
+#if 0
+       fbGetDrawable (data->dest->pDrawable, bits, stride, bpp, xoff, yoff);
+#endif
+#endif
+       bits = data->dest->bits.bits;
+       stride = data->dest->bits.rowstride;
+       xoff = yoff = 0;
+    }
+    else
+    {
+       bits = NULL;
+       stride = 0;
+       xoff = yoff = 0;
+    }
+    
+    /* Component-alpha path: a BITS mask with component_alpha set and an
+     * RGB format is handed to the three-operand combiner combineC[op]
+     * as a separate buffer. */
+    if (fetchSrc                  &&
+       fetchMask                  &&
+       data->mask                 &&
+       data->mask->common.type == BITS && 
+       data->mask->common.component_alpha &&
+       PIXMAN_FORMAT_RGB (data->mask->bits.format))
+    {
+       uint32_t *mask_buffer = dest_buffer + data->width;
+       CombineFuncC compose = composeFunctions.combineC[data->op];
+       if (!compose)
+           return;
+       
+       for (i = 0; i < data->height; ++i) {
+           /* fill first half of scanline with source */
+           if (fetchSrc)
+           {
+               if (fetchMask)
+               {
+                   /* fetch mask before source so that fetching of
+                      source can be optimized */
+                   fetchMask (data->mask, data->xMask, data->yMask + i,
+                              data->width, mask_buffer, 0, 0);
+                   
+                   /* A HORIZONTAL-class mask is fetched on the first
+                    * row only; mask_buffer is reused afterwards. */
+                   if (maskClass == SOURCE_IMAGE_CLASS_HORIZONTAL)
+                       fetchMask = NULL;
+               }
+               
+               if (srcClass == SOURCE_IMAGE_CLASS_HORIZONTAL)
+               {
+                   /* Fetched once, without a mask, because src_buffer
+                    * is reused for every following row. */
+                   fetchSrc (data->src, data->xSrc, data->ySrc + i,
+                             data->width, src_buffer, 0, 0);
+                   fetchSrc = NULL;
+               }
+               else
+               {
+                   /* Hand the mask to the fetcher together with the
+                    * bits (all four channels) that select a pixel. */
+                   fetchSrc (data->src, data->xSrc, data->ySrc + i,
+                             data->width, src_buffer, mask_buffer,
+                             0xffffffff);
+               }
+           }
+           /* Source already cached in src_buffer; only the mask still
+            * changes from row to row. */
+           else if (fetchMask)
+           {
+               fetchMask (data->mask, data->xMask, data->yMask + i,
+                          data->width, mask_buffer, 0, 0);
+           }
+           
+           if (store)
+           {
+               /* fill dest into second half of scanline */
+               if (fetchDest)
+                   fetchDest (data->dest, data->xDest, data->yDest + i,
+                              data->width, dest_buffer, 0, 0);
+               
+               /* blend */
+               compose (dest_buffer, src_buffer, mask_buffer, data->width);
+               
+               /* write back */
+               store (data->dest, data->xDest, data->yDest + i, data->width,
+                      dest_buffer);
+           }
+           else
+           {
+               /* blend */
+               compose (bits + (data->yDest + i+ yoff) * stride +
+                        data->xDest + xoff,
+                        src_buffer, mask_buffer, data->width);
+           }
+       }
+    }
+    /* Unified-alpha path: the mask (if any) is folded into the source
+     * first, then the two-operand combiner combineU[op] is applied.
+     * src_mask_buffer names whichever buffer holds the folded result. */
+    else
+    {
+       uint32_t *src_mask_buffer = 0, *mask_buffer = 0;
+       CombineFuncU compose = composeFunctions.combineU[data->op];
+       if (!compose)
+           return;
+       
+       if (fetchMask)
+           mask_buffer = dest_buffer + data->width;
+       
+       for (i = 0; i < data->height; ++i) {
+           /* fill first half of scanline with source */
+           if (fetchSrc)
+           {
+               if (fetchMask)
+               {
+                   /* fetch mask before source so that fetching of
+                      source can be optimized */
+                   fetchMask (data->mask, data->xMask, data->yMask + i,
+                              data->width, mask_buffer, 0, 0);
+                   
+                   if (maskClass == SOURCE_IMAGE_CLASS_HORIZONTAL)
+                       fetchMask = NULL;
+               }
+               
+               if (srcClass == SOURCE_IMAGE_CLASS_HORIZONTAL)
+               {
+                   fetchSrc (data->src, data->xSrc, data->ySrc + i,
+                             data->width, src_buffer, 0, 0);
+                   
+                   /* src_buffer must stay untouched for later rows, so
+                    * the combination is written into mask_buffer.
+                    * NOTE(review): presumably fbCombineInU leaves
+                    * "src IN mask" in its first argument -- confirm. */
+                   if (mask_buffer)
+                   {
+                       fbCombineInU (mask_buffer, src_buffer, data->width);
+                       src_mask_buffer = mask_buffer;
+                   }
+                   else
+                       src_mask_buffer = src_buffer;
+                   
+                   fetchSrc = NULL;
+               }
+               else
+               {
+                   /* Only the mask's alpha byte selects which source
+                    * pixels the fetcher has to produce. */
+                   fetchSrc (data->src, data->xSrc, data->ySrc + i,
+                             data->width, src_buffer, mask_buffer,
+                             0xff000000);
+                   
+                   if (mask_buffer)
+                       composeFunctions.combineMaskU (src_buffer,
+                                                      mask_buffer,
+                                                      data->width);
+                   
+                   src_mask_buffer = src_buffer;
+               }
+           }
+           /* Source was cached on an earlier row; refetch the mask and
+            * combine it with the cached src_buffer again. */
+           else if (fetchMask)
+           {
+               fetchMask (data->mask, data->xMask, data->yMask + i,
+                          data->width, mask_buffer, 0, 0);
+               
+               fbCombineInU (mask_buffer, src_buffer, data->width);
+               
+               src_mask_buffer = mask_buffer;
+           }
+           
+           if (store)
+           {
+               /* fill dest into second half of scanline */
+               if (fetchDest)
+                   fetchDest (data->dest, data->xDest, data->yDest + i,
+                              data->width, dest_buffer, 0, 0);
+               
+               /* blend */
+               compose (dest_buffer, src_mask_buffer, data->width);
+               
+               /* write back */
+               store (data->dest, data->xDest, data->yDest + i, data->width,
+                      dest_buffer);
+           }
+           else
+           {
+               /* blend */
+               compose (bits + (data->yDest + i+ yoff) * stride +
+                        data->xDest + xoff,
+                        src_mask_buffer, data->width);
+           }
+       }
+    }
+    
+    /* The store routines call fbFinishAccess themselves; the direct-write
+     * path has to do it here. */
+    if (!store)
+       fbFinishAccess (data->dest->pDrawable);
+}
index 8d82261..ceed94a 100644 (file)
@@ -33,8 +33,22 @@ enum
 };
 
 static void
+init_common (image_common_t *common)
+{
+    common->transform = NULL;
+    common->clip_region = NULL;
+    common->repeat = PIXMAN_REPEAT_NONE;
+    common->filter = PIXMAN_FILTER_NEAREST;
+    common->filter_params = NULL;
+    common->filter_nparams = 0;
+    common->alpha_map = NULL;
+    common->component_alpha = FALSE;
+}
+
+static void
 init_source_image (source_image_t *image)
 {
+    init_common (&image->common);
     image->class = SOURCE_IMAGE_CLASS_UNKNOWN;
 }
 
@@ -103,6 +117,9 @@ pixman_image_init_solid_fill (pixman_image_t *image,
                              int            *error)
 {
     image_t *priv = (image_t *)image;
+    
+    init_source_image (&priv->solid.common);
+    
     priv->type = SOLID;
     priv->solid.color = color_to_uint32 (color);
 }
@@ -202,17 +219,25 @@ pixman_image_init_bits (pixman_image_t         *image,
                        pixman_format_code_t    format,
                        int                     width,
                        int                     height,
-                       uint8_t                *bits,
+                       uint32_t               *bits,
                        int                     rowstride)
 {
     image_t *img = (image_t *)image;
 
+    init_common (&img->common);
+
+    if (rowstride & 0x3)
+    {
+       /* we should probably spew some warning here */
+    }
+    
     img->type = BITS;
     img->bits.format = format;
     img->bits.width = width;
     img->bits.height = height;
     img->bits.bits = bits;
-    img->bits.rowstride = rowstride;
+    img->bits.rowstride = rowstride / 4; /* we store it in number of uint32_t's */
+    img->bits.indexed = NULL;
 }
 
 void
index e290e51..74a95c7 100644 (file)
@@ -8,7 +8,29 @@
 #define TRUE 1
 #endif
 
+/* FIXME - the types and structures below should be given proper names
+ */
+
+#define FASTCALL
+typedef FASTCALL void (*CombineMaskU) (uint32_t *src, const uint32_t *mask, int width);
+typedef FASTCALL void (*CombineFuncU) (uint32_t *dest, const uint32_t *src, int width);
+typedef FASTCALL void (*CombineFuncC) (uint32_t *dest, uint32_t *src, uint32_t *mask, int width);
+
+typedef struct _FbComposeFunctions {
+    CombineFuncU *combineU;
+    CombineFuncC *combineC;
+    CombineMaskU combineMaskU;
+} FbComposeFunctions;
+
+
+#define fbGetDrawable 
+
+
+/* end */
+
+
 typedef union  image image_t;
+typedef struct image_common image_common_t;
 typedef struct source_image source_image_t;
 typedef struct solid_fill solid_fill_t;
 typedef struct gradient gradient_t;
@@ -20,6 +42,7 @@ typedef struct radial_gradient radial_gradient_t;
 typedef struct bits_image bits_image_t;
 typedef struct gradient_stop gradient_stop_t;
 typedef struct circle circle_t;
+typedef struct point point_t;
 
 typedef enum
 {
@@ -30,11 +53,7 @@ typedef enum
     SOLID
 } image_type_t;
 
-struct gradient_stop
-{
-    pixman_fixed_t x;
-    pixman_color_t color;
-};
+#define IS_SOURCE_IMAGE(img)     (((image_common_t *)(img))->type > BITS) /* arg parenthesized so the cast applies to the whole expression */
 
 typedef enum
 {
@@ -43,9 +62,34 @@ typedef enum
     SOURCE_IMAGE_CLASS_VERTICAL
 } source_pict_class_t;
 
-struct source_image
+struct point
+{
+    int16_t x, y;
+};
+
+struct image_common
 {
     image_type_t       type;
+    pixman_transform_t *transform;
+    pixman_region16_t  *clip_region;
+    pixman_repeat_t    repeat;
+    pixman_filter_t    filter;
+    pixman_fixed_t     *filter_params;
+    int                        filter_nparams;
+    bits_image_t       *alpha_map;
+    point_t            alpha_origin;
+    pixman_bool_t      component_alpha;
+};
+
+struct gradient_stop
+{
+    pixman_fixed_t x;
+    pixman_color_t color;
+};
+
+struct source_image
+{
+    image_common_t     common;
     unsigned int       class;          /* FIXME: should be an enum */
 };
 
@@ -100,17 +144,19 @@ struct conical_gradient
 
 struct bits_image
 {
-    image_type_t               type;
+    image_common_t             common;
     pixman_format_code_t       format;
+    pixman_indexed_t          *indexed;
     int                                width;
     int                                height;
-    uint8_t *                  bits;
+    uint32_t *                 bits;
-    int                                rowstride; /* in bytes */
+    int                                rowstride; /* in number of uint32_t's, not bytes */
 };
 
 union image
 {
     image_type_t               type;
+    image_common_t             common;
     bits_image_t               bits;
     linear_gradient_t          linear;
     conical_gradient_t         conical;
index 23a0397..ec4da71 100644 (file)
@@ -341,7 +341,7 @@ pixman_region_fini (pixman_region16_t *region)
 }
 
 int
-pixman_region_num_rects (pixman_region16_t *region)
+pixman_region_n_rects (pixman_region16_t *region)
 {
     return PIXREGION_NUM_RECTS (region);
 }
@@ -1973,7 +1973,7 @@ pixman_region_inverse(pixman_region16_t *           newReg,       /* Destination region
  *   that doesn't overlap the box at all and partIn is false)
  */
 
-int
+pixman_region_overlap_t
 pixman_region_contains_rectangle(pixman_region16_t *  region,
                                 pixman_box16_t *     prect)
 {
index 78d9b4c..b9179e5 100644 (file)
@@ -112,7 +112,7 @@ typedef pixman_fixed_16_16_t        pixman_fixed_t;
 #define pixman_fixed_1                 (pixman_int_to_fixed(1))
 #define pixman_fixed_1_minus_e         (pixman_fixed_1 - pixman_fixed_e)
 #define pixman_fixed_to_int(f)         ((int) ((f) >> 16))
-#define pixman_int_to_fixed(i)         ((pixman_fixed) ((i) << 16))
+#define pixman_int_to_fixed(i)         ((pixman_fixed_t) ((i) << 16))
 #define pixman_fixed_to_double(f)      (double) ((f) / (double) pixman_fixed_1)
 #define pixman_fixed_frac(f)           ((f) & pixman_fixed_1_minus_e)
 #define pixman_fixed_floor(f)          ((f) & ~pixman_fixed_1_minus_e)
@@ -126,6 +126,8 @@ typedef pixman_fixed_16_16_t        pixman_fixed_t;
 typedef struct pixman_color pixman_color_t;
 typedef struct pixman_point_fixed pixman_point_fixed_t;
 typedef struct pixman_line_fixed pixman_line_fixed_t;
+typedef struct pixman_vector pixman_vector_t;
+typedef struct pixman_transform pixman_transform_t;
 
 struct pixman_color
 {
@@ -146,6 +148,53 @@ struct pixman_line_fixed
     pixman_point_fixed_t       p1, p2;
 };
 
+struct pixman_vector
+{
+    pixman_fixed_t     vector[3];
+};
+
+struct pixman_transform
+{
+    pixman_fixed_t     matrix[3][3];
+};
+
+/* Don't blame me, blame XRender */
+typedef enum
+{
+    PIXMAN_REPEAT_NONE,
+    PIXMAN_REPEAT_NORMAL,
+    PIXMAN_REPEAT_PAD,
+    PIXMAN_REPEAT_REFLECT
+} pixman_repeat_t;
+
+typedef enum
+{
+    PIXMAN_FILTER_FAST,
+    PIXMAN_FILTER_GOOD,
+    PIXMAN_FILTER_BEST,
+    PIXMAN_FILTER_NEAREST,
+    PIXMAN_FILTER_BILINEAR,
+    PIXMAN_FILTER_CONVOLUTION
+} pixman_filter_t;
+
+typedef enum
+{
+    PIXMAN_OP_CLEAR,
+    PIXMAN_OP_SRC,
+    PIXMAN_OP_DST,
+    PIXMAN_OP_OVER,
+    PIXMAN_OP_OVER_REVERSE,
+    PIXMAN_OP_IN,
+    PIXMAN_OP_IN_REVERSE,
+    PIXMAN_OP_OUT,
+    PIXMAN_OP_OUT_REVERSE,
+    PIXMAN_OP_ATOP,
+    PIXMAN_OP_ATOP_REVERSE,
+    PIXMAN_OP_XOR,
+    PIXMAN_OP_ADD,
+    PIXMAN_OP_SATURATE
+} pixman_op_t;
+
 /*
  * Regions
  */
@@ -172,45 +221,66 @@ typedef enum
 } pixman_region_overlap_t;
 
 /* creation/destruction */
-void pixman_region_init              (pixman_region16_t *region);
-void pixman_region_init_rect         (pixman_region16_t *region,
-                                     int                x,
-                                     int                y,
-                                     unsigned int       width,
-                                     unsigned int       height);
-void pixman_region_init_with_extents (pixman_region16_t *region,
-                                     pixman_box16_t    *extents);
-void pixman_region_fini              (pixman_region16_t *region);
+void                    pixman_region_init              (pixman_region16_t *region);
+void                    pixman_region_init_rect         (pixman_region16_t *region,
+                                                        int                x,
+                                                        int                y,
+                                                        unsigned int       width,
+                                                        unsigned int       height);
+void                    pixman_region_init_with_extents (pixman_region16_t *region,
+                                                        pixman_box16_t    *extents);
+void                    pixman_region_fini              (pixman_region16_t *region);
 
 /* manipulation */
-void          pixman_region_translate  (pixman_region16_t *region,
-                                       int                x,
-                                       int                y);
-pixman_bool_t pixman_region_copy       (pixman_region16_t *dest,
-                                       pixman_region16_t *source);
-pixman_bool_t pixman_region_intersect  (pixman_region16_t *newReg,
-                                       pixman_region16_t *reg1,
-                                       pixman_region16_t *reg2);
-pixman_bool_t pixman_region_union      (pixman_region16_t *newReg,
-                                       pixman_region16_t *reg1,
-                                       pixman_region16_t *reg2);
-pixman_bool_t pixman_region_union_rect (pixman_region16_t *dest,
-                                       pixman_region16_t *source,
-                                       int                x,
-                                       int                y,
-                                       unsigned int       width,
-                                       unsigned int       height);
-pixman_bool_t pixman_region_subtract   (pixman_region16_t *regD,
-                                       pixman_region16_t *regM,
-                                       pixman_region16_t *regS);
-pixman_bool_t pixman_region_inverse    (pixman_region16_t *newReg,
-                                       pixman_region16_t *reg1,
-                                       pixman_box16_t    *invRect);
-
+void                    pixman_region_translate  (pixman_region16_t *region,
+                                                 int                x,
+                                                 int                y);
+pixman_bool_t           pixman_region_copy       (pixman_region16_t *dest,
+                                                 pixman_region16_t *source);
+pixman_bool_t           pixman_region_intersect  (pixman_region16_t *newReg,
+                                                 pixman_region16_t *reg1,
+                                                 pixman_region16_t *reg2);
+pixman_bool_t           pixman_region_union      (pixman_region16_t *newReg,
+                                                 pixman_region16_t *reg1,
+                                                 pixman_region16_t *reg2);
+pixman_bool_t           pixman_region_union_rect (pixman_region16_t *dest,
+                                                 pixman_region16_t *source,
+                                                 int                x,
+                                                 int                y,
+                                                 unsigned int       width,
+                                                 unsigned int       height);
+pixman_bool_t           pixman_region_subtract   (pixman_region16_t *regD,
+                                                 pixman_region16_t *regM,
+                                                 pixman_region16_t *regS);
+pixman_bool_t           pixman_region_inverse    (pixman_region16_t *newReg,
+                                                 pixman_region16_t *reg1,
+                                                 pixman_box16_t    *invRect);
+pixman_bool_t           pixman_region_contains_point (pixman_region16_t *region, int x, int y, pixman_box16_t *box);
+pixman_region_overlap_t pixman_region_contains_rectangle (pixman_region16_t *region, pixman_box16_t *prect);
+pixman_bool_t           pixman_region_not_empty (pixman_region16_t *region);
+pixman_box16_t *        pixman_region_extents (pixman_region16_t *region);
+int                     pixman_region_n_rects (pixman_region16_t *region);
+const pixman_box16_t *  pixman_region_rectangles (pixman_region16_t *region,
+                                                 int               *n_rects);
 
 /*
  * Images
  */
+typedef struct pixman_indexed  pixman_indexed_t;
+
+#define PIXMAN_MAX_INDEXED  256 /* XXX depth must be <= 8 */
+
+#if PIXMAN_MAX_INDEXED <= 256
+typedef uint8_t pixman_index_type;
+#endif
+
+struct pixman_indexed
+{
+    pixman_bool_t       color;
+    uint32_t           rgba[PIXMAN_MAX_INDEXED];
+    pixman_index_type  ent[32768];
+};
+
 /*
  * While the protocol is generous in format support, the
  * sample implementation allows only packed RGB and GBR
@@ -218,9 +288,9 @@ pixman_bool_t pixman_region_inverse    (pixman_region16_t *newReg,
  */
 #define PIXMAN_FORMAT(bpp,type,a,r,g,b)        (((bpp) << 24) |  \
                                         ((type) << 16) | \
-                                        ((a) << 12) | \
-                                        ((r) << 8) | \
-                                        ((g) << 4) | \
+                                        ((a) << 12) |    \
+                                        ((r) << 8) |     \
+                                        ((g) << 4) |     \
                                         ((b)))
 
 #define PIXMAN_FORMAT_BPP(f)   (((f) >> 24)       )
@@ -243,57 +313,57 @@ pixman_bool_t pixman_region_inverse    (pixman_region16_t *newReg,
 
 /* 32bpp formats */
 typedef enum {
-   PIXMAN_a8r8g8b8 =   PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB,8,8,8,8),
-   PIXMAN_x8r8g8b8 =   PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB,0,8,8,8),
-   PIXMAN_a8b8g8r8 =   PIXMAN_FORMAT(32,PIXMAN_TYPE_ABGR,8,8,8,8),
-   PIXMAN_x8b8g8r8 =   PIXMAN_FORMAT(32,PIXMAN_TYPE_ABGR,0,8,8,8),
-
+    PIXMAN_a8r8g8b8 =  PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB,8,8,8,8),
+    PIXMAN_x8r8g8b8 =  PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB,0,8,8,8),
+    PIXMAN_a8b8g8r8 =  PIXMAN_FORMAT(32,PIXMAN_TYPE_ABGR,8,8,8,8),
+    PIXMAN_x8b8g8r8 =  PIXMAN_FORMAT(32,PIXMAN_TYPE_ABGR,0,8,8,8),
+    
 /* 24bpp formats */
-   PIXMAN_r8g8b8 =     PIXMAN_FORMAT(24,PIXMAN_TYPE_ARGB,0,8,8,8),
-   PIXMAN_b8g8r8 =     PIXMAN_FORMAT(24,PIXMAN_TYPE_ABGR,0,8,8,8),
-
+    PIXMAN_r8g8b8 =    PIXMAN_FORMAT(24,PIXMAN_TYPE_ARGB,0,8,8,8),
+    PIXMAN_b8g8r8 =    PIXMAN_FORMAT(24,PIXMAN_TYPE_ABGR,0,8,8,8),
+    
 /* 16bpp formats */
-   PIXMAN_r5g6b5 =     PIXMAN_FORMAT(16,PIXMAN_TYPE_ARGB,0,5,6,5),
-   PIXMAN_b5g6r5 =     PIXMAN_FORMAT(16,PIXMAN_TYPE_ABGR,0,5,6,5),
-
-   PIXMAN_a1r5g5b5 =   PIXMAN_FORMAT(16,PIXMAN_TYPE_ARGB,1,5,5,5),
-   PIXMAN_x1r5g5b5 =   PIXMAN_FORMAT(16,PIXMAN_TYPE_ARGB,0,5,5,5),
-   PIXMAN_a1b5g5r5 =   PIXMAN_FORMAT(16,PIXMAN_TYPE_ABGR,1,5,5,5),
-   PIXMAN_x1b5g5r5 =   PIXMAN_FORMAT(16,PIXMAN_TYPE_ABGR,0,5,5,5),
-   PIXMAN_a4r4g4b4 =   PIXMAN_FORMAT(16,PIXMAN_TYPE_ARGB,4,4,4,4),
-   PIXMAN_x4r4g4b4 =   PIXMAN_FORMAT(16,PIXMAN_TYPE_ARGB,0,4,4,4),
-   PIXMAN_a4b4g4r4 =   PIXMAN_FORMAT(16,PIXMAN_TYPE_ABGR,4,4,4,4),
-   PIXMAN_x4b4g4r4 =   PIXMAN_FORMAT(16,PIXMAN_TYPE_ABGR,0,4,4,4),
-
+    PIXMAN_r5g6b5 =    PIXMAN_FORMAT(16,PIXMAN_TYPE_ARGB,0,5,6,5),
+    PIXMAN_b5g6r5 =    PIXMAN_FORMAT(16,PIXMAN_TYPE_ABGR,0,5,6,5),
+    
+    PIXMAN_a1r5g5b5 =  PIXMAN_FORMAT(16,PIXMAN_TYPE_ARGB,1,5,5,5),
+    PIXMAN_x1r5g5b5 =  PIXMAN_FORMAT(16,PIXMAN_TYPE_ARGB,0,5,5,5),
+    PIXMAN_a1b5g5r5 =  PIXMAN_FORMAT(16,PIXMAN_TYPE_ABGR,1,5,5,5),
+    PIXMAN_x1b5g5r5 =  PIXMAN_FORMAT(16,PIXMAN_TYPE_ABGR,0,5,5,5),
+    PIXMAN_a4r4g4b4 =  PIXMAN_FORMAT(16,PIXMAN_TYPE_ARGB,4,4,4,4),
+    PIXMAN_x4r4g4b4 =  PIXMAN_FORMAT(16,PIXMAN_TYPE_ARGB,0,4,4,4),
+    PIXMAN_a4b4g4r4 =  PIXMAN_FORMAT(16,PIXMAN_TYPE_ABGR,4,4,4,4),
+    PIXMAN_x4b4g4r4 =  PIXMAN_FORMAT(16,PIXMAN_TYPE_ABGR,0,4,4,4),
+    
 /* 8bpp formats */
-   PIXMAN_a8 =         PIXMAN_FORMAT(8,PIXMAN_TYPE_A,8,0,0,0),
-   PIXMAN_r3g3b2 =     PIXMAN_FORMAT(8,PIXMAN_TYPE_ARGB,0,3,3,2),
-   PIXMAN_b2g3r3 =     PIXMAN_FORMAT(8,PIXMAN_TYPE_ABGR,0,3,3,2),
-   PIXMAN_a2r2g2b2 =   PIXMAN_FORMAT(8,PIXMAN_TYPE_ARGB,2,2,2,2),
-   PIXMAN_a2b2g2r2 =   PIXMAN_FORMAT(8,PIXMAN_TYPE_ABGR,2,2,2,2),
-
-   PIXMAN_c8 =         PIXMAN_FORMAT(8,PIXMAN_TYPE_COLOR,0,0,0,0),
-   PIXMAN_g8 =         PIXMAN_FORMAT(8,PIXMAN_TYPE_GRAY,0,0,0,0),
-
-   PIXMAN_x4a4 =       PIXMAN_FORMAT(8,PIXMAN_TYPE_A,4,0,0,0),
-                                   
-   PIXMAN_x4c4 =       PIXMAN_FORMAT(8,PIXMAN_TYPE_COLOR,0,0,0,0),
-   PIXMAN_x4g4 =       PIXMAN_FORMAT(8,PIXMAN_TYPE_GRAY,0,0,0,0),
-
+    PIXMAN_a8 =                PIXMAN_FORMAT(8,PIXMAN_TYPE_A,8,0,0,0),
+    PIXMAN_r3g3b2 =    PIXMAN_FORMAT(8,PIXMAN_TYPE_ARGB,0,3,3,2),
+    PIXMAN_b2g3r3 =    PIXMAN_FORMAT(8,PIXMAN_TYPE_ABGR,0,3,3,2),
+    PIXMAN_a2r2g2b2 =  PIXMAN_FORMAT(8,PIXMAN_TYPE_ARGB,2,2,2,2),
+    PIXMAN_a2b2g2r2 =  PIXMAN_FORMAT(8,PIXMAN_TYPE_ABGR,2,2,2,2),
+    
+    PIXMAN_c8 =                PIXMAN_FORMAT(8,PIXMAN_TYPE_COLOR,0,0,0,0),
+    PIXMAN_g8 =                PIXMAN_FORMAT(8,PIXMAN_TYPE_GRAY,0,0,0,0),
+    
+    PIXMAN_x4a4 =      PIXMAN_FORMAT(8,PIXMAN_TYPE_A,4,0,0,0),
+    
+    PIXMAN_x4c4 =      PIXMAN_FORMAT(8,PIXMAN_TYPE_COLOR,0,0,0,0),
+    PIXMAN_x4g4 =      PIXMAN_FORMAT(8,PIXMAN_TYPE_GRAY,0,0,0,0),
+    
 /* 4bpp formats */
-   PIXMAN_a4 =         PIXMAN_FORMAT(4,PIXMAN_TYPE_A,4,0,0,0),
-   PIXMAN_r1g2b1 =     PIXMAN_FORMAT(4,PIXMAN_TYPE_ARGB,0,1,2,1),
-   PIXMAN_b1g2r1 =     PIXMAN_FORMAT(4,PIXMAN_TYPE_ABGR,0,1,2,1),
-   PIXMAN_a1r1g1b1 =   PIXMAN_FORMAT(4,PIXMAN_TYPE_ARGB,1,1,1,1),
-   PIXMAN_a1b1g1r1 =   PIXMAN_FORMAT(4,PIXMAN_TYPE_ABGR,1,1,1,1),
-                                   
-   PIXMAN_c4 =         PIXMAN_FORMAT(4,PIXMAN_TYPE_COLOR,0,0,0,0),
-   PIXMAN_g4 =         PIXMAN_FORMAT(4,PIXMAN_TYPE_GRAY,0,0,0,0),
-
+    PIXMAN_a4 =                PIXMAN_FORMAT(4,PIXMAN_TYPE_A,4,0,0,0),
+    PIXMAN_r1g2b1 =    PIXMAN_FORMAT(4,PIXMAN_TYPE_ARGB,0,1,2,1),
+    PIXMAN_b1g2r1 =    PIXMAN_FORMAT(4,PIXMAN_TYPE_ABGR,0,1,2,1),
+    PIXMAN_a1r1g1b1 =  PIXMAN_FORMAT(4,PIXMAN_TYPE_ARGB,1,1,1,1),
+    PIXMAN_a1b1g1r1 =  PIXMAN_FORMAT(4,PIXMAN_TYPE_ABGR,1,1,1,1),
+    
+    PIXMAN_c4 =                PIXMAN_FORMAT(4,PIXMAN_TYPE_COLOR,0,0,0,0),
+    PIXMAN_g4 =                PIXMAN_FORMAT(4,PIXMAN_TYPE_GRAY,0,0,0,0),
+    
 /* 1bpp formats */
-   PIXMAN_a1 =         PIXMAN_FORMAT(1,PIXMAN_TYPE_A,1,0,0,0),
-
-   PIXMAN_g1 =         PIXMAN_FORMAT(1,PIXMAN_TYPE_GRAY,0,0,0,0),
+    PIXMAN_a1 =                PIXMAN_FORMAT(1,PIXMAN_TYPE_A,1,0,0,0),
+    
+    PIXMAN_g1 =                PIXMAN_FORMAT(1,PIXMAN_TYPE_GRAY,0,0,0,0),
 } pixman_format_code_t;
 
 typedef struct
@@ -332,10 +402,23 @@ void pixman_image_init_bits             (pixman_image_t       *image,
                                         pixman_format_code_t  format,
                                         int                   width,
                                         int                   height,
-                                        uint8_t              *bits,
-                                        int                   rowstride);
+                                        uint32_t             *bits,
+                                        int                   rowstride); /* in bytes */
 void pixman_image_set_clip_region       (pixman_image_t       *image,
                                         pixman_region16_t    *region);
+void pixman_image_composite            (pixman_op_t           op,
+                                        pixman_image_t       *src,
+                                        pixman_image_t       *mask,
+                                        pixman_image_t       *dest,
+                                        int                   src_x,
+                                        int                   src_y,
+                                        int                   mask_x,
+                                        int                   mask_y,
+                                        int                    dest_x,
+                                        int                    dest_y,
+                                        int                    width,
+                                        int                    height);
+
 
 
 #endif /* PIXMAN_H__ */