mmx: add add_0565_0565
diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index c81836a..e217ca3 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
 #include <config.h>
 #endif
 
-#include <mmintrin.h>
 #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
 #include <emmintrin.h> /* for SSE2 intrinsics */
 #include "pixman-private.h"
 #include "pixman-combine32.h"
-
-#if defined(_MSC_VER) && defined(_M_AMD64)
-/* Windows 64 doesn't allow MMX to be used, so
- * the pixman-x64-mmx-emulation.h file contains
- * implementations of those MMX intrinsics that
- * are used in the SSE2 implementation.
- */
-#   include "pixman-x64-mmx-emulation.h"
-#endif
-
-#ifdef USE_SSE2
-
-/* --------------------------------------------------------------------
- * Locals
- */
-
-static __m64 mask_x0080;
-static __m64 mask_x00ff;
-static __m64 mask_x0101;
-static __m64 mask_x_alpha;
-
-static __m64 mask_x565_rgb;
-static __m64 mask_x565_unpack;
+#include "pixman-inlines.h"
 
 static __m128i mask_0080;
 static __m128i mask_00ff;
@@ -76,9 +53,6 @@ static __m128i mask_blue;
 static __m128i mask_565_fix_rb;
 static __m128i mask_565_fix_g;
 
-/* ----------------------------------------------------------------------
- * SSE2 Inlines
- */
 static force_inline __m128i
 unpack_32_1x128 (uint32_t data)
 {
@@ -356,18 +330,6 @@ in_over_2x128 (__m128i* src_lo,
     over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
 }
 
-static force_inline void
-cache_prefetch (__m128i* addr)
-{
-    _mm_prefetch ((void const*)addr, _MM_HINT_T0);
-}
-
-static force_inline void
-cache_prefetch_next (__m128i* addr)
-{
-    _mm_prefetch ((void const *)(addr + 4), _MM_HINT_T0); /* 64 bytes ahead */
-}
-
 /* load 4 pixels from a 16-byte boundary aligned address */
 static force_inline __m128i
 load_128_aligned (__m128i* src)
@@ -408,140 +370,104 @@ save_128_unaligned (__m128i* dst,
     _mm_storeu_si128 (dst, data);
 }
 
-/* ------------------------------------------------------------------
- * MMX inlines
- */
-
-static force_inline __m64
-unpack_32_1x64 (uint32_t data)
-{
-    return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (data), _mm_setzero_si64 ());
-}
-
-static force_inline __m64
-expand_alpha_1x64 (__m64 data)
+static force_inline __m128i
+load_32_1x128 (uint32_t data)
 {
-    return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 3, 3, 3));
+    return _mm_cvtsi32_si128 (data);
 }
 
-static force_inline __m64
-expand_alpha_rev_1x64 (__m64 data)
+static force_inline __m128i
+expand_alpha_rev_1x128 (__m128i data)
 {
-    return _mm_shuffle_pi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
+    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
 }
 
-static force_inline __m64
-expand_pixel_8_1x64 (uint8_t data)
+static force_inline __m128i
+expand_pixel_8_1x128 (uint8_t data)
 {
-    return _mm_shuffle_pi16 (
-       unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
+    return _mm_shufflelo_epi16 (
+       unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
 }
 
-static force_inline __m64
-pix_multiply_1x64 (__m64 data,
-                   __m64 alpha)
+static force_inline __m128i
+pix_multiply_1x128 (__m128i data,
+                   __m128i alpha)
 {
-    return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (data, alpha),
-                                          mask_x0080),
-                           mask_x0101);
+    return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha),
+                                           mask_0080),
+                           mask_0101);
 }
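
/* An aside on the constants: mask_0080 holds 0x0080 and mask_0101 holds
 * 0x0101 in every 16-bit lane, so the sequence above computes the exact
 * rounded (data * alpha) / 255 per channel. A minimal scalar sketch of
 * the same trick (mul_un8 is a hypothetical helper for illustration, not
 * part of this patch):
 */
static inline uint8_t
mul_un8 (uint8_t x, uint8_t a)
{
    uint16_t t = (uint16_t)x * a + 0x80;    /* bias so truncation rounds */

    return (uint8_t)((t + (t >> 8)) >> 8);  /* == (t * 0x101) >> 16 */
}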
 
-static force_inline __m64
-pix_add_multiply_1x64 (__m64* src,
-                       __m64* alpha_dst,
-                       __m64* dst,
-                       __m64* alpha_src)
+static force_inline __m128i
+pix_add_multiply_1x128 (__m128i* src,
+                       __m128i* alpha_dst,
+                       __m128i* dst,
+                       __m128i* alpha_src)
 {
-    __m64 t1 = pix_multiply_1x64 (*src, *alpha_dst);
-    __m64 t2 = pix_multiply_1x64 (*dst, *alpha_src);
+    __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst);
+    __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src);
 
-    return _mm_adds_pu8 (t1, t2);
+    return _mm_adds_epu8 (t1, t2);
 }
 
-static force_inline __m64
-negate_1x64 (__m64 data)
+static force_inline __m128i
+negate_1x128 (__m128i data)
 {
-    return _mm_xor_si64 (data, mask_x00ff);
+    return _mm_xor_si128 (data, mask_00ff);
 }
 
-static force_inline __m64
-invert_colors_1x64 (__m64 data)
+static force_inline __m128i
+invert_colors_1x128 (__m128i data)
 {
-    return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
+    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
 }
 
-static force_inline __m64
-over_1x64 (__m64 src, __m64 alpha, __m64 dst)
+static force_inline __m128i
+over_1x128 (__m128i src, __m128i alpha, __m128i dst)
 {
-    return _mm_adds_pu8 (src, pix_multiply_1x64 (dst, negate_1x64 (alpha)));
+    return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
 }
 
-static force_inline __m64
-in_over_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst)
+static force_inline __m128i
+in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
 {
-    return over_1x64 (pix_multiply_1x64 (*src, *mask),
-                      pix_multiply_1x64 (*alpha, *mask),
-                      *dst);
+    return over_1x128 (pix_multiply_1x128 (*src, *mask),
+                      pix_multiply_1x128 (*alpha, *mask),
+                      *dst);
 }
 
-static force_inline __m64
-over_rev_non_pre_1x64 (__m64 src, __m64 dst)
+static force_inline __m128i
+over_rev_non_pre_1x128 (__m128i src, __m128i dst)
 {
-    __m64 alpha = expand_alpha_1x64 (src);
+    __m128i alpha = expand_alpha_1x128 (src);
 
-    return over_1x64 (pix_multiply_1x64 (invert_colors_1x64 (src),
-                                         _mm_or_si64 (alpha, mask_x_alpha)),
-                      alpha,
-                      dst);
+    return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src),
+                                          _mm_or_si128 (alpha, mask_alpha)),
+                      alpha,
+                      dst);
 }
 
 static force_inline uint32_t
-pack_1x64_32 (__m64 data)
+pack_1x128_32 (__m128i data)
 {
-    return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64 ()));
+    return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
 }
 
-/* Expand 16 bits positioned at @pos (0-3) of a mmx register into
- *
- *    00RR00GG00BB
- *
- * --- Expanding 565 in the low word ---
- *
- * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
- * m = m & (01f0003f001f);
- * m = m * (008404100840);
- * m = m >> 8;
- *
- * Note the trick here - the top word is shifted by another nibble to
- * avoid it bumping into the middle word
- */
-static force_inline __m64
-expand565_16_1x64 (uint16_t pixel)
+static force_inline __m128i
+expand565_16_1x128 (uint16_t pixel)
 {
-    __m64 p;
-    __m64 t1, t2;
-
-    p = _mm_cvtsi32_si64 ((uint32_t) pixel);
-
-    t1 = _mm_slli_si64 (p, 36 - 11);
-    t2 = _mm_slli_si64 (p, 16 - 5);
+    __m128i m = _mm_cvtsi32_si128 (pixel);
 
-    p = _mm_or_si64 (t1, p);
-    p = _mm_or_si64 (t2, p);
-    p = _mm_and_si64 (p, mask_x565_rgb);
-    p = _mm_mullo_pi16 (p, mask_x565_unpack);
+    m = unpack_565_to_8888 (m);
 
-    return _mm_srli_pi16 (p, 8);
+    return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
 }
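
/* unpack_565_to_8888 () widens the 5/6/5 fields and refills their low bits
 * from the high bits, so a full field (0x1f or 0x3f) expands to 0xff. A
 * scalar sketch of that conversion, assuming the usual bit-replication
 * scheme:
 */
static inline uint32_t
convert_0565_to_8888_sketch (uint16_t p)
{
    uint32_t r = (p >> 11) & 0x1f;
    uint32_t g = (p >>  5) & 0x3f;
    uint32_t b =  p        & 0x1f;

    /* replicate the high bits into the low bits */
    r = (r << 3) | (r >> 2);
    g = (g << 2) | (g >> 4);
    b = (b << 3) | (b >> 2);

    return 0xff000000 | (r << 16) | (g << 8) | b;
}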
 
-/* ----------------------------------------------------------------------------
- * Compose Core transformations
- */
 static force_inline uint32_t
 core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
 {
     uint8_t a;
-    __m64 ms;
+    __m128i xmms;
 
     a = src >> 24;
 
@@ -551,9 +477,10 @@ core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
     }
     else if (src)
     {
-       ms = unpack_32_1x64 (src);
-       return pack_1x64_32 (
-           over_1x64 (ms, expand_alpha_1x64 (ms), unpack_32_1x64 (dst)));
+       xmms = unpack_32_1x128 (src);
+       return pack_1x128_32 (
+           over_1x128 (xmms, expand_alpha_1x128 (xmms),
+                       unpack_32_1x128 (dst)));
     }
 
     return dst;
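
/* For reference, the general case above is the Porter-Duff OVER operator;
 * per 8-bit premultiplied channel (scalar sketch, reusing the hypothetical
 * mul_un8 from the earlier aside):
 *
 *     over = s + mul_un8 (d, ~src >> 24);
 */
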
@@ -566,15 +493,15 @@ combine1 (const uint32_t *ps, const uint32_t *pm)
 
     if (pm)
     {
-       __m64 ms, mm;
+       __m128i ms, mm;
 
-       mm = unpack_32_1x64 (*pm);
-       mm = expand_alpha_1x64 (mm);
+       mm = unpack_32_1x128 (*pm);
+       mm = expand_alpha_1x128 (mm);
 
-       ms = unpack_32_1x64 (s);
-       ms = pix_multiply_1x64 (ms, mm);
+       ms = unpack_32_1x128 (s);
+       ms = pix_multiply_1x128 (ms, mm);
 
-       s = pack_1x64_32 (ms);
+       s = pack_1x128_32 (ms);
     }
 
     return s;
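
/* A scalar sketch of what combine1 () computes when a mask is present:
 * every channel of the source, alpha included, is scaled by the mask
 * pixel's alpha (mul_un8 as in the earlier aside):
 *
 *     uint32_t a = *pm >> 24;
 *
 *     s = (mul_un8 (s >> 24, a)          << 24) |
 *         (mul_un8 ((s >> 16) & 0xff, a) << 16) |
 *         (mul_un8 ((s >>  8) & 0xff, a) <<  8) |
 *          mul_un8 (s & 0xff, a);
 */
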
@@ -615,101 +542,182 @@ combine4 (const __m128i *ps, const __m128i *pm)
 }
 
 static force_inline void
-core_combine_over_u_sse2 (uint32_t*       pd,
-                          const uint32_t* ps,
-                          const uint32_t* pm,
-                          int             w)
+core_combine_over_u_sse2_mask (uint32_t *        pd,
+                              const uint32_t*    ps,
+                              const uint32_t*    pm,
+                              int                w)
 {
     uint32_t s, d;
 
-    __m128i xmm_dst_lo, xmm_dst_hi;
-    __m128i xmm_src_lo, xmm_src_hi;
-    __m128i xmm_alpha_lo, xmm_alpha_hi;
-
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     /* Align dst on a 16-byte boundary */
     while (w && ((unsigned long)pd & 15))
     {
        d = *pd;
        s = combine1 (ps, pm);
 
-       *pd++ = core_combine_over_u_pixel_sse2 (s, d);
+       if (s)
+           *pd = core_combine_over_u_pixel_sse2 (s, d);
+       pd++;
        ps++;
-       if (pm)
-           pm++;
+       pm++;
        w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-       /* fill cache line with next memory */
-       cache_prefetch_next ((__m128i*)ps);
-       cache_prefetch_next ((__m128i*)pd);
-       cache_prefetch_next ((__m128i*)pm);
-
-       /* I'm loading unaligned because I'm not sure about
-        * the address alignment.
-        */
-       xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
+       __m128i mask = load_128_unaligned ((__m128i *)pm);
 
-       if (is_opaque (xmm_src_hi))
-       {
-           save_128_aligned ((__m128i*)pd, xmm_src_hi);
-       }
-       else if (!is_zero (xmm_src_hi))
+       if (!is_zero (mask))
        {
-           xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+           __m128i src;
+           __m128i src_hi, src_lo;
+           __m128i mask_hi, mask_lo;
+           __m128i alpha_hi, alpha_lo;
 
-           unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
-           unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+           src = load_128_unaligned ((__m128i *)ps);
 
-           expand_alpha_2x128 (
-               xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
+           if (is_opaque (_mm_and_si128 (src, mask)))
+           {
+               save_128_aligned ((__m128i *)pd, src);
+           }
+           else
+           {
+               __m128i dst = load_128_aligned ((__m128i *)pd);
+               __m128i dst_hi, dst_lo;
 
-           over_2x128 (&xmm_src_lo, &xmm_src_hi,
-                       &xmm_alpha_lo, &xmm_alpha_hi,
-                       &xmm_dst_lo, &xmm_dst_hi);
+               unpack_128_2x128 (mask, &mask_lo, &mask_hi);
+               unpack_128_2x128 (src, &src_lo, &src_hi);
 
-           /* rebuild the 4 pixel data and save */
-           save_128_aligned ((__m128i*)pd,
-                             pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+               expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
+               pix_multiply_2x128 (&src_lo, &src_hi,
+                                   &mask_lo, &mask_hi,
+                                   &src_lo, &src_hi);
+
+               unpack_128_2x128 (dst, &dst_lo, &dst_hi);
+
+               expand_alpha_2x128 (src_lo, src_hi,
+                                   &alpha_lo, &alpha_hi);
+
+               over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
+                           &dst_lo, &dst_hi);
+
+               save_128_aligned (
+                   (__m128i *)pd,
+                   pack_2x128_128 (dst_lo, dst_hi));
+           }
        }
 
-       w -= 4;
+       pm += 4;
        ps += 4;
        pd += 4;
-       if (pm)
-           pm += 4;
+       w -= 4;
     }
-
     while (w)
     {
        d = *pd;
        s = combine1 (ps, pm);
 
-       *pd++ = core_combine_over_u_pixel_sse2 (s, d);
+       if (s)
+           *pd = core_combine_over_u_pixel_sse2 (s, d);
+       pd++;
+       ps++;
+       pm++;
+
+       w--;
+    }
+}
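
/* The is_zero () / is_opaque () tests gate the transparent and opaque fast
 * paths above. They are defined elsewhere in this file; a plausible SSE2
 * sketch (an assumption for illustration, not quoted from the patch):
 */
static force_inline int
is_opaque_sketch (__m128i x)
{
    __m128i ffs = _mm_cmpeq_epi8 (x, x); /* all bits set */

    /* alpha sits in bytes 3, 7, 11 and 15, hence movemask bits 0x8888 */
    return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
}

static force_inline int
is_zero_sketch (__m128i x)
{
    return _mm_movemask_epi8 (
        _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
}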
+
+static force_inline void
+core_combine_over_u_sse2_no_mask (uint32_t *     pd,
+                                 const uint32_t*    ps,
+                                 int                w)
+{
+    uint32_t s, d;
+
+    /* Align dst on a 16-byte boundary */
+    while (w && ((unsigned long)pd & 15))
+    {
+       d = *pd;
+       s = *ps;
+
+       if (s)
+           *pd = core_combine_over_u_pixel_sse2 (s, d);
+       pd++;
+       ps++;
+       w--;
+    }
+
+    while (w >= 4)
+    {
+       __m128i src;
+       __m128i src_hi, src_lo, dst_hi, dst_lo;
+       __m128i alpha_hi, alpha_lo;
+
+       src = load_128_unaligned ((__m128i *)ps);
+
+       if (!is_zero (src))
+       {
+           if (is_opaque (src))
+           {
+               save_128_aligned ((__m128i *)pd, src);
+           }
+           else
+           {
+               __m128i dst = load_128_aligned ((__m128i *)pd);
+
+               unpack_128_2x128 (src, &src_lo, &src_hi);
+               unpack_128_2x128 (dst, &dst_lo, &dst_hi);
+
+               expand_alpha_2x128 (src_lo, src_hi,
+                                   &alpha_lo, &alpha_hi);
+               over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
+                           &dst_lo, &dst_hi);
+
+               save_128_aligned (
+                   (__m128i *)pd,
+                   pack_2x128_128 (dst_lo, dst_hi));
+           }
+       }
+
+       ps += 4;
+       pd += 4;
+       w -= 4;
+    }
+    while (w)
+    {
+       d = *pd;
+       s = *ps;
+
+       if (s)
+           *pd = core_combine_over_u_pixel_sse2 (s, d);
+       pd++;
        ps++;
-       if (pm)
-           pm++;
 
        w--;
     }
 }
 
 static force_inline void
-core_combine_over_reverse_u_sse2 (uint32_t*       pd,
-                                  const uint32_t* ps,
-                                  const uint32_t* pm,
-                                  int             w)
+sse2_combine_over_u (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               pd,
+                     const uint32_t *         ps,
+                     const uint32_t *         pm,
+                     int                      w)
+{
+    if (pm)
+       core_combine_over_u_sse2_mask (pd, ps, pm, w);
+    else
+       core_combine_over_u_sse2_no_mask (pd, ps, w);
+}
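
/* Splitting the combiner into mask/no-mask variants hoists the old
 * per-pixel "if (pm)" test out of the inner loops: each specialized loop
 * now only touches the streams it actually reads. */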
+
+static void
+sse2_combine_over_reverse_u (pixman_implementation_t *imp,
+                             pixman_op_t              op,
+                             uint32_t *               pd,
+                             const uint32_t *         ps,
+                             const uint32_t *         pm,
+                             int                      w)
 {
     uint32_t s, d;
 
@@ -717,11 +725,6 @@ core_combine_over_reverse_u_sse2 (uint32_t*       pd,
     __m128i xmm_src_lo, xmm_src_hi;
     __m128i xmm_alpha_lo, xmm_alpha_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     /* Align dst on a 16-byte boundary */
     while (w &&
            ((unsigned long)pd & 15))
@@ -736,18 +739,8 @@ core_combine_over_reverse_u_sse2 (uint32_t*       pd,
            pm++;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-       /* fill cache line with next memory */
-       cache_prefetch_next ((__m128i*)ps);
-       cache_prefetch_next ((__m128i*)pd);
-       cache_prefetch_next ((__m128i*)pm);
-
        /* I'm loading unaligned because I'm not sure
         * about the address alignment.
         */
@@ -790,7 +783,7 @@ core_combine_over_reverse_u_sse2 (uint32_t*       pd,
 }
 
 static force_inline uint32_t
-core_combine_in_u_pixelsse2 (uint32_t src, uint32_t dst)
+core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst)
 {
     uint32_t maska = src >> 24;
 
@@ -800,54 +793,41 @@ core_combine_in_u_pixelsse2 (uint32_t src, uint32_t dst)
     }
     else if (maska != 0xff)
     {
-       return pack_1x64_32 (
-           pix_multiply_1x64 (unpack_32_1x64 (dst),
-                              expand_alpha_1x64 (unpack_32_1x64 (src))));
+       return pack_1x128_32 (
+           pix_multiply_1x128 (unpack_32_1x128 (dst),
+                               expand_alpha_1x128 (unpack_32_1x128 (src))));
     }
 
     return dst;
 }
 
-static force_inline void
-core_combine_in_u_sse2 (uint32_t*       pd,
-                        const uint32_t* ps,
-                        const uint32_t* pm,
-                        int             w)
+static void
+sse2_combine_in_u (pixman_implementation_t *imp,
+                   pixman_op_t              op,
+                   uint32_t *               pd,
+                   const uint32_t *         ps,
+                   const uint32_t *         pm,
+                   int                      w)
 {
     uint32_t s, d;
 
     __m128i xmm_src_lo, xmm_src_hi;
     __m128i xmm_dst_lo, xmm_dst_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && ((unsigned long) pd & 15))
     {
        s = combine1 (ps, pm);
        d = *pd;
 
-       *pd++ = core_combine_in_u_pixelsse2 (d, s);
+       *pd++ = core_combine_in_u_pixel_sse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-       /* fill cache line with next memory */
-       cache_prefetch_next ((__m128i*)ps);
-       cache_prefetch_next ((__m128i*)pd);
-       cache_prefetch_next ((__m128i*)pm);
-
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);
        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
 
@@ -874,7 +854,7 @@ core_combine_in_u_sse2 (uint32_t*       pd,
        s = combine1 (ps, pm);
        d = *pd;
 
-       *pd++ = core_combine_in_u_pixelsse2 (d, s);
+       *pd++ = core_combine_in_u_pixel_sse2 (d, s);
        w--;
        ps++;
        if (pm)
@@ -882,46 +862,33 @@ core_combine_in_u_sse2 (uint32_t*       pd,
     }
 }
 
-static force_inline void
-core_combine_reverse_in_u_sse2 (uint32_t*       pd,
-                                const uint32_t* ps,
-                                const uint32_t *pm,
-                                int             w)
+static void
+sse2_combine_in_reverse_u (pixman_implementation_t *imp,
+                           pixman_op_t              op,
+                           uint32_t *               pd,
+                           const uint32_t *         ps,
+                           const uint32_t *         pm,
+                           int                      w)
 {
     uint32_t s, d;
 
     __m128i xmm_src_lo, xmm_src_hi;
     __m128i xmm_dst_lo, xmm_dst_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && ((unsigned long) pd & 15))
     {
        s = combine1 (ps, pm);
        d = *pd;
 
-       *pd++ = core_combine_in_u_pixelsse2 (s, d);
+       *pd++ = core_combine_in_u_pixel_sse2 (s, d);
        ps++;
        w--;
        if (pm)
            pm++;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-       /* fill cache line with next memory */
-       cache_prefetch_next ((__m128i*)ps);
-       cache_prefetch_next ((__m128i*)pd);
-       cache_prefetch_next ((__m128i*)pm);
-
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);
        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
 
@@ -948,7 +915,7 @@ core_combine_reverse_in_u_sse2 (uint32_t*       pd,
        s = combine1 (ps, pm);
        d = *pd;
 
-       *pd++ = core_combine_in_u_pixelsse2 (s, d);
+       *pd++ = core_combine_in_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
@@ -956,48 +923,35 @@ core_combine_reverse_in_u_sse2 (uint32_t*       pd,
     }
 }
 
-static force_inline void
-core_combine_reverse_out_u_sse2 (uint32_t*       pd,
-                                 const uint32_t* ps,
-                                 const uint32_t* pm,
-                                 int             w)
+static void
+sse2_combine_out_reverse_u (pixman_implementation_t *imp,
+                            pixman_op_t              op,
+                            uint32_t *               pd,
+                            const uint32_t *         ps,
+                            const uint32_t *         pm,
+                            int                      w)
 {
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && ((unsigned long) pd & 15))
     {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;
 
-       *pd++ = pack_1x64_32 (
-           pix_multiply_1x64 (
-               unpack_32_1x64 (d), negate_1x64 (
-                   expand_alpha_1x64 (unpack_32_1x64 (s)))));
-       
+       *pd++ = pack_1x128_32 (
+           pix_multiply_1x128 (
+               unpack_32_1x128 (d), negate_1x128 (
+                   expand_alpha_1x128 (unpack_32_1x128 (s)))));
+
        if (pm)
            pm++;
        ps++;
        w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
        __m128i xmm_src_lo, xmm_src_hi;
        __m128i xmm_dst_lo, xmm_dst_hi;
 
-       /* fill cache line with next memory */
-       cache_prefetch_next ((__m128i*)ps);
-       cache_prefetch_next ((__m128i*)pd);
-       cache_prefetch_next ((__m128i*)pm);
-
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);
 
@@ -1027,10 +981,10 @@ core_combine_reverse_out_u_sse2 (uint32_t*       pd,
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;
 
-       *pd++ = pack_1x64_32 (
-           pix_multiply_1x64 (
-               unpack_32_1x64 (d), negate_1x64 (
-                   expand_alpha_1x64 (unpack_32_1x64 (s)))));
+       *pd++ = pack_1x128_32 (
+           pix_multiply_1x128 (
+               unpack_32_1x128 (d), negate_1x128 (
+                   expand_alpha_1x128 (unpack_32_1x128 (s)))));
        ps++;
        if (pm)
            pm++;
@@ -1038,47 +992,34 @@ core_combine_reverse_out_u_sse2 (uint32_t*       pd,
     }
 }
 
-static force_inline void
-core_combine_out_u_sse2 (uint32_t*       pd,
-                         const uint32_t* ps,
-                         const uint32_t* pm,
-                         int             w)
+static void
+sse2_combine_out_u (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               pd,
+                    const uint32_t *         ps,
+                    const uint32_t *         pm,
+                    int                      w)
 {
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && ((unsigned long) pd & 15))
     {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;
 
-       *pd++ = pack_1x64_32 (
-           pix_multiply_1x64 (
-               unpack_32_1x64 (s), negate_1x64 (
-                   expand_alpha_1x64 (unpack_32_1x64 (d)))));
+       *pd++ = pack_1x128_32 (
+           pix_multiply_1x128 (
+               unpack_32_1x128 (s), negate_1x128 (
+                   expand_alpha_1x128 (unpack_32_1x128 (d)))));
        w--;
        ps++;
        if (pm)
            pm++;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
        __m128i xmm_src_lo, xmm_src_hi;
        __m128i xmm_dst_lo, xmm_dst_hi;
 
-       /* fill cache line with next memory */
-       cache_prefetch_next ((__m128i*)ps);
-       cache_prefetch_next ((__m128i*)pd);
-       cache_prefetch_next ((__m128i*)pm);
-
        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);
 
@@ -1107,10 +1048,10 @@ core_combine_out_u_sse2 (uint32_t*       pd,
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;
 
-       *pd++ = pack_1x64_32 (
-           pix_multiply_1x64 (
-               unpack_32_1x64 (s), negate_1x64 (
-                   expand_alpha_1x64 (unpack_32_1x64 (d)))));
+       *pd++ = pack_1x128_32 (
+           pix_multiply_1x128 (
+               unpack_32_1x128 (s), negate_1x128 (
+                   expand_alpha_1x128 (unpack_32_1x128 (d)))));
        w--;
        ps++;
        if (pm)
@@ -1122,20 +1063,22 @@ static force_inline uint32_t
 core_combine_atop_u_pixel_sse2 (uint32_t src,
                                 uint32_t dst)
 {
-    __m64 s = unpack_32_1x64 (src);
-    __m64 d = unpack_32_1x64 (dst);
+    __m128i s = unpack_32_1x128 (src);
+    __m128i d = unpack_32_1x128 (dst);
 
-    __m64 sa = negate_1x64 (expand_alpha_1x64 (s));
-    __m64 da = expand_alpha_1x64 (d);
+    __m128i sa = negate_1x128 (expand_alpha_1x128 (s));
+    __m128i da = expand_alpha_1x128 (d);
 
-    return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
+    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
 }
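
/* Reference formula for ATOP (per 8-bit premultiplied channel; scalar
 * sketch with the hypothetical mul_un8):
 *
 *     atop = mul_un8 (s, da) + mul_un8 (d, 255 - sa);
 *
 * pix_add_multiply_1x128 () forms both products and combines them with a
 * saturating add. */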
 
-static force_inline void
-core_combine_atop_u_sse2 (uint32_t*       pd,
-                          const uint32_t* ps,
-                          const uint32_t* pm,
-                          int             w)
+static void
+sse2_combine_atop_u (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               pd,
+                     const uint32_t *         ps,
+                     const uint32_t *         pm,
+                     int                      w)
 {
     uint32_t s, d;
 
@@ -1144,11 +1087,6 @@ core_combine_atop_u_sse2 (uint32_t*       pd,
     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && ((unsigned long) pd & 15))
     {
        s = combine1 (ps, pm);
@@ -1161,18 +1099,8 @@ core_combine_atop_u_sse2 (uint32_t*       pd,
            pm++;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-       /* fill cache line with next memory */
-       cache_prefetch_next ((__m128i*)ps);
-       cache_prefetch_next ((__m128i*)pd);
-       cache_prefetch_next ((__m128i*)pm);
-
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);
 
@@ -1219,20 +1147,22 @@ static force_inline uint32_t
 core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
                                         uint32_t dst)
 {
-    __m64 s = unpack_32_1x64 (src);
-    __m64 d = unpack_32_1x64 (dst);
+    __m128i s = unpack_32_1x128 (src);
+    __m128i d = unpack_32_1x128 (dst);
 
-    __m64 sa = expand_alpha_1x64 (s);
-    __m64 da = negate_1x64 (expand_alpha_1x64 (d));
+    __m128i sa = expand_alpha_1x128 (s);
+    __m128i da = negate_1x128 (expand_alpha_1x128 (d));
 
-    return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
+    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
 }
 
-static force_inline void
-core_combine_reverse_atop_u_sse2 (uint32_t*       pd,
-                                  const uint32_t* ps,
-                                  const uint32_t* pm,
-                                  int             w)
+static void
+sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
+                             pixman_op_t              op,
+                             uint32_t *               pd,
+                             const uint32_t *         ps,
+                             const uint32_t *         pm,
+                             int                      w)
 {
     uint32_t s, d;
 
@@ -1241,11 +1171,6 @@ core_combine_reverse_atop_u_sse2 (uint32_t*       pd,
     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && ((unsigned long) pd & 15))
     {
        s = combine1 (ps, pm);
@@ -1258,18 +1183,8 @@ core_combine_reverse_atop_u_sse2 (uint32_t*       pd,
            pm++;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-       /* fill cache line with next memory */
-       cache_prefetch_next ((__m128i*)ps);
-       cache_prefetch_next ((__m128i*)pd);
-       cache_prefetch_next ((__m128i*)pm);
-
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);
 
@@ -1316,20 +1231,22 @@ static force_inline uint32_t
 core_combine_xor_u_pixel_sse2 (uint32_t src,
                                uint32_t dst)
 {
-    __m64 s = unpack_32_1x64 (src);
-    __m64 d = unpack_32_1x64 (dst);
+    __m128i s = unpack_32_1x128 (src);
+    __m128i d = unpack_32_1x128 (dst);
 
-    __m64 neg_d = negate_1x64 (expand_alpha_1x64 (d));
-    __m64 neg_s = negate_1x64 (expand_alpha_1x64 (s));
+    __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d));
+    __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s));
 
-    return pack_1x64_32 (pix_add_multiply_1x64 (&s, &neg_d, &d, &neg_s));
+    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s));
 }
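
/* Reference formula for XOR (per 8-bit premultiplied channel; scalar
 * sketch):
 *
 *     xor = mul_un8 (s, 255 - da) + mul_un8 (d, 255 - sa);
 */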
 
-static force_inline void
-core_combine_xor_u_sse2 (uint32_t*       dst,
-                         const uint32_t* src,
-                         const uint32_t *mask,
-                         int             width)
+static void
+sse2_combine_xor_u (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dst,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
 {
     int w = width;
     uint32_t s, d;
@@ -1342,11 +1259,6 @@ core_combine_xor_u_sse2 (uint32_t*       dst,
     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && ((unsigned long) pd & 15))
     {
        s = combine1 (ps, pm);
@@ -1359,18 +1271,8 @@ core_combine_xor_u_sse2 (uint32_t*       dst,
            pm++;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-       /* fill cache line with next memory */
-       cache_prefetch_next ((__m128i*)ps);
-       cache_prefetch_next ((__m128i*)pd);
-       cache_prefetch_next ((__m128i*)pm);
-
        xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
        xmm_dst = load_128_aligned ((__m128i*) pd);
 
@@ -1416,10 +1318,12 @@ core_combine_xor_u_sse2 (uint32_t*       dst,
 }
 
 static force_inline void
-core_combine_add_u_sse2 (uint32_t*       dst,
-                         const uint32_t* src,
-                         const uint32_t* mask,
-                         int             width)
+sse2_combine_add_u (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dst,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
 {
     int w = width;
     uint32_t s, d;
@@ -1427,11 +1331,6 @@ core_combine_add_u_sse2 (uint32_t*       dst,
     const uint32_t* ps = src;
     const uint32_t* pm = mask;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
        s = combine1 (ps, pm);
@@ -1440,25 +1339,15 @@ core_combine_add_u_sse2 (uint32_t*       dst,
        ps++;
        if (pm)
            pm++;
-       *pd++ = _mm_cvtsi64_si32 (
-           _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
+       *pd++ = _mm_cvtsi128_si32 (
+           _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
        w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
        __m128i s;
 
-       /* fill cache line with next memory */
-       cache_prefetch_next ((__m128i*)ps);
-       cache_prefetch_next ((__m128i*)pd);
-       cache_prefetch_next ((__m128i*)pm);
-
        s = combine4 ((__m128i*)ps, (__m128i*)pm);
 
        save_128_aligned (
@@ -1477,8 +1366,8 @@ core_combine_add_u_sse2 (uint32_t*       dst,
        d = *pd;
 
        ps++;
-       *pd++ = _mm_cvtsi64_si32 (
-           _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
+       *pd++ = _mm_cvtsi128_si32 (
+           _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
        if (pm)
            pm++;
     }
@@ -1488,36 +1377,33 @@ static force_inline uint32_t
 core_combine_saturate_u_pixel_sse2 (uint32_t src,
                                     uint32_t dst)
 {
-    __m64 ms = unpack_32_1x64 (src);
-    __m64 md = unpack_32_1x64 (dst);
+    __m128i ms = unpack_32_1x128 (src);
+    __m128i md = unpack_32_1x128 (dst);
     uint32_t sa = src >> 24;
     uint32_t da = ~dst >> 24;
 
     if (sa > da)
     {
-       ms = pix_multiply_1x64 (
-           ms, expand_alpha_1x64 (unpack_32_1x64 (DIV_UN8 (da, sa) << 24)));
+       ms = pix_multiply_1x128 (
+           ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24)));
     }
 
-    return pack_1x64_32 (_mm_adds_pu16 (md, ms));
+    return pack_1x128_32 (_mm_adds_epu16 (md, ms));
 }
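
/* Note that da above is the destination's remaining headroom (255 minus
 * its alpha). When the source alpha exceeds it, every source channel is
 * pre-scaled so the final saturating add can no longer clip (scalar
 * sketch):
 *
 *     if (sa > da)
 *         s = mul_un8 (s, DIV_UN8 (da, sa));
 *     result = d + s;   /* per channel, saturating */
 */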
 
-static force_inline void
-core_combine_saturate_u_sse2 (uint32_t *      pd,
-                              const uint32_t *ps,
-                              const uint32_t *pm,
-                              int             w)
+static void
+sse2_combine_saturate_u (pixman_implementation_t *imp,
+                         pixman_op_t              op,
+                         uint32_t *               pd,
+                         const uint32_t *         ps,
+                         const uint32_t *         pm,
+                         int                      w)
 {
     uint32_t s, d;
 
     uint32_t pack_cmp;
     __m128i xmm_src, xmm_dst;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
        s = combine1 (ps, pm);
@@ -1530,18 +1416,8 @@ core_combine_saturate_u_sse2 (uint32_t *      pd,
            pm++;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-       /* fill cache line with next memory */
-       cache_prefetch_next ((__m128i*)ps);
-       cache_prefetch_next ((__m128i*)pd);
-       cache_prefetch_next ((__m128i*)pm);
-
        xmm_dst = load_128_aligned  ((__m128i*)pd);
        xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
 
@@ -1602,11 +1478,13 @@ core_combine_saturate_u_sse2 (uint32_t *      pd,
     }
 }
 
-static force_inline void
-core_combine_src_ca_sse2 (uint32_t*       pd,
-                          const uint32_t* ps,
-                          const uint32_t *pm,
-                          int             w)
+static void
+sse2_combine_src_ca (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               pd,
+                     const uint32_t *         ps,
+                     const uint32_t *         pm,
+                     int                      w)
 {
     uint32_t s, m;
 
@@ -1614,32 +1492,17 @@ core_combine_src_ca_sse2 (uint32_t*       pd,
     __m128i xmm_mask_lo, xmm_mask_hi;
     __m128i xmm_dst_lo, xmm_dst_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
        s = *ps++;
        m = *pm++;
-       *pd++ = pack_1x64_32 (
-           pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
+       *pd++ = pack_1x128_32 (
+           pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
        w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-       /* fill cache line with next memory */
-       cache_prefetch_next ((__m128i*)ps);
-       cache_prefetch_next ((__m128i*)pd);
-       cache_prefetch_next ((__m128i*)pm);
-
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
 
@@ -1663,8 +1526,8 @@ core_combine_src_ca_sse2 (uint32_t*       pd,
     {
        s = *ps++;
        m = *pm++;
-       *pd++ = pack_1x64_32 (
-           pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
+       *pd++ = pack_1x128_32 (
+           pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
        w--;
     }
 }
@@ -1674,19 +1537,21 @@ core_combine_over_ca_pixel_sse2 (uint32_t src,
                                  uint32_t mask,
                                  uint32_t dst)
 {
-    __m64 s = unpack_32_1x64 (src);
-    __m64 expAlpha = expand_alpha_1x64 (s);
-    __m64 unpk_mask = unpack_32_1x64 (mask);
-    __m64 unpk_dst  = unpack_32_1x64 (dst);
+    __m128i s = unpack_32_1x128 (src);
+    __m128i expAlpha = expand_alpha_1x128 (s);
+    __m128i unpk_mask = unpack_32_1x128 (mask);
+    __m128i unpk_dst  = unpack_32_1x128 (dst);
 
-    return pack_1x64_32 (in_over_1x64 (&s, &expAlpha, &unpk_mask, &unpk_dst));
+    return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst));
 }
 
-static force_inline void
-core_combine_over_ca_sse2 (uint32_t*       pd,
-                           const uint32_t* ps,
-                           const uint32_t *pm,
-                           int             w)
+static void
+sse2_combine_over_ca (pixman_implementation_t *imp,
+                      pixman_op_t              op,
+                      uint32_t *               pd,
+                      const uint32_t *         ps,
+                      const uint32_t *         pm,
+                      int                      w)
 {
     uint32_t s, m, d;
 
@@ -1695,11 +1560,6 @@ core_combine_over_ca_sse2 (uint32_t*       pd,
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
        s = *ps++;
@@ -1710,18 +1570,8 @@ core_combine_over_ca_sse2 (uint32_t*       pd,
        w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-       /* fill cache line with next memory */
-       cache_prefetch_next ((__m128i*)ps);
-       cache_prefetch_next ((__m128i*)pd);
-       cache_prefetch_next ((__m128i*)pm);
-
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -1763,19 +1613,21 @@ core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
                                          uint32_t mask,
                                          uint32_t dst)
 {
-    __m64 d = unpack_32_1x64 (dst);
+    __m128i d = unpack_32_1x128 (dst);
 
-    return pack_1x64_32 (
-       over_1x64 (d, expand_alpha_1x64 (d),
-                  pix_multiply_1x64 (unpack_32_1x64 (src),
-                                     unpack_32_1x64 (mask))));
+    return pack_1x128_32 (
+       over_1x128 (d, expand_alpha_1x128 (d),
+                   pix_multiply_1x128 (unpack_32_1x128 (src),
+                                       unpack_32_1x128 (mask))));
 }
 
-static force_inline void
-core_combine_over_reverse_ca_sse2 (uint32_t*       pd,
-                                   const uint32_t* ps,
-                                   const uint32_t *pm,
-                                   int             w)
+static void
+sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
+                              pixman_op_t              op,
+                              uint32_t *               pd,
+                              const uint32_t *         ps,
+                              const uint32_t *         pm,
+                              int                      w)
 {
     uint32_t s, m, d;
 
@@ -1784,11 +1636,6 @@ core_combine_over_reverse_ca_sse2 (uint32_t*       pd,
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
        s = *ps++;
@@ -1799,18 +1646,8 @@ core_combine_over_reverse_ca_sse2 (uint32_t*       pd,
        w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-       /* fill cache line with next memory */
-       cache_prefetch_next ((__m128i*)ps);
-       cache_prefetch_next ((__m128i*)pd);
-       cache_prefetch_next ((__m128i*)pm);
-
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -1849,11 +1686,13 @@ core_combine_over_reverse_ca_sse2 (uint32_t*       pd,
     }
 }
 
-static force_inline void
-core_combine_in_ca_sse2 (uint32_t *      pd,
-                         const uint32_t *ps,
-                         const uint32_t *pm,
-                         int             w)
+static void
+sse2_combine_in_ca (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               pd,
+                    const uint32_t *         ps,
+                    const uint32_t *         pm,
+                    int                      w)
 {
     uint32_t s, m, d;
 
@@ -1862,37 +1701,22 @@ core_combine_in_ca_sse2 (uint32_t *      pd,
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
        s = *ps++;
        m = *pm++;
        d = *pd;
 
-       *pd++ = pack_1x64_32 (
-           pix_multiply_1x64 (
-               pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
-               expand_alpha_1x64 (unpack_32_1x64 (d))));
+       *pd++ = pack_1x128_32 (
+           pix_multiply_1x128 (
+               pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)),
+               expand_alpha_1x128 (unpack_32_1x128 (d))));
 
        w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-       /* fill cache line with next memory */
-       cache_prefetch_next ((__m128i*)ps);
-       cache_prefetch_next ((__m128i*)pd);
-       cache_prefetch_next ((__m128i*)pm);
-
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -1927,21 +1751,23 @@ core_combine_in_ca_sse2 (uint32_t *      pd,
        m = *pm++;
        d = *pd;
 
-       *pd++ = pack_1x64_32 (
-           pix_multiply_1x64 (
-               pix_multiply_1x64 (
-                   unpack_32_1x64 (s), unpack_32_1x64 (m)),
-               expand_alpha_1x64 (unpack_32_1x64 (d))));
+       *pd++ = pack_1x128_32 (
+           pix_multiply_1x128 (
+               pix_multiply_1x128 (
+                   unpack_32_1x128 (s), unpack_32_1x128 (m)),
+               expand_alpha_1x128 (unpack_32_1x128 (d))));
 
        w--;
     }
 }
 
-static force_inline void
-core_combine_in_reverse_ca_sse2 (uint32_t *      pd,
-                                 const uint32_t *ps,
-                                 const uint32_t *pm,
-                                 int             w)
+static void
+sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
+                            pixman_op_t              op,
+                            uint32_t *               pd,
+                            const uint32_t *         ps,
+                            const uint32_t *         pm,
+                            int                      w)
 {
     uint32_t s, m, d;
 
@@ -1950,37 +1776,22 @@ core_combine_in_reverse_ca_sse2 (uint32_t *      pd,
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
        s = *ps++;
        m = *pm++;
        d = *pd;
 
-       *pd++ = pack_1x64_32 (
-           pix_multiply_1x64 (
-               unpack_32_1x64 (d),
-               pix_multiply_1x64 (unpack_32_1x64 (m),
-                                  expand_alpha_1x64 (unpack_32_1x64 (s)))));
+       *pd++ = pack_1x128_32 (
+           pix_multiply_1x128 (
+               unpack_32_1x128 (d),
+               pix_multiply_1x128 (unpack_32_1x128 (m),
+                                  expand_alpha_1x128 (unpack_32_1x128 (s)))));
        w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-       /* fill cache line with next memory */
-       cache_prefetch_next ((__m128i*)ps);
-       cache_prefetch_next ((__m128i*)pd);
-       cache_prefetch_next ((__m128i*)pm);
-
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2014,20 +1825,22 @@ core_combine_in_reverse_ca_sse2 (uint32_t *      pd,
        m = *pm++;
        d = *pd;
 
-       *pd++ = pack_1x64_32 (
-           pix_multiply_1x64 (
-               unpack_32_1x64 (d),
-               pix_multiply_1x64 (unpack_32_1x64 (m),
-                                  expand_alpha_1x64 (unpack_32_1x64 (s)))));
+       *pd++ = pack_1x128_32 (
+           pix_multiply_1x128 (
+               unpack_32_1x128 (d),
+               pix_multiply_1x128 (unpack_32_1x128 (m),
+                                  expand_alpha_1x128 (unpack_32_1x128 (s)))));
        w--;
     }
 }
 
-static force_inline void
-core_combine_out_ca_sse2 (uint32_t *      pd,
-                          const uint32_t *ps,
-                          const uint32_t *pm,
-                          int             w)
+static void
+sse2_combine_out_ca (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               pd,
+                     const uint32_t *         ps,
+                     const uint32_t *         pm,
+                     int                      w)
 {
     uint32_t s, m, d;
 
@@ -2036,37 +1849,22 @@ core_combine_out_ca_sse2 (uint32_t *      pd,
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
        s = *ps++;
        m = *pm++;
        d = *pd;
 
-       *pd++ = pack_1x64_32 (
-           pix_multiply_1x64 (
-               pix_multiply_1x64 (
-                   unpack_32_1x64 (s), unpack_32_1x64 (m)),
-               negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
+       *pd++ = pack_1x128_32 (
+           pix_multiply_1x128 (
+               pix_multiply_1x128 (
+                   unpack_32_1x128 (s), unpack_32_1x128 (m)),
+               negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
        w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-       /* fill cache line with next memory */
-       cache_prefetch_next ((__m128i*)ps);
-       cache_prefetch_next ((__m128i*)pd);
-       cache_prefetch_next ((__m128i*)pm);
-
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2102,21 +1900,23 @@ core_combine_out_ca_sse2 (uint32_t *      pd,
        m = *pm++;
        d = *pd;
 
-       *pd++ = pack_1x64_32 (
-           pix_multiply_1x64 (
-               pix_multiply_1x64 (
-                   unpack_32_1x64 (s), unpack_32_1x64 (m)),
-               negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
+       *pd++ = pack_1x128_32 (
+           pix_multiply_1x128 (
+               pix_multiply_1x128 (
+                   unpack_32_1x128 (s), unpack_32_1x128 (m)),
+               negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
 
        w--;
     }
 }
 
-static force_inline void
-core_combine_out_reverse_ca_sse2 (uint32_t *      pd,
-                                  const uint32_t *ps,
-                                  const uint32_t *pm,
-                                  int             w)
+static void
+sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
+                             pixman_op_t              op,
+                             uint32_t *               pd,
+                             const uint32_t *         ps,
+                             const uint32_t *         pm,
+                             int                      w)
 {
     uint32_t s, m, d;
 
@@ -2125,38 +1925,23 @@ core_combine_out_reverse_ca_sse2 (uint32_t *      pd,
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
        s = *ps++;
        m = *pm++;
        d = *pd;
 
-       *pd++ = pack_1x64_32 (
-           pix_multiply_1x64 (
-               unpack_32_1x64 (d),
-               negate_1x64 (pix_multiply_1x64 (
-                                unpack_32_1x64 (m),
-                                expand_alpha_1x64 (unpack_32_1x64 (s))))));
+       *pd++ = pack_1x128_32 (
+           pix_multiply_1x128 (
+               unpack_32_1x128 (d),
+               negate_1x128 (pix_multiply_1x128 (
+                                unpack_32_1x128 (m),
+                                expand_alpha_1x128 (unpack_32_1x128 (s))))));
        w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-       /* fill cache line with next memory */
-       cache_prefetch_next ((__m128i*)ps);
-       cache_prefetch_next ((__m128i*)pd);
-       cache_prefetch_next ((__m128i*)pm);
-
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2194,12 +1979,12 @@ core_combine_out_reverse_ca_sse2 (uint32_t *      pd,
        m = *pm++;
        d = *pd;
 
-       *pd++ = pack_1x64_32 (
-           pix_multiply_1x64 (
-               unpack_32_1x64 (d),
-               negate_1x64 (pix_multiply_1x64 (
-                                unpack_32_1x64 (m),
-                                expand_alpha_1x64 (unpack_32_1x64 (s))))));
+       *pd++ = pack_1x128_32 (
+           pix_multiply_1x128 (
+               unpack_32_1x128 (d),
+               negate_1x128 (pix_multiply_1x128 (
+                                unpack_32_1x128 (m),
+                                expand_alpha_1x128 (unpack_32_1x128 (s))))));
        w--;
     }
 }
@@ -2209,23 +1994,25 @@ core_combine_atop_ca_pixel_sse2 (uint32_t src,
                                  uint32_t mask,
                                  uint32_t dst)
 {
-    __m64 m = unpack_32_1x64 (mask);
-    __m64 s = unpack_32_1x64 (src);
-    __m64 d = unpack_32_1x64 (dst);
-    __m64 sa = expand_alpha_1x64 (s);
-    __m64 da = expand_alpha_1x64 (d);
+    __m128i m = unpack_32_1x128 (mask);
+    __m128i s = unpack_32_1x128 (src);
+    __m128i d = unpack_32_1x128 (dst);
+    __m128i sa = expand_alpha_1x128 (s);
+    __m128i da = expand_alpha_1x128 (d);
 
-    s = pix_multiply_1x64 (s, m);
-    m = negate_1x64 (pix_multiply_1x64 (m, sa));
+    s = pix_multiply_1x128 (s, m);
+    m = negate_1x128 (pix_multiply_1x128 (m, sa));
 
-    return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
+    return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
 }
 
-static force_inline void
-core_combine_atop_ca_sse2 (uint32_t *      pd,
-                           const uint32_t *ps,
-                           const uint32_t *pm,
-                           int             w)
+static void
+sse2_combine_atop_ca (pixman_implementation_t *imp,
+                      pixman_op_t              op,
+                      uint32_t *               pd,
+                      const uint32_t *         ps,
+                      const uint32_t *         pm,
+                      int                      w)
 {
     uint32_t s, m, d;
 
@@ -2235,11 +2022,6 @@ core_combine_atop_ca_sse2 (uint32_t *      pd,
     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
        s = *ps++;
@@ -2250,18 +2032,8 @@ core_combine_atop_ca_sse2 (uint32_t *      pd,
        w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-       /* fill cache line with next memory */
-       cache_prefetch_next ((__m128i*)ps);
-       cache_prefetch_next ((__m128i*)pd);
-       cache_prefetch_next ((__m128i*)pm);
-
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2314,24 +2086,26 @@ core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
                                          uint32_t mask,
                                          uint32_t dst)
 {
-    __m64 m = unpack_32_1x64 (mask);
-    __m64 s = unpack_32_1x64 (src);
-    __m64 d = unpack_32_1x64 (dst);
+    __m128i m = unpack_32_1x128 (mask);
+    __m128i s = unpack_32_1x128 (src);
+    __m128i d = unpack_32_1x128 (dst);
 
-    __m64 da = negate_1x64 (expand_alpha_1x64 (d));
-    __m64 sa = expand_alpha_1x64 (s);
+    __m128i da = negate_1x128 (expand_alpha_1x128 (d));
+    __m128i sa = expand_alpha_1x128 (s);
 
-    s = pix_multiply_1x64 (s, m);
-    m = pix_multiply_1x64 (m, sa);
+    s = pix_multiply_1x128 (s, m);
+    m = pix_multiply_1x128 (m, sa);
 
-    return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
+    return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
 }
 
-static force_inline void
-core_combine_reverse_atop_ca_sse2 (uint32_t *      pd,
-                                   const uint32_t *ps,
-                                   const uint32_t *pm,
-                                   int             w)
+static void
+sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
+                              pixman_op_t              op,
+                              uint32_t *               pd,
+                              const uint32_t *         ps,
+                              const uint32_t *         pm,
+                              int                      w)
 {
     uint32_t s, m, d;
 
@@ -2341,11 +2115,6 @@ core_combine_reverse_atop_ca_sse2 (uint32_t *      pd,
     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
        s = *ps++;
@@ -2356,18 +2125,8 @@ core_combine_reverse_atop_ca_sse2 (uint32_t *      pd,
        w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-       /* fill cache line with next memory */
-       cache_prefetch_next ((__m128i*)ps);
-       cache_prefetch_next ((__m128i*)pd);
-       cache_prefetch_next ((__m128i*)pm);
-
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2421,26 +2180,28 @@ core_combine_xor_ca_pixel_sse2 (uint32_t src,
                                 uint32_t mask,
                                 uint32_t dst)
 {
-    __m64 a = unpack_32_1x64 (mask);
-    __m64 s = unpack_32_1x64 (src);
-    __m64 d = unpack_32_1x64 (dst);
+    __m128i a = unpack_32_1x128 (mask);
+    __m128i s = unpack_32_1x128 (src);
+    __m128i d = unpack_32_1x128 (dst);
 
-    __m64 alpha_dst = negate_1x64 (pix_multiply_1x64 (
-                                      a, expand_alpha_1x64 (s)));
-    __m64 dest      = pix_multiply_1x64 (s, a);
-    __m64 alpha_src = negate_1x64 (expand_alpha_1x64 (d));
+    __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 (
+                                      a, expand_alpha_1x128 (s)));
+    __m128i dest      = pix_multiply_1x128 (s, a);
+    __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d));
 
-    return pack_1x64_32 (pix_add_multiply_1x64 (&d,
+    return pack_1x128_32 (pix_add_multiply_1x128 (&d,
                                                 &alpha_dst,
                                                 &dest,
                                                 &alpha_src));
 }
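
Seen side by side, the three pix_add_multiply_1x128-based pixel helpers in this region differ only in the two factors fed to the add-multiply. A summary of the algebra, writing ~x for 255 - x and s' for the mask-premultiplied source s*m:

    /*
     * ATOP:          result = s' * da   +  d * ~(m * sa)
     * ATOP_REVERSE:  result = s' * ~da  +  d *  (m * sa)
     * XOR:           result = s' * ~da  +  d * ~(m * sa)
     *
     * All products are 8-bit fixed point (mul_un8 above); each channel
     * sum is clamped to 255 when packed back down to 8 bits.
     */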
 
-static force_inline void
-core_combine_xor_ca_sse2 (uint32_t *      pd,
-                          const uint32_t *ps,
-                          const uint32_t *pm,
-                          int             w)
+static void
+sse2_combine_xor_ca (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               pd,
+                     const uint32_t *         ps,
+                     const uint32_t *         pm,
+                     int                      w)
 {
     uint32_t s, m, d;
 
@@ -2450,11 +2211,6 @@ core_combine_xor_ca_sse2 (uint32_t *      pd,
     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
        s = *ps++;
@@ -2465,18 +2221,8 @@ core_combine_xor_ca_sse2 (uint32_t *      pd,
        w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-       /* fill cache line with next memory */
-       cache_prefetch_next ((__m128i*)ps);
-       cache_prefetch_next ((__m128i*)pd);
-       cache_prefetch_next ((__m128i*)pm);
-
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2527,11 +2273,13 @@ core_combine_xor_ca_sse2 (uint32_t *      pd,
     }
 }
 
-static force_inline void
-core_combine_add_ca_sse2 (uint32_t *      pd,
-                          const uint32_t *ps,
-                          const uint32_t *pm,
-                          int             w)
+static void
+sse2_combine_add_ca (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               pd,
+                     const uint32_t *         ps,
+                     const uint32_t *         pm,
+                     int                      w)
 {
     uint32_t s, m, d;
 
@@ -2539,36 +2287,21 @@ core_combine_add_ca_sse2 (uint32_t *      pd,
     __m128i xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask_lo, xmm_mask_hi;
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w && (unsigned long)pd & 15)
     {
        s = *ps++;
        m = *pm++;
        d = *pd;
 
-       *pd++ = pack_1x64_32 (
-           _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
-                                            unpack_32_1x64 (m)),
-                         unpack_32_1x64 (d)));
+       *pd++ = pack_1x128_32 (
+           _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
+                                              unpack_32_1x128 (m)),
+                          unpack_32_1x128 (d)));
        w--;
     }
 
-    /* call prefetch hint to optimize cache load*/
-    cache_prefetch ((__m128i*)ps);
-    cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
-
     while (w >= 4)
     {
-       /* fill cache line with next memory */
-       cache_prefetch_next ((__m128i*)ps);
-       cache_prefetch_next ((__m128i*)pd);
-       cache_prefetch_next ((__m128i*)pm);
-
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
@@ -2598,342 +2331,52 @@ core_combine_add_ca_sse2 (uint32_t *      pd,
        m = *pm++;
        d = *pd;
 
-       *pd++ = pack_1x64_32 (
-           _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
-                                            unpack_32_1x64 (m)),
-                         unpack_32_1x64 (d)));
+       *pd++ = pack_1x128_32 (
+           _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
+                                              unpack_32_1x128 (m)),
+                          unpack_32_1x128 (d)));
        w--;
     }
 }
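
The ADD combine is the simplest of the component-alpha family: each channel is saturate(s*m + d), which the _mm_adds_epu8 above computes for four pixels at a time. A scalar sketch (name illustrative, mul_un8 as before):

    /* One pixel of ADD with component alpha. */
    static uint32_t
    add_ca_scalar (uint32_t src, uint32_t mask, uint32_t dst)
    {
        uint32_t res = 0;
        int shift;

        for (shift = 0; shift < 32; shift += 8)
        {
            uint8_t s = (src >> shift) & 0xff;
            uint8_t m = (mask >> shift) & 0xff;
            uint8_t d = (dst >> shift) & 0xff;
            unsigned v = mul_un8 (s, m) + d;

            res |= (uint32_t)(v > 255 ? 255 : v) << shift;
        }
        return res;
    }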
 
-/* ---------------------------------------------------
- * fb_compose_setup_sSE2
- */
-static force_inline __m64
-create_mask_16_64 (uint16_t mask)
-{
-    return _mm_set1_pi16 (mask);
-}
-
 static force_inline __m128i
 create_mask_16_128 (uint16_t mask)
 {
     return _mm_set1_epi16 (mask);
 }
 
-static force_inline __m64
-create_mask_2x32_64 (uint32_t mask0,
-                     uint32_t mask1)
-{
-    return _mm_set_pi32 (mask0, mask1);
-}
-
+/* Work around a code generation bug in Sun Studio 12. */
+#if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
+# define create_mask_2x32_128(mask0, mask1)                            \
+    (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
+#else
 static force_inline __m128i
 create_mask_2x32_128 (uint32_t mask0,
                       uint32_t mask1)
 {
     return _mm_set_epi32 (mask0, mask1, mask0, mask1);
 }
-
-/* SSE2 code patch for fbcompose.c */
-
-static void
-sse2_combine_over_u (pixman_implementation_t *imp,
-                     pixman_op_t              op,
-                     uint32_t *               dst,
-                     const uint32_t *         src,
-                     const uint32_t *         mask,
-                     int                      width)
-{
-    core_combine_over_u_sse2 (dst, src, mask, width);
-    _mm_empty ();
-}
-
-static void
-sse2_combine_over_reverse_u (pixman_implementation_t *imp,
-                             pixman_op_t              op,
-                             uint32_t *               dst,
-                             const uint32_t *         src,
-                             const uint32_t *         mask,
-                             int                      width)
-{
-    core_combine_over_reverse_u_sse2 (dst, src, mask, width);
-    _mm_empty ();
-}
-
-static void
-sse2_combine_in_u (pixman_implementation_t *imp,
-                   pixman_op_t              op,
-                   uint32_t *               dst,
-                   const uint32_t *         src,
-                   const uint32_t *         mask,
-                   int                      width)
-{
-    core_combine_in_u_sse2 (dst, src, mask, width);
-    _mm_empty ();
-}
-
-static void
-sse2_combine_in_reverse_u (pixman_implementation_t *imp,
-                           pixman_op_t              op,
-                           uint32_t *               dst,
-                           const uint32_t *         src,
-                           const uint32_t *         mask,
-                           int                      width)
-{
-    core_combine_reverse_in_u_sse2 (dst, src, mask, width);
-    _mm_empty ();
-}
-
-static void
-sse2_combine_out_u (pixman_implementation_t *imp,
-                    pixman_op_t              op,
-                    uint32_t *               dst,
-                    const uint32_t *         src,
-                    const uint32_t *         mask,
-                    int                      width)
-{
-    core_combine_out_u_sse2 (dst, src, mask, width);
-    _mm_empty ();
-}
-
-static void
-sse2_combine_out_reverse_u (pixman_implementation_t *imp,
-                            pixman_op_t              op,
-                            uint32_t *               dst,
-                            const uint32_t *         src,
-                            const uint32_t *         mask,
-                            int                      width)
-{
-    core_combine_reverse_out_u_sse2 (dst, src, mask, width);
-    _mm_empty ();
-}
-
-static void
-sse2_combine_atop_u (pixman_implementation_t *imp,
-                     pixman_op_t              op,
-                     uint32_t *               dst,
-                     const uint32_t *         src,
-                     const uint32_t *         mask,
-                     int                      width)
-{
-    core_combine_atop_u_sse2 (dst, src, mask, width);
-    _mm_empty ();
-}
-
-static void
-sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
-                             pixman_op_t              op,
-                             uint32_t *               dst,
-                             const uint32_t *         src,
-                             const uint32_t *         mask,
-                             int                      width)
-{
-    core_combine_reverse_atop_u_sse2 (dst, src, mask, width);
-    _mm_empty ();
-}
-
-static void
-sse2_combine_xor_u (pixman_implementation_t *imp,
-                    pixman_op_t              op,
-                    uint32_t *               dst,
-                    const uint32_t *         src,
-                    const uint32_t *         mask,
-                    int                      width)
-{
-    core_combine_xor_u_sse2 (dst, src, mask, width);
-    _mm_empty ();
-}
-
-static void
-sse2_combine_add_u (pixman_implementation_t *imp,
-                    pixman_op_t              op,
-                    uint32_t *               dst,
-                    const uint32_t *         src,
-                    const uint32_t *         mask,
-                    int                      width)
-{
-    core_combine_add_u_sse2 (dst, src, mask, width);
-    _mm_empty ();
-}
-
-static void
-sse2_combine_saturate_u (pixman_implementation_t *imp,
-                         pixman_op_t              op,
-                         uint32_t *               dst,
-                         const uint32_t *         src,
-                         const uint32_t *         mask,
-                         int                      width)
-{
-    core_combine_saturate_u_sse2 (dst, src, mask, width);
-    _mm_empty ();
-}
-
-static void
-sse2_combine_src_ca (pixman_implementation_t *imp,
-                     pixman_op_t              op,
-                     uint32_t *               dst,
-                     const uint32_t *         src,
-                     const uint32_t *         mask,
-                     int                      width)
-{
-    core_combine_src_ca_sse2 (dst, src, mask, width);
-    _mm_empty ();
-}
-
-static void
-sse2_combine_over_ca (pixman_implementation_t *imp,
-                      pixman_op_t              op,
-                      uint32_t *               dst,
-                      const uint32_t *         src,
-                      const uint32_t *         mask,
-                      int                      width)
-{
-    core_combine_over_ca_sse2 (dst, src, mask, width);
-    _mm_empty ();
-}
-
-static void
-sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
-                              pixman_op_t              op,
-                              uint32_t *               dst,
-                              const uint32_t *         src,
-                              const uint32_t *         mask,
-                              int                      width)
-{
-    core_combine_over_reverse_ca_sse2 (dst, src, mask, width);
-    _mm_empty ();
-}
-
-static void
-sse2_combine_in_ca (pixman_implementation_t *imp,
-                    pixman_op_t              op,
-                    uint32_t *               dst,
-                    const uint32_t *         src,
-                    const uint32_t *         mask,
-                    int                      width)
-{
-    core_combine_in_ca_sse2 (dst, src, mask, width);
-    _mm_empty ();
-}
-
-static void
-sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
-                            pixman_op_t              op,
-                            uint32_t *               dst,
-                            const uint32_t *         src,
-                            const uint32_t *         mask,
-                            int                      width)
-{
-    core_combine_in_reverse_ca_sse2 (dst, src, mask, width);
-    _mm_empty ();
-}
-
-static void
-sse2_combine_out_ca (pixman_implementation_t *imp,
-                     pixman_op_t              op,
-                     uint32_t *               dst,
-                     const uint32_t *         src,
-                     const uint32_t *         mask,
-                     int                      width)
-{
-    core_combine_out_ca_sse2 (dst, src, mask, width);
-    _mm_empty ();
-}
-
-static void
-sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
-                             pixman_op_t              op,
-                             uint32_t *               dst,
-                             const uint32_t *         src,
-                             const uint32_t *         mask,
-                             int                      width)
-{
-    core_combine_out_reverse_ca_sse2 (dst, src, mask, width);
-    _mm_empty ();
-}
-
-static void
-sse2_combine_atop_ca (pixman_implementation_t *imp,
-                      pixman_op_t              op,
-                      uint32_t *               dst,
-                      const uint32_t *         src,
-                      const uint32_t *         mask,
-                      int                      width)
-{
-    core_combine_atop_ca_sse2 (dst, src, mask, width);
-    _mm_empty ();
-}
-
-static void
-sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
-                              pixman_op_t              op,
-                              uint32_t *               dst,
-                              const uint32_t *         src,
-                              const uint32_t *         mask,
-                              int                      width)
-{
-    core_combine_reverse_atop_ca_sse2 (dst, src, mask, width);
-    _mm_empty ();
-}
-
-static void
-sse2_combine_xor_ca (pixman_implementation_t *imp,
-                     pixman_op_t              op,
-                     uint32_t *               dst,
-                     const uint32_t *         src,
-                     const uint32_t *         mask,
-                     int                      width)
-{
-    core_combine_xor_ca_sse2 (dst, src, mask, width);
-    _mm_empty ();
-}
-
-static void
-sse2_combine_add_ca (pixman_implementation_t *imp,
-                     pixman_op_t              op,
-                     uint32_t *               dst,
-                     const uint32_t *         src,
-                     const uint32_t *         mask,
-                     int                      width)
-{
-    core_combine_add_ca_sse2 (dst, src, mask, width);
-    _mm_empty ();
-}
-
-/* -------------------------------------------------------------------
- * composite_over_n_8888
- */
+#endif
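
The macro form sidesteps the Sun Studio 12 code generation bug in the inline-function wrapper; both expansions produce the same constant. For example, the mask_ff000000 constant used by the x888 paths below is presumably initialized along these lines:

    /* Four copies of 0xff000000, one per 32-bit lane. */
    mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);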
 
 static void
 sse2_composite_over_n_8888 (pixman_implementation_t *imp,
-                            pixman_op_t              op,
-                            pixman_image_t *         src_image,
-                            pixman_image_t *         mask_image,
-                            pixman_image_t *         dst_image,
-                            int32_t                  src_x,
-                            int32_t                  src_y,
-                            int32_t                  mask_x,
-                            int32_t                  mask_y,
-                            int32_t                  dest_x,
-                            int32_t                  dest_y,
-                            int32_t                  width,
-                            int32_t                  height)
+                            pixman_composite_info_t *info)
 {
+    PIXMAN_COMPOSITE_ARGS (info);
     uint32_t src;
     uint32_t    *dst_line, *dst, d;
-    uint16_t w;
+    int32_t w;
     int dst_stride;
     __m128i xmm_src, xmm_alpha;
     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
 
-    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
 
     if (src == 0)
        return;
 
     PIXMAN_IMAGE_GET_LINE (
-       dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+       dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
 
     xmm_src = expand_pixel_32_1x128 (src);
     xmm_alpha = expand_alpha_1x128 (xmm_src);
@@ -2942,28 +2385,20 @@ sse2_composite_over_n_8888 (pixman_implementation_t *imp,
     {
        dst = dst_line;
 
-       /* call prefetch hint to optimize cache load*/
-       cache_prefetch ((__m128i*)dst);
-
        dst_line += dst_stride;
        w = width;
 
        while (w && (unsigned long)dst & 15)
        {
            d = *dst;
-           *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
-                                             _mm_movepi64_pi64 (xmm_alpha),
-                                             unpack_32_1x64 (d)));
+           *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
+                                               xmm_alpha,
+                                               unpack_32_1x128 (d)));
            w--;
        }
 
-       cache_prefetch ((__m128i*)dst);
-
        while (w >= 4)
        {
-           /* fill cache line with next memory */
-           cache_prefetch_next ((__m128i*)dst);
-
            xmm_dst = load_128_aligned ((__m128i*)dst);
 
            unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
@@ -2983,48 +2418,34 @@ sse2_composite_over_n_8888 (pixman_implementation_t *imp,
        while (w)
        {
            d = *dst;
-           *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
-                                             _mm_movepi64_pi64 (xmm_alpha),
-                                             unpack_32_1x64 (d)));
+           *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
+                                               xmm_alpha,
+                                               unpack_32_1x128 (d)));
            w--;
        }
 
     }
-    _mm_empty ();
 }
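
sse2_composite_over_n_8888 is plain OVER with a premultiplied solid source: each destination channel becomes s + d*~sa, where sa is the source alpha. A scalar sketch of what over_1x128 computes here (illustrative name, mul_un8 as above):

    /* OVER with a premultiplied solid source. */
    static uint32_t
    over_n_scalar (uint32_t src, uint32_t dst)
    {
        uint8_t sa = src >> 24;
        uint32_t res = 0;
        int shift;

        for (shift = 0; shift < 32; shift += 8)
        {
            uint8_t s = (src >> shift) & 0xff;
            uint8_t d = (dst >> shift) & 0xff;
            unsigned v = s + mul_un8 (d, 255 - sa);

            res |= (uint32_t)(v > 255 ? 255 : v) << shift;
        }
        return res;
    }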
 
-/* ---------------------------------------------------------------------
- * composite_over_n_0565
- */
 static void
 sse2_composite_over_n_0565 (pixman_implementation_t *imp,
-                            pixman_op_t              op,
-                            pixman_image_t *         src_image,
-                            pixman_image_t *         mask_image,
-                            pixman_image_t *         dst_image,
-                            int32_t                  src_x,
-                            int32_t                  src_y,
-                            int32_t                  mask_x,
-                            int32_t                  mask_y,
-                            int32_t                  dest_x,
-                            int32_t                  dest_y,
-                            int32_t                  width,
-                            int32_t                  height)
+                            pixman_composite_info_t *info)
 {
+    PIXMAN_COMPOSITE_ARGS (info);
     uint32_t src;
     uint16_t    *dst_line, *dst, d;
-    uint16_t w;
+    int32_t w;
     int dst_stride;
     __m128i xmm_src, xmm_alpha;
     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
 
-    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
 
     if (src == 0)
        return;
 
     PIXMAN_IMAGE_GET_LINE (
-       dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+       dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
 
     xmm_src = expand_pixel_32_1x128 (src);
     xmm_alpha = expand_alpha_1x128 (xmm_src);
@@ -3033,9 +2454,6 @@ sse2_composite_over_n_0565 (pixman_implementation_t *imp,
     {
        dst = dst_line;
 
-       /* call prefetch hint to optimize cache load*/
-       cache_prefetch ((__m128i*)dst);
-
        dst_line += dst_stride;
        w = width;
 
@@ -3044,20 +2462,14 @@ sse2_composite_over_n_0565 (pixman_implementation_t *imp,
            d = *dst;
 
            *dst++ = pack_565_32_16 (
-               pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
-                                        _mm_movepi64_pi64 (xmm_alpha),
-                                        expand565_16_1x64 (d))));
+               pack_1x128_32 (over_1x128 (xmm_src,
+                                          xmm_alpha,
+                                          expand565_16_1x128 (d))));
            w--;
        }
 
-       /* call prefetch hint to optimize cache load*/
-       cache_prefetch ((__m128i*)dst);
-
        while (w >= 8)
        {
-           /* fill cache line with next memory */
-           cache_prefetch_next ((__m128i*)dst);
-
            xmm_dst = load_128_aligned ((__m128i*)dst);
 
            unpack_565_128_4x128 (xmm_dst,
@@ -3083,61 +2495,43 @@ sse2_composite_over_n_0565 (pixman_implementation_t *imp,
        {
            d = *dst;
            *dst++ = pack_565_32_16 (
-               pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
-                                        _mm_movepi64_pi64 (xmm_alpha),
-                                        expand565_16_1x64 (d))));
+               pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha,
+                                          expand565_16_1x128 (d))));
        }
     }
 
-    _mm_empty ();
 }
 
-/* ------------------------------
- * composite_add_n_8888_8888_ca
- */
 static void
 sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
-                                  pixman_op_t              op,
-                                  pixman_image_t *         src_image,
-                                  pixman_image_t *         mask_image,
-                                  pixman_image_t *         dst_image,
-                                  int32_t                  src_x,
-                                  int32_t                  src_y,
-                                  int32_t                  mask_x,
-                                  int32_t                  mask_y,
-                                  int32_t                  dest_x,
-                                  int32_t                  dest_y,
-                                  int32_t                  width,
-                                  int32_t                  height)
+                                  pixman_composite_info_t *info)
 {
-    uint32_t src, srca;
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src;
     uint32_t    *dst_line, d;
     uint32_t    *mask_line, m;
     uint32_t pack_cmp;
     int dst_stride, mask_stride;
 
-    __m128i xmm_src, xmm_alpha;
+    __m128i xmm_src;
     __m128i xmm_dst;
     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
 
-    __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+    __m128i mmx_src, mmx_mask, mmx_dest;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
 
-    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
-    srca = src >> 24;
-    
     if (src == 0)
        return;
 
     PIXMAN_IMAGE_GET_LINE (
-       dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+       dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
     PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
 
     xmm_src = _mm_unpacklo_epi8 (
        create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
-    xmm_alpha = expand_alpha_1x128 (xmm_src);
-    mmx_src   = _mm_movepi64_pi64 (xmm_src);
-    mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
+    mmx_src   = xmm_src;
 
     while (height--)
     {
@@ -3148,10 +2542,6 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
        dst_line += dst_stride;
        mask_line += mask_stride;
 
-       /* call prefetch hint to optimize cache load*/
-       cache_prefetch ((__m128i*)pd);
-       cache_prefetch ((__m128i*)pm);
-
        while (w && (unsigned long)pd & 15)
        {
            m = *pm++;
@@ -3159,28 +2549,21 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
            if (m)
            {
                d = *pd;
-               
-               mmx_mask = unpack_32_1x64 (m);
-               mmx_dest = unpack_32_1x64 (d);
 
-               *pd = pack_1x64_32 (
-                   _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
+               mmx_mask = unpack_32_1x128 (m);
+               mmx_dest = unpack_32_1x128 (d);
+
+               *pd = pack_1x128_32 (
+                   _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
+                                  mmx_dest));
            }
 
            pd++;
            w--;
        }
 
-       /* call prefetch hint to optimize cache load*/
-       cache_prefetch ((__m128i*)pd);
-       cache_prefetch ((__m128i*)pm);
-
        while (w >= 4)
        {
-           /* fill cache line with next memory */
-           cache_prefetch_next ((__m128i*)pd);
-           cache_prefetch_next ((__m128i*)pm);
-
            xmm_mask = load_128_unaligned ((__m128i*)pm);
 
            pack_cmp =
@@ -3198,7 +2581,7 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
                                    &xmm_mask_lo, &xmm_mask_hi,
                                    &xmm_mask_lo, &xmm_mask_hi);
                xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
-               
+
                save_128_aligned (
                    (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
            }
@@ -3215,12 +2598,13 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
            if (m)
            {
                d = *pd;
-               
-               mmx_mask = unpack_32_1x64 (m);
-               mmx_dest = unpack_32_1x64 (d);
 
-               *pd = pack_1x64_32 (
-                   _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
+               mmx_mask = unpack_32_1x128 (m);
+               mmx_dest = unpack_32_1x128 (d);
+
+               *pd = pack_1x128_32 (
+                   _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
+                                  mmx_dest));
            }
 
            pd++;
@@ -3228,28 +2612,13 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
        }
     }
 
-    _mm_empty ();
 }
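
The pack_cmp test in the 4-pixel body (its right-hand side falls just outside this hunk) is the usual skip-transparent trick: compare the four mask pixels against zero, take the byte movemask, and only blend when at least one mask pixel is non-zero. A sketch of the pattern, assuming that is what the elided line computes:

    __m128i zero = _mm_setzero_si128 ();

    /* 0xffff means every byte of xmm_mask compared equal to zero,
     * i.e. all four mask pixels are fully transparent. */
    pack_cmp = _mm_movemask_epi8 (_mm_cmpeq_epi32 (xmm_mask, zero));

    if (pack_cmp != 0xffff)
    {
        /* at least one mask pixel contributes: do the real blend */
    }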
 
-/* ---------------------------------------------------------------------------
- * composite_over_n_8888_8888_ca
- */
-
 static void
 sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
-                                    pixman_op_t              op,
-                                    pixman_image_t *         src_image,
-                                    pixman_image_t *         mask_image,
-                                    pixman_image_t *         dst_image,
-                                    int32_t                  src_x,
-                                    int32_t                  src_y,
-                                    int32_t                  mask_x,
-                                    int32_t                  mask_y,
-                                    int32_t                  dest_x,
-                                    int32_t                  dest_y,
-                                    int32_t                  width,
-                                    int32_t                  height)
+                                    pixman_composite_info_t *info)
 {
+    PIXMAN_COMPOSITE_ARGS (info);
     uint32_t src;
     uint32_t    *dst_line, d;
     uint32_t    *mask_line, m;
@@ -3260,23 +2629,23 @@ sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
 
-    __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
 
-    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
 
     if (src == 0)
        return;
 
     PIXMAN_IMAGE_GET_LINE (
-       dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+       dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
     PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
 
     xmm_src = _mm_unpacklo_epi8 (
        create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
     xmm_alpha = expand_alpha_1x128 (xmm_src);
-    mmx_src   = _mm_movepi64_pi64 (xmm_src);
-    mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
+    mmx_src   = xmm_src;
+    mmx_alpha = xmm_alpha;
 
     while (height--)
     {
@@ -3287,10 +2656,6 @@ sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
        dst_line += dst_stride;
        mask_line += mask_stride;
 
-       /* call prefetch hint to optimize cache load*/
-       cache_prefetch ((__m128i*)pd);
-       cache_prefetch ((__m128i*)pm);
-
        while (w && (unsigned long)pd & 15)
        {
            m = *pm++;
@@ -3298,10 +2663,10 @@ sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
            if (m)
            {
                d = *pd;
-               mmx_mask = unpack_32_1x64 (m);
-               mmx_dest = unpack_32_1x64 (d);
+               mmx_mask = unpack_32_1x128 (m);
+               mmx_dest = unpack_32_1x128 (d);
 
-               *pd = pack_1x64_32 (in_over_1x64 (&mmx_src,
+               *pd = pack_1x128_32 (in_over_1x128 (&mmx_src,
                                                  &mmx_alpha,
                                                  &mmx_mask,
                                                  &mmx_dest));
@@ -3311,16 +2676,8 @@ sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
            w--;
        }
 
-       /* call prefetch hint to optimize cache load*/
-       cache_prefetch ((__m128i*)pd);
-       cache_prefetch ((__m128i*)pm);
-
        while (w >= 4)
        {
-           /* fill cache line with next memory */
-           cache_prefetch_next ((__m128i*)pd);
-           cache_prefetch_next ((__m128i*)pm);
-
            xmm_mask = load_128_unaligned ((__m128i*)pm);
 
            pack_cmp =
@@ -3356,11 +2713,11 @@ sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
            if (m)
            {
                d = *pd;
-               mmx_mask = unpack_32_1x64 (m);
-               mmx_dest = unpack_32_1x64 (d);
+               mmx_mask = unpack_32_1x128 (m);
+               mmx_dest = unpack_32_1x128 (d);
 
-               *pd = pack_1x64_32 (
-                   in_over_1x64 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
+               *pd = pack_1x128_32 (
+                   in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
            }
 
            pd++;
@@ -3368,32 +2725,17 @@ sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
        }
     }
 
-    _mm_empty ();
 }
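
in_over_1x128 composes IN followed by OVER: the solid source and its alpha are first multiplied by the per-channel mask, then OVER'd onto the destination, so each channel becomes s*m + d*~(sa*m). A scalar sketch (illustrative name, mul_un8 as above):

    /* (solid src IN mask) OVER dst, per channel. */
    static uint32_t
    in_over_scalar (uint32_t src, uint32_t mask, uint32_t dst)
    {
        uint8_t sa = src >> 24;
        uint32_t res = 0;
        int shift;

        for (shift = 0; shift < 32; shift += 8)
        {
            uint8_t s = (src >> shift) & 0xff;
            uint8_t m = (mask >> shift) & 0xff;
            uint8_t d = (dst >> shift) & 0xff;
            unsigned v = mul_un8 (s, m) + mul_un8 (d, 255 - mul_un8 (sa, m));

            res |= (uint32_t)(v > 255 ? 255 : v) << shift;
        }
        return res;
    }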
 
-/*---------------------------------------------------------------------
- * composite_over_8888_n_8888
- */
-
 static void
 sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
-                                 pixman_op_t              op,
-                                 pixman_image_t *         src_image,
-                                 pixman_image_t *         mask_image,
-                                 pixman_image_t *         dst_image,
-                                 int32_t                  src_x,
-                                 int32_t                  src_y,
-                                 int32_t                  mask_x,
-                                 int32_t                  mask_y,
-                                 int32_t                  dest_x,
-                                 int32_t                  dest_y,
-                                 int32_t                  width,
-                                 int32_t                  height)
+                                 pixman_composite_info_t *info)
 {
+    PIXMAN_COMPOSITE_ARGS (info);
     uint32_t    *dst_line, *dst;
     uint32_t    *src_line, *src;
     uint32_t mask;
-    uint16_t w;
+    int32_t w;
     int dst_stride, src_stride;
 
     __m128i xmm_mask;
@@ -3402,11 +2744,11 @@ sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
     __m128i xmm_alpha_lo, xmm_alpha_hi;
 
     PIXMAN_IMAGE_GET_LINE (
-       dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+       dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
     PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
 
-    mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
+    mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
 
     xmm_mask = create_mask_16_128 (mask >> 24);
 
@@ -3418,111 +2760,156 @@ sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
        src_line += src_stride;
        w = width;
 
-       /* call prefetch hint to optimize cache load*/
-       cache_prefetch ((__m128i*)dst);
-       cache_prefetch ((__m128i*)src);
-
        while (w && (unsigned long)dst & 15)
        {
            uint32_t s = *src++;
-           uint32_t d = *dst;
-
-           __m64 ms = unpack_32_1x64 (s);
-           __m64 alpha    = expand_alpha_1x64 (ms);
-           __m64 dest     = _mm_movepi64_pi64 (xmm_mask);
-           __m64 alpha_dst = unpack_32_1x64 (d);
-
-           *dst++ = pack_1x64_32 (
-               in_over_1x64 (&ms, &alpha, &dest, &alpha_dst));
 
+           if (s)
+           {
+               uint32_t d = *dst;
+               
+               __m128i ms = unpack_32_1x128 (s);
+               __m128i alpha    = expand_alpha_1x128 (ms);
+               __m128i dest     = xmm_mask;
+               __m128i alpha_dst = unpack_32_1x128 (d);
+               
+               *dst = pack_1x128_32 (
+                   in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
+           }
+           dst++;
            w--;
        }
 
-       /* call prefetch hint to optimize cache load*/
-       cache_prefetch ((__m128i*)dst);
-       cache_prefetch ((__m128i*)src);
-
        while (w >= 4)
        {
-           /* fill cache line with next memory */
-           cache_prefetch_next ((__m128i*)dst);
-           cache_prefetch_next ((__m128i*)src);
-
            xmm_src = load_128_unaligned ((__m128i*)src);
-           xmm_dst = load_128_aligned ((__m128i*)dst);
-
-           unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
-           unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-           expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
-                               &xmm_alpha_lo, &xmm_alpha_hi);
-
-           in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
-                          &xmm_alpha_lo, &xmm_alpha_hi,
-                          &xmm_mask, &xmm_mask,
-                          &xmm_dst_lo, &xmm_dst_hi);
-
-           save_128_aligned (
-               (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
 
-           dst += 4;
-           src += 4;
-           w -= 4;
-       }
+           if (!is_zero (xmm_src))
+           {
+               xmm_dst = load_128_aligned ((__m128i*)dst);
+               
+               unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+               unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+               expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+                                   &xmm_alpha_lo, &xmm_alpha_hi);
+               
+               in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
+                              &xmm_alpha_lo, &xmm_alpha_hi,
+                              &xmm_mask, &xmm_mask,
+                              &xmm_dst_lo, &xmm_dst_hi);
+               
+               save_128_aligned (
+                   (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+           }
+               
+           dst += 4;
+           src += 4;
+           w -= 4;
+       }
 
        while (w)
        {
            uint32_t s = *src++;
-           uint32_t d = *dst;
 
-           __m64 ms = unpack_32_1x64 (s);
-           __m64 alpha = expand_alpha_1x64 (ms);
-           __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
-           __m64 dest  = unpack_32_1x64 (d);
+           if (s)
+           {
+               uint32_t d = *dst;
+               
+               __m128i ms = unpack_32_1x128 (s);
+               __m128i alpha = expand_alpha_1x128 (ms);
+               __m128i mask  = xmm_mask;
+               __m128i dest  = unpack_32_1x128 (d);
+               
+               *dst = pack_1x128_32 (
+                   in_over_1x128 (&ms, &alpha, &mask, &dest));
+           }
+
+           dst++;
+           w--;
+       }
+    }
+
+}
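
Besides dropping the prefetches, this hunk adds per-pixel and per-group early-outs for fully transparent source. The is_zero predicate is presumably defined earlier in this file; the usual SSE2 formulation looks like this (a sketch, not quoted from the patch):

    static force_inline int
    is_zero (__m128i v)
    {
        /* all 16 bytes are zero <=> the byte-compare movemask is all ones */
        return _mm_movemask_epi8 (
            _mm_cmpeq_epi8 (v, _mm_setzero_si128 ())) == 0xffff;
    }

Skipping all-zero groups pays off when large parts of the source are fully transparent, at the cost of one extra test per four pixels.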
+
+static void
+sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
+                             pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t    *dst_line, *dst;
+    uint32_t    *src_line, *src;
+    int32_t w;
+    int dst_stride, src_stride;
+
+    PIXMAN_IMAGE_GET_LINE (
+       dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       src = src_line;
+       src_line += src_stride;
+       w = width;
+
+       while (w && (unsigned long)dst & 15)
+       {
+           *dst++ = *src++ | 0xff000000;
+           w--;
+       }
 
-           *dst++ = pack_1x64_32 (
-               in_over_1x64 (&ms, &alpha, &mask, &dest));
+       while (w >= 16)
+       {
+           __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
+           
+           xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
+           xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
+           xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
+           xmm_src4 = load_128_unaligned ((__m128i*)src + 3);
+           
+           save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000));
+           save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000));
+           save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000));
+           save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000));
+           
+           dst += 16;
+           src += 16;
+           w -= 16;
+       }
 
+       while (w)
+       {
+           *dst++ = *src++ | 0xff000000;
            w--;
        }
     }
 
-    _mm_empty ();
 }
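
The new sse2_composite_src_x888_8888 fast path is a pure format conversion: per pixel it is just *dst = *src | 0xff000000, forcing the unused x byte of an x8r8g8b8 pixel to opaque alpha. The 16-pixel body issues four 16-byte loads and four 16-byte stores per iteration, one full 64-byte cache line of output. A minimal scalar equivalent:

    /* x8r8g8b8 -> a8r8g8b8: the x byte becomes fully opaque alpha. */
    static void
    src_x888_8888_scalar (uint32_t *dst, const uint32_t *src, int w)
    {
        while (w--)
            *dst++ = *src++ | 0xff000000;
    }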
 
-/* ---------------------------------------------------------------------
- * composite_over_x888_n_8888
- */
 static void
 sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
-                                 pixman_op_t              op,
-                                 pixman_image_t *         src_image,
-                                 pixman_image_t *         mask_image,
-                                 pixman_image_t *         dst_image,
-                                 int32_t                  src_x,
-                                 int32_t                  src_y,
-                                 int32_t                  mask_x,
-                                 int32_t                  mask_y,
-                                 int32_t                  dest_x,
-                                 int32_t                  dest_y,
-                                 int32_t                  width,
-                                 int32_t                  height)
+                                 pixman_composite_info_t *info)
 {
+    PIXMAN_COMPOSITE_ARGS (info);
     uint32_t    *dst_line, *dst;
     uint32_t    *src_line, *src;
     uint32_t mask;
     int dst_stride, src_stride;
-    uint16_t w;
+    int32_t w;
 
     __m128i xmm_mask, xmm_alpha;
     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
 
     PIXMAN_IMAGE_GET_LINE (
-       dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+       dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
     PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
 
-    mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
+    mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
 
     xmm_mask = create_mask_16_128 (mask >> 24);
     xmm_alpha = mask_00ff;
@@ -3535,36 +2922,24 @@ sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
        src_line += src_stride;
        w = width;
 
-       /* call prefetch hint to optimize cache load*/
-       cache_prefetch ((__m128i*)dst);
-       cache_prefetch ((__m128i*)src);
-
        while (w && (unsigned long)dst & 15)
        {
            uint32_t s = (*src++) | 0xff000000;
            uint32_t d = *dst;
 
-           __m64 src   = unpack_32_1x64 (s);
-           __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
-           __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
-           __m64 dest  = unpack_32_1x64 (d);
+           __m128i src   = unpack_32_1x128 (s);
+           __m128i alpha = xmm_alpha;
+           __m128i mask  = xmm_mask;
+           __m128i dest  = unpack_32_1x128 (d);
 
-           *dst++ = pack_1x64_32 (
-               in_over_1x64 (&src, &alpha, &mask, &dest));
+           *dst++ = pack_1x128_32 (
+               in_over_1x128 (&src, &alpha, &mask, &dest));
 
            w--;
        }
 
-       /* call prefetch hint to optimize cache load*/
-       cache_prefetch ((__m128i*)dst);
-       cache_prefetch ((__m128i*)src);
-
        while (w >= 4)
        {
-           /* fill cache line with next memory */
-           cache_prefetch_next ((__m128i*)dst);
-           cache_prefetch_next ((__m128i*)src);
-
            xmm_src = _mm_or_si128 (
                load_128_unaligned ((__m128i*)src), mask_ff000000);
            xmm_dst = load_128_aligned ((__m128i*)dst);
@@ -3591,45 +2966,31 @@ sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
            uint32_t s = (*src++) | 0xff000000;
            uint32_t d = *dst;
 
-           __m64 src  = unpack_32_1x64 (s);
-           __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
-           __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
-           __m64 dest  = unpack_32_1x64 (d);
+           __m128i src  = unpack_32_1x128 (s);
+           __m128i alpha = xmm_alpha;
+           __m128i mask  = xmm_mask;
+           __m128i dest  = unpack_32_1x128 (d);
 
-           *dst++ = pack_1x64_32 (
-               in_over_1x64 (&src, &alpha, &mask, &dest));
+           *dst++ = pack_1x128_32 (
+               in_over_1x128 (&src, &alpha, &mask, &dest));
 
            w--;
        }
     }
 
-    _mm_empty ();
 }
 
-/* --------------------------------------------------------------------
- * composite_over_8888_8888
- */
 static void
 sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
-                               pixman_op_t              op,
-                               pixman_image_t *         src_image,
-                               pixman_image_t *         mask_image,
-                               pixman_image_t *         dst_image,
-                               int32_t                  src_x,
-                               int32_t                  src_y,
-                               int32_t                  mask_x,
-                               int32_t                  mask_y,
-                               int32_t                  dest_x,
-                               int32_t                  dest_y,
-                               int32_t                  width,
-                               int32_t                  height)
+                               pixman_composite_info_t *info)
 {
+    PIXMAN_COMPOSITE_ARGS (info);
     int dst_stride, src_stride;
     uint32_t    *dst_line, *dst;
     uint32_t    *src_line, *src;
 
     PIXMAN_IMAGE_GET_LINE (
-       dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+       dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
     PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
 
@@ -3638,76 +2999,49 @@ sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
 
     while (height--)
     {
-       core_combine_over_u_sse2 (dst, src, NULL, width);
+       sse2_combine_over_u (imp, op, dst, src, NULL, width);
 
        dst += dst_stride;
        src += src_stride;
     }
-    _mm_empty ();
 }
 
-/* ------------------------------------------------------------------
- * composite_over_8888_0565
- */
 static force_inline uint16_t
 composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
 {
-    __m64 ms;
+    __m128i ms;
 
-    ms = unpack_32_1x64 (src);
+    ms = unpack_32_1x128 (src);
     return pack_565_32_16 (
-       pack_1x64_32 (
-           over_1x64 (
-               ms, expand_alpha_1x64 (ms), expand565_16_1x64 (dst))));
+       pack_1x128_32 (
+           over_1x128 (
+               ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst))));
 }
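
composite_over_8888_0565pixel round-trips the 16-bit destination through 32 bits: widen r5g6b5 to 8888, apply OVER, and narrow back. One common scalar formulation of the two conversions (a sketch; the SSE2 helpers use shift-and-mask equivalents, with the mask_565_fix_* constants handling the bit replication):

    /* r5g6b5 -> x8r8g8b8, replicating the top bits into the low bits
     * so that 0x1f widens to 0xff rather than 0xf8. */
    static uint32_t
    expand565 (uint16_t p)
    {
        uint32_t r = (p & 0xf800) >> 8;
        uint32_t g = (p & 0x07e0) >> 3;
        uint32_t b = (p & 0x001f) << 3;

        r |= r >> 5;
        g |= g >> 6;
        b |= b >> 5;

        return (r << 16) | (g << 8) | b;
    }

    /* x8r8g8b8 -> r5g6b5, keeping the top bits of each channel. */
    static uint16_t
    pack565 (uint32_t p)
    {
        return ((p >> 8) & 0xf800) | ((p >> 5) & 0x07e0) | ((p >> 3) & 0x001f);
    }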
 
 static void
 sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
-                               pixman_op_t              op,
-                               pixman_image_t *         src_image,
-                               pixman_image_t *         mask_image,
-                               pixman_image_t *         dst_image,
-                               int32_t                  src_x,
-                               int32_t                  src_y,
-                               int32_t                  mask_x,
-                               int32_t                  mask_y,
-                               int32_t                  dest_x,
-                               int32_t                  dest_y,
-                               int32_t                  width,
-                               int32_t                  height)
+                               pixman_composite_info_t *info)
 {
+    PIXMAN_COMPOSITE_ARGS (info);
     uint16_t    *dst_line, *dst, d;
     uint32_t    *src_line, *src, s;
     int dst_stride, src_stride;
-    uint16_t w;
+    int32_t w;
 
     __m128i xmm_alpha_lo, xmm_alpha_hi;
     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
 
     PIXMAN_IMAGE_GET_LINE (
-       dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+       dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
     PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
 
-#if 0
-    /* FIXME
-     *
-     * I copy the code from MMX one and keep the fixme.
-     * If it's a problem there, probably is a problem here.
-     */
-    assert (src_image->drawable == mask_image->drawable);
-#endif
-
     while (height--)
     {
        dst = dst_line;
        src = src_line;
 
-       /* call prefetch hint to optimize cache load*/
-       cache_prefetch ((__m128i*)src);
-       cache_prefetch ((__m128i*)dst);
-
        dst_line += dst_stride;
        src_line += src_stride;
        w = width;
@@ -3723,17 +3057,9 @@ sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
            w--;
        }
 
-       /* call prefetch hint to optimize cache load*/
-       cache_prefetch ((__m128i*)src);
-       cache_prefetch ((__m128i*)dst);
-
        /* It's an 8-pixel loop */
        while (w >= 8)
        {
-           /* fill cache line with next memory */
-           cache_prefetch_next ((__m128i*)src);
-           cache_prefetch_next ((__m128i*)dst);
-
            /* I'm loading unaligned because I'm not sure
             * about the address alignment.
             */
@@ -3783,57 +3109,42 @@ sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
        }
     }
 
-    _mm_empty ();
 }
 
-/* -----------------------------------------------------------------
- * composite_over_n_8_8888
- */
-
 static void
 sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
-                              pixman_op_t              op,
-                              pixman_image_t *         src_image,
-                              pixman_image_t *         mask_image,
-                              pixman_image_t *         dst_image,
-                              int32_t                  src_x,
-                              int32_t                  src_y,
-                              int32_t                  mask_x,
-                              int32_t                  mask_y,
-                              int32_t                  dest_x,
-                              int32_t                  dest_y,
-                              int32_t                  width,
-                              int32_t                  height)
+                              pixman_composite_info_t *info)
 {
+    PIXMAN_COMPOSITE_ARGS (info);
     uint32_t src, srca;
     uint32_t *dst_line, *dst;
     uint8_t *mask_line, *mask;
     int dst_stride, mask_stride;
-    uint16_t w;
+    int32_t w;
     uint32_t m, d;
 
     __m128i xmm_src, xmm_alpha, xmm_def;
     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
 
-    __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
 
-    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
 
     srca = src >> 24;
     if (src == 0)
        return;
 
     PIXMAN_IMAGE_GET_LINE (
-       dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+       dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
     PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
 
     xmm_def = create_mask_2x32_128 (src, src);
     xmm_src = expand_pixel_32_1x128 (src);
     xmm_alpha = expand_alpha_1x128 (xmm_src);
-    mmx_src   = _mm_movepi64_pi64 (xmm_src);
-    mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
+    mmx_src   = xmm_src;
+    mmx_alpha = xmm_alpha;
 
     while (height--)
     {
@@ -3843,10 +3154,6 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
        mask_line += mask_stride;
        w = width;
 
-       /* call prefetch hint to optimize cache load*/
-       cache_prefetch ((__m128i*)mask);
-       cache_prefetch ((__m128i*)dst);
-
        while (w && (unsigned long)dst & 15)
        {
            uint8_t m = *mask++;
@@ -3854,10 +3161,10 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
            if (m)
            {
                d = *dst;
-               mmx_mask = expand_pixel_8_1x64 (m);
-               mmx_dest = unpack_32_1x64 (d);
+               mmx_mask = expand_pixel_8_1x128 (m);
+               mmx_dest = unpack_32_1x128 (d);
 
-               *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
+               *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
                                                   &mmx_alpha,
                                                   &mmx_mask,
                                                   &mmx_dest));
@@ -3867,16 +3174,8 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
            dst++;
        }
 
-       /* call prefetch hint to optimize cache load*/
-       cache_prefetch ((__m128i*)mask);
-       cache_prefetch ((__m128i*)dst);
-
        while (w >= 4)
        {
-           /* fill cache line with next memory */
-           cache_prefetch_next ((__m128i*)mask);
-           cache_prefetch_next ((__m128i*)dst);
-
            m = *((uint32_t*)mask);
 
            if (srca == 0xff && m == 0xffffffff)
@@ -3917,10 +3216,10 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
            if (m)
            {
                d = *dst;
-               mmx_mask = expand_pixel_8_1x64 (m);
-               mmx_dest = unpack_32_1x64 (d);
+               mmx_mask = expand_pixel_8_1x128 (m);
+               mmx_dest = unpack_32_1x128 (d);
 
-               *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
+               *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
                                                   &mmx_alpha,
                                                   &mmx_mask,
                                                   &mmx_dest));
@@ -3931,14 +3230,9 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
        }
     }
 
-    _mm_empty ();
 }
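
In this a8-mask path the 4-pixel body fetches four mask bytes with one 32-bit load (m = *((uint32_t*)mask)) and stores xmm_def directly when the solid is opaque and m == 0xffffffff; the scalar head and tail skip m == 0 pixels outright. For a single pixel, expand_pixel_8_1x128 broadcasts the mask byte across all four channels, so an a8 value acts as a uniform component-alpha mask. Conceptually (reusing in_over_scalar from the earlier sketch):

    /* Per-pixel view of the a8 mask path: replicate the mask byte
     * across channels, then apply the IN-then-OVER math. */
    static uint32_t
    over_n_8_scalar (uint32_t src, uint8_t m, uint32_t dst)
    {
        return in_over_scalar (src, (uint32_t)m * 0x01010101u, dst);
    }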
 
-/* ----------------------------------------------------------------
- * composite_over_n_8_8888
- */
-
-pixman_bool_t
+static pixman_bool_t
 pixman_fill_sse2 (uint32_t *bits,
                   int       stride,
                   int       bpp,
@@ -3953,28 +3247,41 @@ pixman_fill_sse2 (uint32_t *bits,
 
     __m128i xmm_def;
 
-    if (bpp == 16 && (data >> 16 != (data & 0xffff)))
-       return FALSE;
+    if (bpp == 8)
+    {
+       uint8_t b;
+       uint16_t w;
 
-    if (bpp != 16 && bpp != 32)
-       return FALSE;
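+       /* stride is in uint32_t units; the "/ 1" and "*= 1" below keep the
+        * 8 bpp case parallel with the 16 and 32 bpp branches */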
+       stride = stride * (int) sizeof (uint32_t) / 1;
+       byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
+       byte_width = width;
+       stride *= 1;
 
-    if (bpp == 16)
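+       /* replicate the 8-bit fill value into all four bytes of data */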
+       b = data & 0xff;
+       w = (b << 8) | b;
+       data = (w << 16) | w;
+    }
+    else if (bpp == 16)
     {
        stride = stride * (int) sizeof (uint32_t) / 2;
        byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
        byte_width = 2 * width;
        stride *= 2;
+
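+        /* duplicate the 16-bit fill value into both halves of data */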
+        data = (data & 0xffff) * 0x00010001;
     }
-    else
+    else if (bpp == 32)
     {
        stride = stride * (int) sizeof (uint32_t) / 4;
        byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
        byte_width = 4 * width;
        stride *= 4;
     }
+    else
+    {
+       return FALSE;
+    }
 
-    cache_prefetch ((__m128i*)byte_line);
     xmm_def = create_mask_2x32_128 (data, data);
 
     while (height--)
@@ -3984,8 +3291,12 @@ pixman_fill_sse2 (uint32_t *bits,
        byte_line += stride;
        w = byte_width;
 
-
-       cache_prefetch_next ((__m128i*)d);
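+       /* for 8 bpp fills, write a single byte to reach 2-byte alignment */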
+       if (w >= 1 && ((unsigned long)d & 1))
+       {
+           *(uint8_t *)d = data;
+           w -= 1;
+           d += 1;
+       }
 
        while (w >= 2 && ((unsigned long)d & 3))
        {
@@ -4002,12 +3313,8 @@ pixman_fill_sse2 (uint32_t *bits,
            d += 4;
        }
 
-       cache_prefetch_next ((__m128i*)d);
-
        while (w >= 128)
        {
-           cache_prefetch (((__m128i*)d) + 12);
-
            save_128_aligned ((__m128i*)(d),     xmm_def);
            save_128_aligned ((__m128i*)(d + 16),  xmm_def);
            save_128_aligned ((__m128i*)(d + 32),  xmm_def);
@@ -4023,8 +3330,6 @@ pixman_fill_sse2 (uint32_t *bits,
 
        if (w >= 64)
        {
-           cache_prefetch (((__m128i*)d) + 8);
-
            save_128_aligned ((__m128i*)(d),     xmm_def);
            save_128_aligned ((__m128i*)(d + 16),  xmm_def);
            save_128_aligned ((__m128i*)(d + 32),  xmm_def);
@@ -4034,8 +3339,6 @@ pixman_fill_sse2 (uint32_t *bits,
            w -= 64;
        }
 
-       cache_prefetch_next ((__m128i*)d);
-
        if (w >= 32)
        {
            save_128_aligned ((__m128i*)(d),     xmm_def);
@@ -4053,8 +3356,6 @@ pixman_fill_sse2 (uint32_t *bits,
            w -= 16;
        }
 
-       cache_prefetch_next ((__m128i*)d);
-
        while (w >= 4)
        {
            *(uint32_t *)d = data;
@@ -4069,50 +3370,46 @@ pixman_fill_sse2 (uint32_t *bits,
            w -= 2;
            d += 2;
        }
+
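+       /* for 8 bpp fills, write any final odd byte */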
+       if (w >= 1)
+       {
+           *(uint8_t *)d = data;
+           w -= 1;
+           d += 1;
+       }
     }
 
-    _mm_empty ();
     return TRUE;
 }
 
 static void
 sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
-                             pixman_op_t              op,
-                             pixman_image_t *         src_image,
-                             pixman_image_t *         mask_image,
-                             pixman_image_t *         dst_image,
-                             int32_t                  src_x,
-                             int32_t                  src_y,
-                             int32_t                  mask_x,
-                             int32_t                  mask_y,
-                             int32_t                  dest_x,
-                             int32_t                  dest_y,
-                             int32_t                  width,
-                             int32_t                  height)
+                             pixman_composite_info_t *info)
 {
+    PIXMAN_COMPOSITE_ARGS (info);
     uint32_t src, srca;
     uint32_t    *dst_line, *dst;
     uint8_t     *mask_line, *mask;
     int dst_stride, mask_stride;
-    uint16_t w;
+    int32_t w;
     uint32_t m;
 
     __m128i xmm_src, xmm_def;
     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
 
-    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
 
     srca = src >> 24;
     if (src == 0)
     {
-       pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride,
-                         PIXMAN_FORMAT_BPP (dst_image->bits.format),
+       pixman_fill_sse2 (dest_image->bits.bits, dest_image->bits.rowstride,
+                         PIXMAN_FORMAT_BPP (dest_image->bits.format),
                          dest_x, dest_y, width, height, 0);
        return;
     }
 
     PIXMAN_IMAGE_GET_LINE (
-       dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+       dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
     PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
 
@@ -4127,19 +3424,14 @@ sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
        mask_line += mask_stride;
        w = width;
 
-       /* call prefetch hint to optimize cache load*/
-       cache_prefetch ((__m128i*)mask);
-       cache_prefetch ((__m128i*)dst);
-
        while (w && (unsigned long)dst & 15)
        {
            uint8_t m = *mask++;
 
            if (m)
            {
-               *dst = pack_1x64_32 (
-                   pix_multiply_1x64 (
-                       _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
+               *dst = pack_1x128_32 (
+                   pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)));
            }
            else
            {
@@ -4150,16 +3442,8 @@ sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
            dst++;
        }
 
-       /* call prefetch hint to optimize cache load*/
-       cache_prefetch ((__m128i*)mask);
-       cache_prefetch ((__m128i*)dst);
-
        while (w >= 4)
        {
-           /* fill cache line with next memory */
-           cache_prefetch_next ((__m128i*)mask);
-           cache_prefetch_next ((__m128i*)dst);
-
            m = *((uint32_t*)mask);
 
            if (srca == 0xff && m == 0xffffffff)
@@ -4200,9 +3484,9 @@ sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
 
            if (m)
            {
-               *dst = pack_1x64_32 (
-                   pix_multiply_1x64 (
-                       _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
+               *dst = pack_1x128_32 (
+                   pix_multiply_1x128 (
+                       xmm_src, expand_pixel_8_1x128 (m)));
            }
            else
            {
@@ -4214,55 +3498,39 @@ sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
        }
     }
 
-    _mm_empty ();
 }
 
-/*-----------------------------------------------------------------------
- * composite_over_n_8_0565
- */
-
 static void
 sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
-                              pixman_op_t              op,
-                              pixman_image_t *         src_image,
-                              pixman_image_t *         mask_image,
-                              pixman_image_t *         dst_image,
-                              int32_t                  src_x,
-                              int32_t                  src_y,
-                              int32_t                  mask_x,
-                              int32_t                  mask_y,
-                              int32_t                  dest_x,
-                              int32_t                  dest_y,
-                              int32_t                  width,
-                              int32_t                  height)
+                              pixman_composite_info_t *info)
 {
-    uint32_t src, srca;
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src;
     uint16_t    *dst_line, *dst, d;
     uint8_t     *mask_line, *mask;
     int dst_stride, mask_stride;
-    uint16_t w;
+    int32_t w;
     uint32_t m;
-    __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
 
     __m128i xmm_src, xmm_alpha;
     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
 
-    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
 
-    srca = src >> 24;
     if (src == 0)
        return;
 
     PIXMAN_IMAGE_GET_LINE (
-       dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+       dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
     PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
 
     xmm_src = expand_pixel_32_1x128 (src);
     xmm_alpha = expand_alpha_1x128 (xmm_src);
-    mmx_src = _mm_movepi64_pi64 (xmm_src);
-    mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
+    mmx_src = xmm_src;
+    mmx_alpha = xmm_alpha;
 
     while (height--)
     {
@@ -4272,10 +3540,6 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
        mask_line += mask_stride;
        w = width;
 
-       /* call prefetch hint to optimize cache load*/
-       cache_prefetch ((__m128i*)mask);
-       cache_prefetch ((__m128i*)dst);
-
        while (w && (unsigned long)dst & 15)
        {
            m = *mask++;
@@ -4283,12 +3547,12 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
            if (m)
            {
                d = *dst;
-               mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
-               mmx_dest = expand565_16_1x64 (d);
+               mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
+               mmx_dest = expand565_16_1x128 (d);
 
                *dst = pack_565_32_16 (
-                   pack_1x64_32 (
-                       in_over_1x64 (
+                   pack_1x128_32 (
+                       in_over_1x128 (
                            &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
            }
 
@@ -4296,16 +3560,8 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
            dst++;
        }
 
-       /* call prefetch hint to optimize cache load*/
-       cache_prefetch ((__m128i*)mask);
-       cache_prefetch ((__m128i*)dst);
-
        while (w >= 8)
        {
-           /* fill cache line with next memory */
-           cache_prefetch_next ((__m128i*)mask);
-           cache_prefetch_next ((__m128i*)dst);
-
            xmm_dst = load_128_aligned ((__m128i*) dst);
            unpack_565_128_4x128 (xmm_dst,
                                  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
@@ -4364,12 +3620,12 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
            if (m)
            {
                d = *dst;
-               mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
-               mmx_dest = expand565_16_1x64 (d);
+               mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
+               mmx_dest = expand565_16_1x128 (d);
 
                *dst = pack_565_32_16 (
-                   pack_1x64_32 (
-                       in_over_1x64 (
+                   pack_1x128_32 (
+                       in_over_1x128 (
                            &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
            }
 
@@ -4378,52 +3634,28 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
        }
     }
 
-    _mm_empty ();
 }
 
-/* -----------------------------------------------------------------------
- * composite_over_pixbuf_0565
- */
-
 static void
 sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
-                                 pixman_op_t              op,
-                                 pixman_image_t *         src_image,
-                                 pixman_image_t *         mask_image,
-                                 pixman_image_t *         dst_image,
-                                 int32_t                  src_x,
-                                 int32_t                  src_y,
-                                 int32_t                  mask_x,
-                                 int32_t                  mask_y,
-                                 int32_t                  dest_x,
-                                 int32_t                  dest_y,
-                                 int32_t                  width,
-                                 int32_t                  height)
+                                 pixman_composite_info_t *info)
 {
+    PIXMAN_COMPOSITE_ARGS (info);
     uint16_t    *dst_line, *dst, d;
     uint32_t    *src_line, *src, s;
     int dst_stride, src_stride;
-    uint16_t w;
+    int32_t w;
     uint32_t opaque, zero;
 
-    __m64 ms;
+    __m128i ms;
     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
 
     PIXMAN_IMAGE_GET_LINE (
-       dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+       dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
     PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
 
-#if 0
-    /* FIXME
-     *
-     * I copy the code from MMX one and keep the fixme.
-     * If it's a problem there, probably is a problem here.
-     */
-    assert (src_image->drawable == mask_image->drawable);
-#endif
-
     while (height--)
     {
        dst = dst_line;
@@ -4432,33 +3664,21 @@ sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
        src_line += src_stride;
        w = width;
 
-       /* call prefetch hint to optimize cache load*/
-       cache_prefetch ((__m128i*)src);
-       cache_prefetch ((__m128i*)dst);
-
        while (w && (unsigned long)dst & 15)
        {
            s = *src++;
            d = *dst;
 
-           ms = unpack_32_1x64 (s);
+           ms = unpack_32_1x128 (s);
 
            *dst++ = pack_565_32_16 (
-               pack_1x64_32 (
-                   over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
+               pack_1x128_32 (
+                   over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
            w--;
        }
 
-       /* call prefetch hint to optimize cache load*/
-       cache_prefetch ((__m128i*)src);
-       cache_prefetch ((__m128i*)dst);
-
        while (w >= 8)
        {
-           /* fill cache line with next memory */
-           cache_prefetch_next ((__m128i*)src);
-           cache_prefetch_next ((__m128i*)dst);
-
            /* First round */
            xmm_src = load_128_unaligned ((__m128i*)src);
            xmm_dst = load_128_aligned  ((__m128i*)dst);
@@ -4515,60 +3735,36 @@ sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
            s = *src++;
            d = *dst;
 
-           ms = unpack_32_1x64 (s);
+           ms = unpack_32_1x128 (s);
 
            *dst++ = pack_565_32_16 (
-               pack_1x64_32 (
-                   over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
+               pack_1x128_32 (
+                   over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
            w--;
        }
     }
 
-    _mm_empty ();
 }
 
-/* -------------------------------------------------------------------------
- * composite_over_pixbuf_8888
- */
-
 static void
 sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
-                                 pixman_op_t              op,
-                                 pixman_image_t *         src_image,
-                                 pixman_image_t *         mask_image,
-                                 pixman_image_t *         dst_image,
-                                 int32_t                  src_x,
-                                 int32_t                  src_y,
-                                 int32_t                  mask_x,
-                                 int32_t                  mask_y,
-                                 int32_t                  dest_x,
-                                 int32_t                  dest_y,
-                                 int32_t                  width,
-                                 int32_t                  height)
+                                 pixman_composite_info_t *info)
 {
+    PIXMAN_COMPOSITE_ARGS (info);
     uint32_t    *dst_line, *dst, d;
     uint32_t    *src_line, *src, s;
     int dst_stride, src_stride;
-    uint16_t w;
+    int32_t w;
     uint32_t opaque, zero;
 
     __m128i xmm_src_lo, xmm_src_hi;
     __m128i xmm_dst_lo, xmm_dst_hi;
 
     PIXMAN_IMAGE_GET_LINE (
-       dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+       dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
     PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
 
-#if 0
-    /* FIXME
-     *
-     * I copy the code from MMX one and keep the fixme.
-     * If it's a problem there, probably is a problem here.
-     */
-    assert (src_image->drawable == mask_image->drawable);
-#endif
-
     while (height--)
     {
        dst = dst_line;
@@ -4577,32 +3773,20 @@ sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
        src_line += src_stride;
        w = width;
 
-       /* call prefetch hint to optimize cache load*/
-       cache_prefetch ((__m128i*)src);
-       cache_prefetch ((__m128i*)dst);
-
        while (w && (unsigned long)dst & 15)
        {
            s = *src++;
            d = *dst;
 
-           *dst++ = pack_1x64_32 (
-               over_rev_non_pre_1x64 (
-                   unpack_32_1x64 (s), unpack_32_1x64 (d)));
+           *dst++ = pack_1x128_32 (
+               over_rev_non_pre_1x128 (
+                   unpack_32_1x128 (s), unpack_32_1x128 (d)));
 
            w--;
        }
 
-       /* call prefetch hint to optimize cache load*/
-       cache_prefetch ((__m128i*)src);
-       cache_prefetch ((__m128i*)dst);
-
        while (w >= 4)
        {
-           /* fill cache line with next memory */
-           cache_prefetch_next ((__m128i*)src);
-           cache_prefetch_next ((__m128i*)dst);
-
            xmm_src_hi = load_128_unaligned ((__m128i*)src);
 
            opaque = is_opaque (xmm_src_hi);
@@ -4641,36 +3825,21 @@ sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
            s = *src++;
            d = *dst;
 
-           *dst++ = pack_1x64_32 (
-               over_rev_non_pre_1x64 (
-                   unpack_32_1x64 (s), unpack_32_1x64 (d)));
+           *dst++ = pack_1x128_32 (
+               over_rev_non_pre_1x128 (
+                   unpack_32_1x128 (s), unpack_32_1x128 (d)));
 
            w--;
        }
     }
 
-    _mm_empty ();
 }
 
-/* -------------------------------------------------------------------------------------------------
- * composite_over_n_8888_0565_ca
- */
-
 static void
 sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
-                                    pixman_op_t              op,
-                                    pixman_image_t *         src_image,
-                                    pixman_image_t *         mask_image,
-                                    pixman_image_t *         dst_image,
-                                    int32_t                  src_x,
-                                    int32_t                  src_y,
-                                    int32_t                  mask_x,
-                                    int32_t                  mask_y,
-                                    int32_t                  dest_x,
-                                    int32_t                  dest_y,
-                                    int32_t                  width,
-                                    int32_t                  height)
+                                    pixman_composite_info_t *info)
 {
+    PIXMAN_COMPOSITE_ARGS (info);
     uint32_t src;
     uint16_t    *dst_line, *dst, d;
     uint32_t    *mask_line, *mask, m;
@@ -4682,22 +3851,22 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
 
-    __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
 
-    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
 
     if (src == 0)
        return;
 
     PIXMAN_IMAGE_GET_LINE (
-       dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+       dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
     PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
 
     xmm_src = expand_pixel_32_1x128 (src);
     xmm_alpha = expand_alpha_1x128 (xmm_src);
-    mmx_src = _mm_movepi64_pi64 (xmm_src);
-    mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
+    mmx_src = xmm_src;
+    mmx_alpha = xmm_alpha;
 
     while (height--)
     {
@@ -4707,10 +3876,6 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
        mask_line += mask_stride;
        dst_line += dst_stride;
 
-       /* call prefetch hint to optimize cache load*/
-       cache_prefetch ((__m128i*)mask);
-       cache_prefetch ((__m128i*)dst);
-
        while (w && ((unsigned long)dst & 15))
        {
            m = *(uint32_t *) mask;
@@ -4718,12 +3883,12 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
            if (m)
            {
                d = *dst;
-               mmx_mask = unpack_32_1x64 (m);
-               mmx_dest = expand565_16_1x64 (d);
+               mmx_mask = unpack_32_1x128 (m);
+               mmx_dest = expand565_16_1x128 (d);
 
                *dst = pack_565_32_16 (
-                   pack_1x64_32 (
-                       in_over_1x64 (
+                   pack_1x128_32 (
+                       in_over_1x128 (
                            &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
            }
 
@@ -4732,16 +3897,8 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
            mask++;
        }
 
-       /* call prefetch hint to optimize cache load*/
-       cache_prefetch ((__m128i*)mask);
-       cache_prefetch ((__m128i*)dst);
-
        while (w >= 8)
        {
-           /* fill cache line with next memory */
-           cache_prefetch_next ((__m128i*)mask);
-           cache_prefetch_next ((__m128i*)dst);
-
            /* First round */
            xmm_mask = load_128_unaligned ((__m128i*)mask);
            xmm_dst = load_128_aligned ((__m128i*)dst);
@@ -4795,12 +3952,12 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
            if (m)
            {
                d = *dst;
-               mmx_mask = unpack_32_1x64 (m);
-               mmx_dest = expand565_16_1x64 (d);
+               mmx_mask = unpack_32_1x128 (m);
+               mmx_dest = expand565_16_1x128 (d);
 
                *dst = pack_565_32_16 (
-                   pack_1x64_32 (
-                       in_over_1x64 (
+                   pack_1x128_32 (
+                       in_over_1x128 (
                            &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
            }
 
@@ -4810,47 +3967,30 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
        }
     }
 
-    _mm_empty ();
 }
 
-/* -----------------------------------------------------------------------
- * composite_in_n_8_8
- */
-
 static void
 sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
-                         pixman_op_t              op,
-                         pixman_image_t *         src_image,
-                         pixman_image_t *         mask_image,
-                         pixman_image_t *         dst_image,
-                         int32_t                  src_x,
-                         int32_t                  src_y,
-                         int32_t                  mask_x,
-                         int32_t                  mask_y,
-                         int32_t                  dest_x,
-                         int32_t                  dest_y,
-                         int32_t                  width,
-                         int32_t                  height)
+                         pixman_composite_info_t *info)
 {
+    PIXMAN_COMPOSITE_ARGS (info);
     uint8_t     *dst_line, *dst;
     uint8_t     *mask_line, *mask;
     int dst_stride, mask_stride;
-    uint16_t w, d, m;
+    uint32_t d, m;
     uint32_t src;
-    uint8_t sa;
+    int32_t w;
 
     __m128i xmm_alpha;
     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
 
     PIXMAN_IMAGE_GET_LINE (
-       dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+       dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
     PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
 
-    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
-
-    sa = src >> 24;
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
 
     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
 
@@ -4862,33 +4002,21 @@ sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
        mask_line += mask_stride;
        w = width;
 
-       /* call prefetch hint to optimize cache load*/
-       cache_prefetch ((__m128i*)mask);
-       cache_prefetch ((__m128i*)dst);
-
        while (w && ((unsigned long)dst & 15))
        {
            m = (uint32_t) *mask++;
            d = (uint32_t) *dst;
 
-           *dst++ = (uint8_t) pack_1x64_32 (
-               pix_multiply_1x64 (
-                   pix_multiply_1x64 (_mm_movepi64_pi64 (xmm_alpha),
-                                      unpack_32_1x64 (m)),
-                   unpack_32_1x64 (d)));
+           *dst++ = (uint8_t) pack_1x128_32 (
+               pix_multiply_1x128 (
+                   pix_multiply_1x128 (xmm_alpha,
+                                      unpack_32_1x128 (m)),
+                   unpack_32_1x128 (d)));
            w--;
        }
 
-       /* call prefetch hint to optimize cache load*/
-       cache_prefetch ((__m128i*)mask);
-       cache_prefetch ((__m128i*)dst);
-
        while (w >= 16)
        {
-           /* fill cache line with next memory */
-           cache_prefetch_next ((__m128i*)mask);
-           cache_prefetch_next ((__m128i*)dst);
-
            xmm_mask = load_128_unaligned ((__m128i*)mask);
            xmm_dst = load_128_aligned ((__m128i*)dst);
 
@@ -4916,48 +4044,115 @@ sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
            m = (uint32_t) *mask++;
            d = (uint32_t) *dst;
 
-           *dst++ = (uint8_t) pack_1x64_32 (
-               pix_multiply_1x64 (
-                   pix_multiply_1x64 (
-                       _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
-                   unpack_32_1x64 (d)));
+           *dst++ = (uint8_t) pack_1x128_32 (
+               pix_multiply_1x128 (
+                   pix_multiply_1x128 (
+                       xmm_alpha, unpack_32_1x128 (m)),
+                   unpack_32_1x128 (d)));
            w--;
        }
     }
 
-    _mm_empty ();
 }
 
-/* ---------------------------------------------------------------------------
- * composite_in_8_8
- */
+static void
+sse2_composite_in_n_8 (pixman_implementation_t *imp,
+                      pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t     *dst_line, *dst;
+    int dst_stride;
+    uint32_t d;
+    uint32_t src;
+    int32_t w;
+
+    __m128i xmm_alpha;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+
+    PIXMAN_IMAGE_GET_LINE (
+       dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
+
+    src = src >> 24;
+
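+    /* IN with an alpha of 0xff leaves the destination unchanged; 0x00 clears it */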
+    if (src == 0xff)
+       return;
+
+    if (src == 0x00)
+    {
+       pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
+                    8, dest_x, dest_y, width, height, src);
+
+       return;
+    }
+
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       w = width;
+
+       while (w && ((unsigned long)dst & 15))
+       {
+           d = (uint32_t) *dst;
+
+           *dst++ = (uint8_t) pack_1x128_32 (
+               pix_multiply_1x128 (
+                   xmm_alpha,
+                   unpack_32_1x128 (d)));
+           w--;
+       }
+
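+       /* multiply 16 destination bytes by the constant alpha per iteration */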
+       while (w >= 16)
+       {
+           xmm_dst = load_128_aligned ((__m128i*)dst);
+
+           unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+           pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
+                               &xmm_dst_lo, &xmm_dst_hi,
+                               &xmm_dst_lo, &xmm_dst_hi);
+
+           save_128_aligned (
+               (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+           dst += 16;
+           w -= 16;
+       }
+
+       while (w)
+       {
+           d = (uint32_t) *dst;
+
+           *dst++ = (uint8_t) pack_1x128_32 (
+               pix_multiply_1x128 (
+                   xmm_alpha,
+                   unpack_32_1x128 (d)));
+           w--;
+       }
+    }
+
+}
 
 static void
 sse2_composite_in_8_8 (pixman_implementation_t *imp,
-                       pixman_op_t              op,
-                       pixman_image_t *         src_image,
-                       pixman_image_t *         mask_image,
-                       pixman_image_t *         dst_image,
-                       int32_t                  src_x,
-                       int32_t                  src_y,
-                       int32_t                  mask_x,
-                       int32_t                  mask_y,
-                       int32_t                  dest_x,
-                       int32_t                  dest_y,
-                       int32_t                  width,
-                       int32_t                  height)
+                       pixman_composite_info_t *info)
 {
+    PIXMAN_COMPOSITE_ARGS (info);
     uint8_t     *dst_line, *dst;
     uint8_t     *src_line, *src;
     int src_stride, dst_stride;
-    uint16_t w;
+    int32_t w;
     uint32_t s, d;
 
     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
 
     PIXMAN_IMAGE_GET_LINE (
-       dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+       dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
     PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
 
@@ -4969,31 +4164,19 @@ sse2_composite_in_8_8 (pixman_implementation_t *imp,
        src_line += src_stride;
        w = width;
 
-       /* call prefetch hint to optimize cache load*/
-       cache_prefetch ((__m128i*)src);
-       cache_prefetch ((__m128i*)dst);
-
        while (w && ((unsigned long)dst & 15))
        {
            s = (uint32_t) *src++;
            d = (uint32_t) *dst;
 
-           *dst++ = (uint8_t) pack_1x64_32 (
-               pix_multiply_1x64 (
-                   unpack_32_1x64 (s), unpack_32_1x64 (d)));
+           *dst++ = (uint8_t) pack_1x128_32 (
+               pix_multiply_1x128 (
+                   unpack_32_1x128 (s), unpack_32_1x128 (d)));
            w--;
        }
 
-       /* call prefetch hint to optimize cache load*/
-       cache_prefetch ((__m128i*)src);
-       cache_prefetch ((__m128i*)dst);
-
        while (w >= 16)
        {
-           /* fill cache line with next memory */
-           cache_prefetch_next ((__m128i*)src);
-           cache_prefetch_next ((__m128i*)dst);
-
            xmm_src = load_128_unaligned ((__m128i*)src);
            xmm_dst = load_128_aligned ((__m128i*)dst);
 
@@ -5017,40 +4200,24 @@ sse2_composite_in_8_8 (pixman_implementation_t *imp,
            s = (uint32_t) *src++;
            d = (uint32_t) *dst;
 
-           *dst++ = (uint8_t) pack_1x64_32 (
-               pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
+           *dst++ = (uint8_t) pack_1x128_32 (
+               pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d)));
            w--;
        }
     }
 
-    _mm_empty ();
 }
 
-/* -------------------------------------------------------------------------
- * composite_add_n_8_8
- */
-
 static void
 sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
-                         pixman_op_t              op,
-                         pixman_image_t *         src_image,
-                         pixman_image_t *         mask_image,
-                         pixman_image_t *         dst_image,
-                         int32_t                  src_x,
-                         int32_t                  src_y,
-                         int32_t                  mask_x,
-                         int32_t                  mask_y,
-                         int32_t                  dest_x,
-                         int32_t                  dest_y,
-                         int32_t                  width,
-                         int32_t                  height)
+                         pixman_composite_info_t *info)
 {
+    PIXMAN_COMPOSITE_ARGS (info);
     uint8_t     *dst_line, *dst;
     uint8_t     *mask_line, *mask;
     int dst_stride, mask_stride;
-    uint16_t w;
+    int32_t w;
     uint32_t src;
-    uint8_t sa;
     uint32_t m, d;
 
     __m128i xmm_alpha;
@@ -5058,13 +4225,11 @@ sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
 
     PIXMAN_IMAGE_GET_LINE (
-       dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+       dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
     PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
 
-    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
-
-    sa = src >> 24;
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
 
     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
 
@@ -5076,33 +4241,21 @@ sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
        mask_line += mask_stride;
        w = width;
 
-       /* call prefetch hint to optimize cache load*/
-       cache_prefetch ((__m128i*)mask);
-       cache_prefetch ((__m128i*)dst);
-
        while (w && ((unsigned long)dst & 15))
        {
            m = (uint32_t) *mask++;
            d = (uint32_t) *dst;
 
-           *dst++ = (uint8_t) pack_1x64_32 (
-               _mm_adds_pu16 (
-                   pix_multiply_1x64 (
-                       _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
-                   unpack_32_1x64 (d)));
+           *dst++ = (uint8_t) pack_1x128_32 (
+               _mm_adds_epu16 (
+                   pix_multiply_1x128 (
+                       xmm_alpha, unpack_32_1x128 (m)),
+                   unpack_32_1x128 (d)));
            w--;
        }
 
-       /* call prefetch hint to optimize cache load*/
-       cache_prefetch ((__m128i*)mask);
-       cache_prefetch ((__m128i*)dst);
-
        while (w >= 16)
        {
-           /* fill cache line with next memory */
-           cache_prefetch_next ((__m128i*)mask);
-           cache_prefetch_next ((__m128i*)dst);
-
            xmm_mask = load_128_unaligned ((__m128i*)mask);
            xmm_dst = load_128_aligned ((__m128i*)dst);
 
@@ -5129,107 +4282,148 @@ sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
            m = (uint32_t) *mask++;
            d = (uint32_t) *dst;
 
-           *dst++ = (uint8_t) pack_1x64_32 (
-               _mm_adds_pu16 (
-                   pix_multiply_1x64 (
-                       _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
-                   unpack_32_1x64 (d)));
+           *dst++ = (uint8_t) pack_1x128_32 (
+               _mm_adds_epu16 (
+                   pix_multiply_1x128 (
+                       xmm_alpha, unpack_32_1x128 (m)),
+                   unpack_32_1x128 (d)));
 
            w--;
        }
     }
 
-    _mm_empty ();
 }
 
-/* ----------------------------------------------------------------------
- * composite_add_8000_8000
- */
-
 static void
-sse2_composite_add_8000_8000 (pixman_implementation_t *imp,
-                              pixman_op_t              op,
-                              pixman_image_t *         src_image,
-                              pixman_image_t *         mask_image,
-                              pixman_image_t *         dst_image,
-                              int32_t                  src_x,
-                              int32_t                  src_y,
-                              int32_t                  mask_x,
-                              int32_t                  mask_y,
-                              int32_t                  dest_x,
-                              int32_t                  dest_y,
-                              int32_t                  width,
-                              int32_t                  height)
+sse2_composite_add_n_8 (pixman_implementation_t *imp,
+                       pixman_composite_info_t *info)
 {
+    PIXMAN_COMPOSITE_ARGS (info);
     uint8_t     *dst_line, *dst;
-    uint8_t     *src_line, *src;
-    int dst_stride, src_stride;
-    uint16_t w;
-    uint16_t t;
+    int dst_stride;
+    int32_t w;
+    uint32_t src;
+
+    __m128i xmm_src;
 
     PIXMAN_IMAGE_GET_LINE (
-       src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
-    PIXMAN_IMAGE_GET_LINE (
-       dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+       dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
 
-    while (height--)
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    src >>= 24;
+
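+    /* adding zero is a no-op; adding 0xff saturates the destination */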
+    if (src == 0x00)
+       return;
+
+    if (src == 0xff)
     {
-       dst = dst_line;
-       src = src_line;
+       pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
+                    8, dest_x, dest_y, width, height, 0xff);
+
+       return;
+    }
 
-       /* call prefetch hint to optimize cache load*/
-       cache_prefetch ((__m128i*)src);
-       cache_prefetch ((__m128i*)dst);
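+    /* replicate the 8-bit source alpha across all 16 bytes of xmm_src */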
+    src = (src << 24) | (src << 16) | (src << 8) | src;
+    xmm_src = _mm_set_epi32 (src, src, src, src);
 
+    while (height--)
+    {
+       dst = dst_line;
        dst_line += dst_stride;
-       src_line += src_stride;
        w = width;
 
-       /* Small head */
-       while (w && (unsigned long)dst & 3)
+       while (w && ((unsigned long)dst & 15))
        {
-           t = (*dst) + (*src++);
-           *dst++ = t | (0 - (t >> 8));
+           *dst = (uint8_t)_mm_cvtsi128_si32 (
+               _mm_adds_epu8 (
+                   xmm_src,
+                   _mm_cvtsi32_si128 (*dst)));
+
            w--;
+           dst++;
        }
 
-       core_combine_add_u_sse2 ((uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
-
-       /* Small tail */
-       dst += w & 0xfffc;
-       src += w & 0xfffc;
+       while (w >= 16)
+       {
+           save_128_aligned (
+               (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
 
-       w &= 3;
+           dst += 16;
+           w -= 16;
+       }
 
        while (w)
        {
-           t = (*dst) + (*src++);
-           *dst++ = t | (0 - (t >> 8));
+           *dst = (uint8_t)_mm_cvtsi128_si32 (
+               _mm_adds_epu8 (
+                   xmm_src,
+                   _mm_cvtsi32_si128 (*dst)));
+
+           w--;
+           dst++;
+       }
+    }
+
+}
+
+static void
+sse2_composite_add_8_8 (pixman_implementation_t *imp,
+                       pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t     *dst_line, *dst;
+    uint8_t     *src_line, *src;
+    int dst_stride, src_stride;
+    int32_t w;
+    uint16_t t;
+
+    PIXMAN_IMAGE_GET_LINE (
+       src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+
+    while (height--)
+    {
+       dst = dst_line;
+       src = src_line;
+
+       dst_line += dst_stride;
+       src_line += src_stride;
+       w = width;
+
+       /* Small head */
+       while (w && (unsigned long)dst & 3)
+       {
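+           /* t | (0 - (t >> 8)) clamps the 9-bit sum to 255 without a branch */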
+           t = (*dst) + (*src++);
+           *dst++ = t | (0 - (t >> 8));
+           w--;
+       }
+
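+       /* the 4-byte aligned middle is handled as 32-bit words by the ADD combiner */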
+       sse2_combine_add_u (imp, op,
+                           (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
+
+       /* Small tail */
+       dst += w & 0xfffc;
+       src += w & 0xfffc;
+
+       w &= 3;
+
+       while (w)
+       {
+           t = (*dst) + (*src++);
+           *dst++ = t | (0 - (t >> 8));
            w--;
        }
     }
 
-    _mm_empty ();
 }
 
-/* ---------------------------------------------------------------------
- * composite_add_8888_8888
- */
 static void
 sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
-                              pixman_op_t              op,
-                              pixman_image_t *         src_image,
-                              pixman_image_t *         mask_image,
-                              pixman_image_t *         dst_image,
-                              int32_t                  src_x,
-                              int32_t                  src_y,
-                              int32_t                  mask_x,
-                              int32_t                  mask_y,
-                              int32_t                  dest_x,
-                              int32_t                  dest_y,
-                              int32_t                  width,
-                              int32_t                  height)
+                              pixman_composite_info_t *info)
 {
+    PIXMAN_COMPOSITE_ARGS (info);
     uint32_t    *dst_line, *dst;
     uint32_t    *src_line, *src;
     int dst_stride, src_stride;
@@ -5237,7 +4431,7 @@ sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
     PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
     PIXMAN_IMAGE_GET_LINE (
-       dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+       dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
 
     while (height--)
     {
@@ -5246,16 +4440,11 @@ sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
        src = src_line;
        src_line += src_stride;
 
-       core_combine_add_u_sse2 (dst, src, NULL, width);
+       sse2_combine_add_u (imp, op, dst, src, NULL, width);
     }
 
-    _mm_empty ();
 }
 
-/* -------------------------------------------------------------------------------------------------
- * sse2_composite_copy_area
- */
-
 static pixman_bool_t
 pixman_blt_sse2 (uint32_t *src_bits,
                  uint32_t *dst_bits,
@@ -5265,8 +4454,8 @@ pixman_blt_sse2 (uint32_t *src_bits,
                  int       dst_bpp,
                  int       src_x,
                  int       src_y,
-                 int       dst_x,
-                 int       dst_y,
+                 int       dest_x,
+                 int       dest_y,
                  int       width,
                  int       height)
 {
@@ -5282,7 +4471,7 @@ pixman_blt_sse2 (uint32_t *src_bits,
        src_stride = src_stride * (int) sizeof (uint32_t) / 2;
        dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
        src_bytes =(uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
-       dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
+       dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
        byte_width = 2 * width;
        src_stride *= 2;
        dst_stride *= 2;
@@ -5292,7 +4481,7 @@ pixman_blt_sse2 (uint32_t *src_bits,
        src_stride = src_stride * (int) sizeof (uint32_t) / 4;
        dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
        src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
-       dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
+       dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
        byte_width = 4 * width;
        src_stride *= 4;
        dst_stride *= 4;
@@ -5302,9 +4491,6 @@ pixman_blt_sse2 (uint32_t *src_bits,
        return FALSE;
     }
 
-    cache_prefetch ((__m128i*)src_bytes);
-    cache_prefetch ((__m128i*)dst_bytes);
-
     while (height--)
     {
        int w;
@@ -5314,9 +4500,6 @@ pixman_blt_sse2 (uint32_t *src_bits,
        dst_bytes += dst_stride;
        w = byte_width;
 
-       cache_prefetch_next ((__m128i*)s);
-       cache_prefetch_next ((__m128i*)d);
-
        while (w >= 2 && ((unsigned long)d & 3))
        {
            *(uint16_t *)d = *(uint16_t *)s;
@@ -5334,17 +4517,10 @@ pixman_blt_sse2 (uint32_t *src_bits,
            d += 4;
        }
 
-       cache_prefetch_next ((__m128i*)s);
-       cache_prefetch_next ((__m128i*)d);
-
        while (w >= 64)
        {
            __m128i xmm0, xmm1, xmm2, xmm3;
 
-           /* 128 bytes ahead */
-           cache_prefetch (((__m128i*)s) + 8);
-           cache_prefetch (((__m128i*)d) + 8);
-
            xmm0 = load_128_unaligned ((__m128i*)(s));
            xmm1 = load_128_unaligned ((__m128i*)(s + 16));
            xmm2 = load_128_unaligned ((__m128i*)(s + 32));
@@ -5360,9 +4536,6 @@ pixman_blt_sse2 (uint32_t *src_bits,
            w -= 64;
        }
 
-       cache_prefetch_next ((__m128i*)s);
-       cache_prefetch_next ((__m128i*)d);
-
        while (w >= 16)
        {
            save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
@@ -5372,9 +4545,6 @@ pixman_blt_sse2 (uint32_t *src_bits,
            s += 16;
        }
 
-       cache_prefetch_next ((__m128i*)s);
-       cache_prefetch_next ((__m128i*)d);
-
        while (w >= 4)
        {
            *(uint32_t *)d = *(uint32_t *)s;
@@ -5393,63 +4563,43 @@ pixman_blt_sse2 (uint32_t *src_bits,
        }
     }
 
-    _mm_empty ();
 
     return TRUE;
 }
 
 static void
 sse2_composite_copy_area (pixman_implementation_t *imp,
-                          pixman_op_t              op,
-                          pixman_image_t *         src_image,
-                          pixman_image_t *         mask_image,
-                          pixman_image_t *         dst_image,
-                          int32_t                  src_x,
-                          int32_t                  src_y,
-                          int32_t                  mask_x,
-                          int32_t                  mask_y,
-                          int32_t                  dest_x,
-                          int32_t                  dest_y,
-                          int32_t                  width,
-                          int32_t                  height)
+                          pixman_composite_info_t *info)
 {
+    PIXMAN_COMPOSITE_ARGS (info);
     pixman_blt_sse2 (src_image->bits.bits,
-                     dst_image->bits.bits,
+                     dest_image->bits.bits,
                      src_image->bits.rowstride,
-                     dst_image->bits.rowstride,
+                     dest_image->bits.rowstride,
                      PIXMAN_FORMAT_BPP (src_image->bits.format),
-                     PIXMAN_FORMAT_BPP (dst_image->bits.format),
+                     PIXMAN_FORMAT_BPP (dest_image->bits.format),
                      src_x, src_y, dest_x, dest_y, width, height);
 }
 
 static void
 sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
-                                 pixman_op_t              op,
-                                 pixman_image_t *         src_image,
-                                 pixman_image_t *         mask_image,
-                                 pixman_image_t *         dst_image,
-                                 int32_t                  src_x,
-                                 int32_t                  src_y,
-                                 int32_t                  mask_x,
-                                 int32_t                  mask_y,
-                                 int32_t                  dest_x,
-                                 int32_t                  dest_y,
-                                 int32_t                  width,
-                                 int32_t                  height)
+                                 pixman_composite_info_t *info)
 {
+    PIXMAN_COMPOSITE_ARGS (info);
     uint32_t    *src, *src_line, s;
     uint32_t    *dst, *dst_line, d;
     uint8_t         *mask, *mask_line;
     uint32_t m;
     int src_stride, mask_stride, dst_stride;
-    uint16_t w;
+    int32_t w;
+    __m128i ms;
 
     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
 
     PIXMAN_IMAGE_GET_LINE (
-       dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+       dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
     PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
     PIXMAN_IMAGE_GET_LINE (
@@ -5466,45 +4616,30 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
 
         w = width;
 
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)src);
-        cache_prefetch ((__m128i*)dst);
-        cache_prefetch ((__m128i*)mask);
-
         while (w && (unsigned long)dst & 15)
         {
             s = 0xff000000 | *src++;
             m = (uint32_t) *mask++;
             d = *dst;
-
-            __m64 ms = unpack_32_1x64 (s);
+            ms = unpack_32_1x128 (s);
 
             if (m != 0xff)
             {
-               __m64 ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
-               __m64 md = unpack_32_1x64 (d);
+               __m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
+               __m128i md = unpack_32_1x128 (d);
 
-                ms = in_over_1x64 (&ms, &mask_x00ff, &ma, &md);
+                ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md);
             }
 
-            *dst++ = pack_1x64_32 (ms);
+            *dst++ = pack_1x128_32 (ms);
             w--;
         }
 
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)src);
-        cache_prefetch ((__m128i*)dst);
-        cache_prefetch ((__m128i*)mask);
-
         while (w >= 4)
         {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)src);
-            cache_prefetch_next ((__m128i*)dst);
-            cache_prefetch_next ((__m128i*)mask);
-
             m = *(uint32_t*) mask;
-            xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);
+            xmm_src = _mm_or_si128 (
+               load_128_unaligned ((__m128i*)src), mask_ff000000);
 
             if (m == 0xffffffff)
             {
@@ -5520,9 +4655,12 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
 
-                expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+                expand_alpha_rev_2x128 (
+                   xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
 
-                in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
+                in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
+                              &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi,
+                              &xmm_dst_lo, &xmm_dst_hi);
 
                 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
             }
@@ -5547,15 +4685,15 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
                 }
                 else
                 {
-                   __m64 ma, md, ms;
+                   __m128i ma, md, ms;
 
                     d = *dst;
 
-                   ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
-                   md = unpack_32_1x64 (d);
-                   ms = unpack_32_1x64 (s);
+                   ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
+                   md = unpack_32_1x128 (d);
+                   ms = unpack_32_1x128 (s);
 
-                    *dst = pack_1x64_32 (in_over_1x64 (&ms, &mask_x00ff, &ma, &md));
+                    *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md));
                 }
 
             }
@@ -5566,261 +4704,1368 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
         }
     }
 
-    _mm_empty ();
 }
 
-static const pixman_fast_path_t sse2_fast_paths[] =
+static void
+sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
+                                 pixman_composite_info_t *info)
 {
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_r5g6b5,   sse2_composite_over_n_8_0565,       0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_b5g6r5,   sse2_composite_over_n_8_0565,       0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_null,     PIXMAN_a8r8g8b8, sse2_composite_over_n_8888,         0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_null,     PIXMAN_x8r8g8b8, sse2_composite_over_n_8888,         0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_null,     PIXMAN_r5g6b5,   sse2_composite_over_n_0565,         0 },
-    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_a8r8g8b8, sse2_composite_over_8888_8888,      0 },
-    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_x8r8g8b8, sse2_composite_over_8888_8888,      0 },
-    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_a8b8g8r8, sse2_composite_over_8888_8888,      0 },
-    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_x8b8g8r8, sse2_composite_over_8888_8888,      0 },
-    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_r5g6b5,   sse2_composite_over_8888_0565,      0 },
-    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_b5g6r5,   sse2_composite_over_8888_0565,      0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8r8g8b8, sse2_composite_over_n_8_8888,       0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8r8g8b8, sse2_composite_over_n_8_8888,       0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8b8g8r8, sse2_composite_over_n_8_8888,       0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8b8g8r8, sse2_composite_over_n_8_8888,       0 },
-    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8,       PIXMAN_x8r8g8b8, sse2_composite_over_x888_8_8888,    0 },
-    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8,       PIXMAN_a8r8g8b8, sse2_composite_over_x888_8_8888,    0 },
-    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8,       PIXMAN_x8b8g8r8, sse2_composite_over_x888_8_8888,    0 },
-    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8,       PIXMAN_a8r8g8b8, sse2_composite_over_x888_8_8888,    0 },
-    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8,       PIXMAN_a8r8g8b8, sse2_composite_over_x888_n_8888,    NEED_SOLID_MASK },
-    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8,       PIXMAN_x8r8g8b8, sse2_composite_over_x888_n_8888,    NEED_SOLID_MASK },
-    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8,       PIXMAN_a8b8g8r8, sse2_composite_over_x888_n_8888,    NEED_SOLID_MASK },
-    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8,       PIXMAN_x8b8g8r8, sse2_composite_over_x888_n_8888,    NEED_SOLID_MASK },
-    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,       PIXMAN_a8r8g8b8, sse2_composite_over_8888_n_8888,    NEED_SOLID_MASK },
-    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,       PIXMAN_x8r8g8b8, sse2_composite_over_8888_n_8888,    NEED_SOLID_MASK },
-    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8,       PIXMAN_a8b8g8r8, sse2_composite_over_8888_n_8888,    NEED_SOLID_MASK },
-    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8,       PIXMAN_x8b8g8r8, sse2_composite_over_8888_n_8888,    NEED_SOLID_MASK },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8r8g8b8, PIXMAN_r5g6b5,   sse2_composite_over_n_8888_0565_ca, NEED_COMPONENT_ALPHA },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8b8g8r8, PIXMAN_b5g6r5,   sse2_composite_over_n_8888_0565_ca, NEED_COMPONENT_ALPHA },
-    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
-    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_a8r8g8b8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
-    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
-    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_x8r8g8b8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
-    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_a8b8g8r8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
-    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
-    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_x8b8g8r8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
-    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
-    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_r5g6b5,   sse2_composite_over_pixbuf_0565,    NEED_PIXBUF },
-    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_r5g6b5,   sse2_composite_over_pixbuf_0565,    NEED_PIXBUF },
-    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_b5g6r5,   sse2_composite_over_pixbuf_0565,    NEED_PIXBUF },
-    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_b5g6r5,   sse2_composite_over_pixbuf_0565,    NEED_PIXBUF },
-    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_null,     PIXMAN_x8r8g8b8, sse2_composite_copy_area,           0 },
-    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_null,     PIXMAN_x8b8g8r8, sse2_composite_copy_area,           0 },
-
-    { PIXMAN_OP_ADD,  PIXMAN_solid,    PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_add_n_8888_8888_ca,  NEED_COMPONENT_ALPHA },
-    { PIXMAN_OP_ADD,  PIXMAN_a8,       PIXMAN_null,     PIXMAN_a8,       sse2_composite_add_8000_8000,       0 },
-    { PIXMAN_OP_ADD,  PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_a8r8g8b8, sse2_composite_add_8888_8888,       0 },
-    { PIXMAN_OP_ADD,  PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_a8b8g8r8, sse2_composite_add_8888_8888,       0 },
-    { PIXMAN_OP_ADD,  PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8,       sse2_composite_add_n_8_8,           0 },
-
-    { PIXMAN_OP_SRC, PIXMAN_solid,     PIXMAN_a8,       PIXMAN_a8r8g8b8, sse2_composite_src_n_8_8888,        0 },
-    { PIXMAN_OP_SRC, PIXMAN_solid,     PIXMAN_a8,       PIXMAN_x8r8g8b8, sse2_composite_src_n_8_8888,        0 },
-    { PIXMAN_OP_SRC, PIXMAN_solid,     PIXMAN_a8,       PIXMAN_a8b8g8r8, sse2_composite_src_n_8_8888,        0 },
-    { PIXMAN_OP_SRC, PIXMAN_solid,     PIXMAN_a8,       PIXMAN_x8b8g8r8, sse2_composite_src_n_8_8888,        0 },
-    { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8,  PIXMAN_null,     PIXMAN_a8r8g8b8, sse2_composite_copy_area,           0 },
-    { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8,  PIXMAN_null,     PIXMAN_a8b8g8r8, sse2_composite_copy_area,           0 },
-    { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8,  PIXMAN_null,     PIXMAN_x8r8g8b8, sse2_composite_copy_area,           0 },
-    { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8,  PIXMAN_null,     PIXMAN_x8b8g8r8, sse2_composite_copy_area,           0 },
-    { PIXMAN_OP_SRC, PIXMAN_x8r8g8b8,  PIXMAN_null,     PIXMAN_x8r8g8b8, sse2_composite_copy_area,           0 },
-    { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8,  PIXMAN_null,     PIXMAN_x8b8g8r8, sse2_composite_copy_area,           0 },
-    { PIXMAN_OP_SRC, PIXMAN_r5g6b5,    PIXMAN_null,     PIXMAN_r5g6b5,   sse2_composite_copy_area,           0 },
-    { PIXMAN_OP_SRC, PIXMAN_b5g6r5,    PIXMAN_null,     PIXMAN_b5g6r5,   sse2_composite_copy_area,           0 },
-
-    { PIXMAN_OP_IN,  PIXMAN_a8,        PIXMAN_null,     PIXMAN_a8,       sse2_composite_in_8_8,              0 },
-    { PIXMAN_OP_IN,  PIXMAN_solid,     PIXMAN_a8,       PIXMAN_a8,       sse2_composite_in_n_8_8,            0 },
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t    *src, *src_line, s;
+    uint32_t    *dst, *dst_line, d;
+    uint8_t         *mask, *mask_line;
+    uint32_t m;
+    int src_stride, mask_stride, dst_stride;
+    int32_t w;
 
-    { PIXMAN_OP_NONE },
-};
+    __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
 
-/*
- * Work around GCC bug causing crashes in Mozilla with SSE2
- *
- * When using -msse, gcc generates movdqa instructions assuming that
- * the stack is 16 byte aligned. Unfortunately some applications, such
- * as Mozilla and Mono, end up aligning the stack to 4 bytes, which
- * causes the movdqa instructions to fail.
- *
- * The __force_align_arg_pointer__ makes gcc generate a prologue that
- * realigns the stack pointer to 16 bytes.
- *
- * On x86-64 this is not necessary because the standard ABI already
- * calls for a 16 byte aligned stack.
- *
- * See https://bugs.freedesktop.org/show_bug.cgi?id=15693
- */
-#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
-__attribute__((__force_align_arg_pointer__))
-#endif
-static void
-sse2_composite (pixman_implementation_t *imp,
-                pixman_op_t              op,
-                pixman_image_t *         src,
-                pixman_image_t *         mask,
-                pixman_image_t *         dest,
-                int32_t                  src_x,
-                int32_t                  src_y,
-                int32_t                  mask_x,
-                int32_t                  mask_y,
-                int32_t                  dest_x,
-                int32_t                  dest_y,
-                int32_t                  width,
-                int32_t                  height)
-{
-    if (_pixman_run_fast_path (sse2_fast_paths, imp,
-                               op, src, mask, dest,
-                               src_x, src_y,
-                               mask_x, mask_y,
-                               dest_x, dest_y,
-                               width, height))
+    PIXMAN_IMAGE_GET_LINE (
+       dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--)
     {
-       return;
-    }
+        src = src_line;
+        src_line += src_stride;
+        dst = dst_line;
+        dst_line += dst_stride;
+        mask = mask_line;
+        mask_line += mask_stride;
 
-    _pixman_implementation_composite (imp->delegate, op,
-                                      src, mask, dest,
-                                      src_x, src_y,
-                                      mask_x, mask_y,
-                                      dest_x, dest_y,
-                                      width, height);
-}
+        w = width;
 
-#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
-__attribute__((__force_align_arg_pointer__))
-#endif
-static pixman_bool_t
-sse2_blt (pixman_implementation_t *imp,
-          uint32_t *               src_bits,
-          uint32_t *               dst_bits,
-          int                      src_stride,
-          int                      dst_stride,
-          int                      src_bpp,
-          int                      dst_bpp,
-          int                      src_x,
-          int                      src_y,
-          int                      dst_x,
-          int                      dst_y,
-          int                      width,
-          int                      height)
-{
-    if (!pixman_blt_sse2 (
-            src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
-            src_x, src_y, dst_x, dst_y, width, height))
+        while (w && (unsigned long)dst & 15)
+        {
+           uint32_t sa;
 
-    {
-       return _pixman_implementation_blt (
-           imp->delegate,
-           src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
-           src_x, src_y, dst_x, dst_y, width, height);
-    }
+            s = *src++;
+            m = (uint32_t) *mask++;
+            d = *dst;
 
-    return TRUE;
-}
+           sa = s >> 24;
 
-#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
-__attribute__((__force_align_arg_pointer__))
-#endif
-static pixman_bool_t
-sse2_fill (pixman_implementation_t *imp,
-           uint32_t *               bits,
-           int                      stride,
-           int                      bpp,
-           int                      x,
-           int                      y,
-           int                      width,
-           int                      height,
-           uint32_t xor)
-{
-    if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
-    {
-       return _pixman_implementation_fill (
-           imp->delegate, bits, stride, bpp, x, y, width, height, xor);
+           if (m)
+           {
+               if (sa == 0xff && m == 0xff)
+               {
+                   *dst = s;
+               }
+               else
+               {
+                   __m128i ms, md, ma, msa;
+
+                   ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+                   ms = unpack_32_1x128 (s);
+                   md = unpack_32_1x128 (d);
+
+                   msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
+
+                   *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
+               }
+           }
+
+           dst++;
+            w--;
+        }
+
+        while (w >= 4)
+        {
+            m = *(uint32_t *) mask;
+
+           if (m)
+           {
+               xmm_src = load_128_unaligned ((__m128i*)src);
+
+               if (m == 0xffffffff && is_opaque (xmm_src))
+               {
+                   save_128_aligned ((__m128i *)dst, xmm_src);
+               }
+               else
+               {
+                   xmm_dst = load_128_aligned ((__m128i *)dst);
+
+                   xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
+
+                   unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+                   unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+                   unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+                   expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
+                   expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+                   in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
+                                  &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+                   save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+               }
+           }
+
+            src += 4;
+            dst += 4;
+            mask += 4;
+            w -= 4;
+        }
+
+        while (w)
+        {
+           uint32_t sa;
+
+            s = *src++;
+            m = (uint32_t) *mask++;
+            d = *dst;
+
+           sa = s >> 24;
+
+           if (m)
+           {
+               if (sa == 0xff && m == 0xff)
+               {
+                   *dst = s;
+               }
+               else
+               {
+                   __m128i ms, md, ma, msa;
+
+                   ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+                   ms = unpack_32_1x128 (s);
+                   md = unpack_32_1x128 (d);
+
+                   msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
+
+                   *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
+               }
+           }
+
+           dst++;
+            w--;
+        }
     }
 
-    return TRUE;
 }
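+
+/* The scalar fallbacks above use in_over, i.e. (src IN mask) OVER dst.
+ * Per 8-bit channel this is roughly:
+ *
+ *     tmp = MUL_UN8 (s, m)                      // s * m / 255, rounded
+ *     dst = tmp + MUL_UN8 (d, ~MUL_UN8 (sa, m))
+ *
+ * where MUL_UN8 (a, b) = (a * b + 128) * 257 >> 16, the same rounding
+ * the SSE2 pix_multiply helpers implement via mask_0080/mask_0101.
+ * e.g. s = 0x80, sa = 0x80, m = 0xff, d = 0x40 gives
+ * 0x80 + MUL_UN8 (0x40, 0x7f) = 0x80 + 0x20 = 0xa0.
+ */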
 
-#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
-__attribute__((__force_align_arg_pointer__))
-#endif
-pixman_implementation_t *
-_pixman_implementation_create_sse2 (void)
+static void
+sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
+                                   pixman_composite_info_t *info)
 {
-    pixman_implementation_t *mmx = _pixman_implementation_create_mmx ();
-    pixman_implementation_t *imp = _pixman_implementation_create (mmx);
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src;
+    uint32_t    *dst_line, *dst;
+    __m128i xmm_src;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_dsta_hi, xmm_dsta_lo;
+    int dst_stride;
+    int32_t w;
 
-    /* SSE2 constants */
-    mask_565_r  = create_mask_2x32_128 (0x00f80000, 0x00f80000);
-    mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
-    mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
-    mask_565_b  = create_mask_2x32_128 (0x0000001f, 0x0000001f);
-    mask_red   = create_mask_2x32_128 (0x00f80000, 0x00f80000);
-    mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
-    mask_blue  = create_mask_2x32_128 (0x000000f8, 0x000000f8);
-    mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
-    mask_565_fix_g = create_mask_2x32_128  (0x0000c000, 0x0000c000);
-    mask_0080 = create_mask_16_128 (0x0080);
-    mask_00ff = create_mask_16_128 (0x00ff);
-    mask_0101 = create_mask_16_128 (0x0101);
-    mask_ffff = create_mask_16_128 (0xffff);
-    mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
-    mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
 
-    /* MMX constants */
-    mask_x565_rgb = create_mask_2x32_64 (0x000001f0, 0x003f001f);
-    mask_x565_unpack = create_mask_2x32_64 (0x00000084, 0x04100840);
+    if (src == 0)
+       return;
 
-    mask_x0080 = create_mask_16_64 (0x0080);
-    mask_x00ff = create_mask_16_64 (0x00ff);
-    mask_x0101 = create_mask_16_64 (0x0101);
-    mask_x_alpha = create_mask_2x32_64 (0x00ff0000, 0x00000000);
+    PIXMAN_IMAGE_GET_LINE (
+       dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
 
-    _mm_empty ();
+    xmm_src = expand_pixel_32_1x128 (src);
 
-    /* Set up function pointers */
+    while (height--)
+    {
+       dst = dst_line;
 
-    /* SSE code patch for fbcompose.c */
-    imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
-    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
-    imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
-    imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
-    imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
-    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
-    imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
-    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
-    imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
-    imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
+       dst_line += dst_stride;
+       w = width;
 
-    imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
+       while (w && (unsigned long)dst & 15)
+       {
+           __m128i vd;
 
-    imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
-    imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
-    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
-    imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
-    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
-    imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
-    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
-    imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
-    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
-    imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
-    imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
+           vd = unpack_32_1x128 (*dst);
 
-    imp->composite = sse2_composite;
-    imp->blt = sse2_blt;
-    imp->fill = sse2_fill;
+           *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
+                                             xmm_src));
+           w--;
+           dst++;
+       }
 
-    return imp;
-}
+       while (w >= 4)
+       {
+           __m128i tmp_lo, tmp_hi;
 
-#endif /* USE_SSE2 */
+           xmm_dst = load_128_aligned ((__m128i*)dst);
+
+           unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+           expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);
+
+           tmp_lo = xmm_src;
+           tmp_hi = xmm_src;
+
+           over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+                       &xmm_dsta_lo, &xmm_dsta_hi,
+                       &tmp_lo, &tmp_hi);
+
+           save_128_aligned (
+               (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));
+
+           w -= 4;
+           dst += 4;
+       }
+
+       while (w)
+       {
+           __m128i vd;
+
+           vd = unpack_32_1x128 (*dst);
+
+           *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
+                                             xmm_src));
+           w--;
+           dst++;
+       }
+    }
+}
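+
+/* For OVER_REVERSE the destination stays on top: per pixel the result
+ * is d + MUL_UN8 (s, ~da), so the solid source only shows through
+ * where destination alpha is below 0xff.  That is why over_1x128 /
+ * over_2x128 are invoked with the destination and *its* alpha as the
+ * top operand and xmm_src as the bottom one.
+ */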
+
+static void
+sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
+                                   pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t    *src, *src_line, s;
+    uint32_t    *dst, *dst_line, d;
+    uint32_t    *mask, *mask_line;
+    uint32_t    m;
+    int src_stride, mask_stride, dst_stride;
+    int32_t w;
+
+    __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+
+    PIXMAN_IMAGE_GET_LINE (
+       dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+        src = src_line;
+        src_line += src_stride;
+        dst = dst_line;
+        dst_line += dst_stride;
+        mask = mask_line;
+        mask_line += mask_stride;
+
+        w = width;
+
+        while (w && (unsigned long)dst & 15)
+        {
+           uint32_t sa;
+
+            s = *src++;
+            m = (*mask++) >> 24;
+            d = *dst;
+
+           sa = s >> 24;
+
+           if (m)
+           {
+               if (sa == 0xff && m == 0xff)
+               {
+                   *dst = s;
+               }
+               else
+               {
+                   __m128i ms, md, ma, msa;
+
+                   ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+                   ms = unpack_32_1x128 (s);
+                   md = unpack_32_1x128 (d);
+
+                   msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
+
+                   *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
+               }
+           }
+
+           dst++;
+            w--;
+        }
+
+        while (w >= 4)
+        {
+           xmm_mask = load_128_unaligned ((__m128i*)mask);
+
+           if (!is_transparent (xmm_mask))
+           {
+               xmm_src = load_128_unaligned ((__m128i*)src);
+
+               if (is_opaque (xmm_mask) && is_opaque (xmm_src))
+               {
+                   save_128_aligned ((__m128i *)dst, xmm_src);
+               }
+               else
+               {
+                   xmm_dst = load_128_aligned ((__m128i *)dst);
+
+                   unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+                   unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+                   unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+                   expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
+                   expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+                   in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
+                                  &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+                   save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+               }
+           }
+
+            src += 4;
+            dst += 4;
+            mask += 4;
+            w -= 4;
+        }
+
+        while (w)
+        {
+           uint32_t sa;
+
+            s = *src++;
+            m = (*mask++) >> 24;
+            d = *dst;
+
+           sa = s >> 24;
+
+           if (m)
+           {
+               if (sa == 0xff && m == 0xff)
+               {
+                   *dst = s;
+               }
+               else
+               {
+                   __m128i ms, md, ma, msa;
+
+                   ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+                   ms = unpack_32_1x128 (s);
+                   md = unpack_32_1x128 (d);
+
+                   msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
+
+                   *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
+               }
+           }
+
+           dst++;
+            w--;
+        }
+    }
+}
+
+/* A variant of 'sse2_combine_over_u' with minor tweaks */
+static force_inline void
+scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t*       pd,
+                                             const uint32_t* ps,
+                                             int32_t         w,
+                                             pixman_fixed_t  vx,
+                                             pixman_fixed_t  unit_x,
+                                             pixman_fixed_t  max_vx,
+                                             pixman_bool_t   fully_transparent_src)
+{
+    uint32_t s, d;
+    const uint32_t* pm = NULL;
+
+    __m128i xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_alpha_lo, xmm_alpha_hi;
+
+    if (fully_transparent_src)
+       return;
+
+    /* Align dst on a 16-byte boundary */
+    while (w && ((unsigned long)pd & 15))
+    {
+       d = *pd;
+       s = combine1 (ps + (vx >> 16), pm);
+       vx += unit_x;
+
+       *pd++ = core_combine_over_u_pixel_sse2 (s, d);
+       if (pm)
+           pm++;
+       w--;
+    }
+
+    while (w >= 4)
+    {
+       __m128i tmp;
+       uint32_t tmp1, tmp2, tmp3, tmp4;
+
+       tmp1 = ps[vx >> 16];
+       vx += unit_x;
+       tmp2 = ps[vx >> 16];
+       vx += unit_x;
+       tmp3 = ps[vx >> 16];
+       vx += unit_x;
+       tmp4 = ps[vx >> 16];
+       vx += unit_x;
+
+       tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
+
+       xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);
+
+       if (is_opaque (xmm_src_hi))
+       {
+           save_128_aligned ((__m128i*)pd, xmm_src_hi);
+       }
+       else if (!is_zero (xmm_src_hi))
+       {
+           xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+
+           unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+           unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+           expand_alpha_2x128 (
+               xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
+
+           over_2x128 (&xmm_src_lo, &xmm_src_hi,
+                       &xmm_alpha_lo, &xmm_alpha_hi,
+                       &xmm_dst_lo, &xmm_dst_hi);
+
+           /* rebuild the 4 pixel data and save */
+           save_128_aligned ((__m128i*)pd,
+                             pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+       }
+
+       w -= 4;
+       pd += 4;
+       if (pm)
+           pm += 4;
+    }
+
+    while (w)
+    {
+       d = *pd;
+       s = combine1 (ps + (vx >> 16), pm);
+       vx += unit_x;
+
+       *pd++ = core_combine_over_u_pixel_sse2 (s, d);
+       if (pm)
+           pm++;
+
+       w--;
+    }
+}
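+
+/* vx and unit_x are pixman 16.16 fixed-point coordinates: the source
+ * index is vx >> 16 and each output pixel advances vx by unit_x.
+ * e.g. a 2x nearest upscale uses unit_x = 0x8000, so vx >> 16 yields
+ * 0, 0, 1, 1, ... and every source pixel is emitted twice.
+ */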
+
+FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
+                      scaled_nearest_scanline_sse2_8888_8888_OVER,
+                      uint32_t, uint32_t, COVER)
+FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
+                      scaled_nearest_scanline_sse2_8888_8888_OVER,
+                      uint32_t, uint32_t, NONE)
+FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
+                      scaled_nearest_scanline_sse2_8888_8888_OVER,
+                      uint32_t, uint32_t, PAD)
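+
+/* COVER / NONE / PAD name the repeat handling the generated main loop
+ * wraps around this scanline: COVER guarantees every sample falls
+ * inside the source, NONE treats out-of-bounds samples as transparent,
+ * and PAD clamps them to the nearest edge pixel.
+ */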
+
+static force_inline void
+scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
+                                              uint32_t *       dst,
+                                              const uint32_t * src,
+                                              int32_t          w,
+                                              pixman_fixed_t   vx,
+                                              pixman_fixed_t   unit_x,
+                                              pixman_fixed_t   max_vx,
+                                              pixman_bool_t    zero_src)
+{
+    __m128i xmm_mask;
+    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_alpha_lo, xmm_alpha_hi;
+
+    if (zero_src || (*mask >> 24) == 0)
+       return;
+
+    xmm_mask = create_mask_16_128 (*mask >> 24);
+
+    while (w && (unsigned long)dst & 15)
+    {
+       uint32_t s = src[pixman_fixed_to_int (vx)];
+       vx += unit_x;
+
+       if (s)
+       {
+           uint32_t d = *dst;
+
+           __m128i ms = unpack_32_1x128 (s);
+           __m128i alpha     = expand_alpha_1x128 (ms);
+           __m128i dest      = xmm_mask;
+           __m128i alpha_dst = unpack_32_1x128 (d);
+
+           *dst = pack_1x128_32 (
+               in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
+       }
+       dst++;
+       w--;
+    }
+
+    while (w >= 4)
+    {
+       uint32_t tmp1, tmp2, tmp3, tmp4;
+
+       tmp1 = src[pixman_fixed_to_int (vx)];
+       vx += unit_x;
+       tmp2 = src[pixman_fixed_to_int (vx)];
+       vx += unit_x;
+       tmp3 = src[pixman_fixed_to_int (vx)];
+       vx += unit_x;
+       tmp4 = src[pixman_fixed_to_int (vx)];
+       vx += unit_x;
+
+       xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
+
+       if (!is_zero (xmm_src))
+       {
+           xmm_dst = load_128_aligned ((__m128i*)dst);
+
+           unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+           unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+           expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+                               &xmm_alpha_lo, &xmm_alpha_hi);
+
+           in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
+                          &xmm_alpha_lo, &xmm_alpha_hi,
+                          &xmm_mask, &xmm_mask,
+                          &xmm_dst_lo, &xmm_dst_hi);
+
+           save_128_aligned (
+               (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+       }
+
+       dst += 4;
+       w -= 4;
+    }
+
+    while (w)
+    {
+       uint32_t s = src[pixman_fixed_to_int (vx)];
+       vx += unit_x;
+
+       if (s)
+       {
+           uint32_t d = *dst;
+
+           __m128i ms = unpack_32_1x128 (s);
+           __m128i alpha = expand_alpha_1x128 (ms);
+           __m128i mask  = xmm_mask;
+           __m128i dest  = unpack_32_1x128 (d);
+
+           *dst = pack_1x128_32 (
+               in_over_1x128 (&ms, &alpha, &mask, &dest));
+       }
+
+       dst++;
+       w--;
+    }
+}
+
+FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
+                             scaled_nearest_scanline_sse2_8888_n_8888_OVER,
+                             uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
+FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
+                             scaled_nearest_scanline_sse2_8888_n_8888_OVER,
+                             uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
+FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
+                             scaled_nearest_scanline_sse2_8888_n_8888_OVER,
+                             uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
+
+#define BILINEAR_DECLARE_VARIABLES                                             \
+    const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);     \
+    const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);     \
+    const __m128i xmm_xorc = _mm_set_epi16 (0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff);\
+    const __m128i xmm_addc = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1);           \
+    const __m128i xmm_ux = _mm_set_epi16 (unit_x, unit_x, unit_x, unit_x,      \
+                                         unit_x, unit_x, unit_x, unit_x);      \
+    const __m128i xmm_zero = _mm_setzero_si128 ();                             \
+    __m128i xmm_x = _mm_set_epi16 (vx, vx, vx, vx, vx, vx, vx, vx)
+
+#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix)                                    \
+do {                                                                           \
+    __m128i xmm_wh, xmm_lo, xmm_hi, a;                                         \
+    /* fetch the 2x2 pixel block into an SSE2 register */                      \
+    uint32_t tl = src_top [pixman_fixed_to_int (vx)];                          \
+    uint32_t tr = src_top [pixman_fixed_to_int (vx) + 1];                      \
+    uint32_t bl = src_bottom [pixman_fixed_to_int (vx)];                       \
+    uint32_t br = src_bottom [pixman_fixed_to_int (vx) + 1];                   \
+    a = _mm_set_epi32 (tr, tl, br, bl);                                                \
+    vx += unit_x;                                                              \
+    /* vertical interpolation */                                               \
+    a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpackhi_epi8 (a, xmm_zero),       \
+                                       xmm_wt),                                \
+                      _mm_mullo_epi16 (_mm_unpacklo_epi8 (a, xmm_zero),        \
+                                       xmm_wb));                               \
+    /* calculate horizontal weights */                                         \
+    xmm_wh = _mm_add_epi16 (xmm_addc,                                          \
+                           _mm_xor_si128 (xmm_xorc,                            \
+                                          _mm_srli_epi16 (xmm_x, 8)));         \
+    xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);                                     \
+    /* horizontal interpolation */                                             \
+    xmm_lo = _mm_mullo_epi16 (a, xmm_wh);                                      \
+    xmm_hi = _mm_mulhi_epu16 (a, xmm_wh);                                      \
+    a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi),                    \
+                      _mm_unpackhi_epi16 (xmm_lo, xmm_hi));                    \
+    /* shift and pack the result */                                            \
+    a = _mm_srli_epi32 (a, 16);                                                        \
+    a = _mm_packs_epi32 (a, a);                                                        \
+    a = _mm_packus_epi16 (a, a);                                               \
+    pix = _mm_cvtsi128_si32 (a);                                               \
+} while (0)
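+
+/* The weights are 8-bit with wt + wb == 256 and, per pixel,
+ * wl + wr == 256 where wr = (vx >> 8) & 0xff (the xor/add pair above
+ * computes wl = 256 - wr and wr for the left/right lanes in one go).
+ * The two multiply passes therefore scale each channel by 256 * 256,
+ * which the final >> 16 undoes before packing back to 8 bits.
+ */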
+
+#define BILINEAR_SKIP_ONE_PIXEL()                                              \
+do {                                                                           \
+    vx += unit_x;                                                              \
+    xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);                                     \
+} while(0)
+
+static force_inline void
+scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t *       dst,
+                                            const uint32_t * mask,
+                                            const uint32_t * src_top,
+                                            const uint32_t * src_bottom,
+                                            int32_t          w,
+                                            int              wt,
+                                            int              wb,
+                                            pixman_fixed_t   vx,
+                                            pixman_fixed_t   unit_x,
+                                            pixman_fixed_t   max_vx,
+                                            pixman_bool_t    zero_src)
+{
+    BILINEAR_DECLARE_VARIABLES;
+    uint32_t pix1, pix2, pix3, pix4;
+
+    while ((w -= 4) >= 0)
+    {
+       BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+       BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
+       BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
+       BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
+       *dst++ = pix1;
+       *dst++ = pix2;
+       *dst++ = pix3;
+       *dst++ = pix4;
+    }
+
+    if (w & 2)
+    {
+       BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+       BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
+       *dst++ = pix1;
+       *dst++ = pix2;
+    }
+
+    if (w & 1)
+    {
+       BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+       *dst = pix1;
+    }
+}
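+
+/* Counting trick: (w -= 4) leaves w negative on exit, but its two's
+ * complement low bits still hold the residual count, so (w & 2) and
+ * (w & 1) handle the 0..3 leftover pixels.  e.g. w == 7 exits at
+ * w == -1 (...111b): 4 + 2 + 1 pixels.
+ */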
+
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC,
+                              scaled_bilinear_scanline_sse2_8888_8888_SRC,
+                              uint32_t, uint32_t, uint32_t,
+                              COVER, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC,
+                              scaled_bilinear_scanline_sse2_8888_8888_SRC,
+                              uint32_t, uint32_t, uint32_t,
+                              PAD, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC,
+                              scaled_bilinear_scanline_sse2_8888_8888_SRC,
+                              uint32_t, uint32_t, uint32_t,
+                              NONE, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_SRC,
+                              scaled_bilinear_scanline_sse2_8888_8888_SRC,
+                              uint32_t, uint32_t, uint32_t,
+                              NORMAL, FLAG_NONE)
+
+static force_inline void
+scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t *       dst,
+                                             const uint32_t * mask,
+                                             const uint32_t * src_top,
+                                             const uint32_t * src_bottom,
+                                             int32_t          w,
+                                             int              wt,
+                                             int              wb,
+                                             pixman_fixed_t   vx,
+                                             pixman_fixed_t   unit_x,
+                                             pixman_fixed_t   max_vx,
+                                             pixman_bool_t    zero_src)
+{
+    BILINEAR_DECLARE_VARIABLES;
+    uint32_t pix1, pix2, pix3, pix4;
+
+    while (w && ((unsigned long)dst & 15))
+    {
+       BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+
+       if (pix1)
+       {
+           pix2 = *dst;
+           *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
+       }
+
+       w--;
+       dst++;
+    }
+
+    while (w >= 4)
+    {
+       __m128i xmm_src;
+       __m128i xmm_src_hi, xmm_src_lo, xmm_dst_hi, xmm_dst_lo;
+       __m128i xmm_alpha_hi, xmm_alpha_lo;
+
+       BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+       BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
+       BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
+       BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
+
+       xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
+
+       if (!is_zero (xmm_src))
+       {
+           if (is_opaque (xmm_src))
+           {
+               save_128_aligned ((__m128i *)dst, xmm_src);
+           }
+           else
+           {
+               __m128i xmm_dst = load_128_aligned ((__m128i *)dst);
+
+               unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+               unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+               expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
+               over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi,
+                           &xmm_dst_lo, &xmm_dst_hi);
+
+               save_128_aligned ((__m128i *)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+           }
+       }
+
+       w -= 4;
+       dst += 4;
+    }
+
+    while (w)
+    {
+       BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+
+       if (pix1)
+       {
+           pix2 = *dst;
+           *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
+       }
+
+       w--;
+       dst++;
+    }
+}
+
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_OVER,
+                              scaled_bilinear_scanline_sse2_8888_8888_OVER,
+                              uint32_t, uint32_t, uint32_t,
+                              COVER, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_OVER,
+                              scaled_bilinear_scanline_sse2_8888_8888_OVER,
+                              uint32_t, uint32_t, uint32_t,
+                              PAD, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_OVER,
+                              scaled_bilinear_scanline_sse2_8888_8888_OVER,
+                              uint32_t, uint32_t, uint32_t,
+                              NONE, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_OVER,
+                              scaled_bilinear_scanline_sse2_8888_8888_OVER,
+                              uint32_t, uint32_t, uint32_t,
+                              NORMAL, FLAG_NONE)
+
+static force_inline void
+scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t *       dst,
+                                               const uint8_t  * mask,
+                                               const uint32_t * src_top,
+                                               const uint32_t * src_bottom,
+                                               int32_t          w,
+                                               int              wt,
+                                               int              wb,
+                                               pixman_fixed_t   vx,
+                                               pixman_fixed_t   unit_x,
+                                               pixman_fixed_t   max_vx,
+                                               pixman_bool_t    zero_src)
+{
+    BILINEAR_DECLARE_VARIABLES;
+    uint32_t pix1, pix2, pix3, pix4;
+    uint32_t m;
+
+    while (w && ((unsigned long)dst & 15))
+    {
+       uint32_t sa;
+
+       m = (uint32_t) *mask++;
+
+       if (m)
+       {
+           BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+           sa = pix1 >> 24;
+
+           if (sa == 0xff && m == 0xff)
+           {
+               *dst = pix1;
+           }
+           else
+           {
+               __m128i ms, md, ma, msa;
+
+               pix2 = *dst;
+               ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+               ms = unpack_32_1x128 (pix1);
+               md = unpack_32_1x128 (pix2);
+
+               msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
+
+               *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
+           }
+       }
+       else
+       {
+           BILINEAR_SKIP_ONE_PIXEL ();
+       }
+
+       w--;
+       dst++;
+    }
+
+    while (w >= 4)
+    {
+       __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
+       __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+       __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+
+       m = *(uint32_t*)mask;
+
+       if (m)
+       {
+           BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+           BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
+           BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
+           BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
+
+           xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
+
+           if (m == 0xffffffff && is_opaque (xmm_src))
+           {
+               save_128_aligned ((__m128i *)dst, xmm_src);
+           }
+           else
+           {
+               xmm_dst = load_128_aligned ((__m128i *)dst);
+
+               xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
+
+               unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+               unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+               unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+               expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
+               expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+               in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
+                              &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+               save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+           }
+       }
+       else
+       {
+           BILINEAR_SKIP_ONE_PIXEL ();
+           BILINEAR_SKIP_ONE_PIXEL ();
+           BILINEAR_SKIP_ONE_PIXEL ();
+           BILINEAR_SKIP_ONE_PIXEL ();
+       }
+
+       w -= 4;
+       dst += 4;
+       mask += 4;
+    }
+
+    while (w)
+    {
+       uint32_t sa;
+
+       m = (uint32_t) *mask++;
+
+       if (m)
+       {
+           BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+           sa = pix1 >> 24;
+
+           if (sa == 0xff && m == 0xff)
+           {
+               *dst = pix1;
+           }
+           else
+           {
+               __m128i ms, md, ma, msa;
+
+               pix2 = *dst;
+               ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+               ms = unpack_32_1x128 (pix1);
+               md = unpack_32_1x128 (pix2);
+
+               msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
+
+               *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
+           }
+       }
+       else
+       {
+           BILINEAR_SKIP_ONE_PIXEL ();
+       }
+
+       w--;
+       dst++;
+    }
+}
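+
+/* Fetching the a8 mask four bytes at a time lets one 32-bit test
+ * classify the whole block: m == 0 skips all four pixels, and
+ * m == 0xffffffff with an opaque interpolated source reduces to a
+ * plain aligned store.
+ */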
+
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_cover_OVER,
+                              scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
+                              uint32_t, uint8_t, uint32_t,
+                              COVER, FLAG_HAVE_NON_SOLID_MASK)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_pad_OVER,
+                              scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
+                              uint32_t, uint8_t, uint32_t,
+                              PAD, FLAG_HAVE_NON_SOLID_MASK)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_none_OVER,
+                              scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
+                              uint32_t, uint8_t, uint32_t,
+                              NONE, FLAG_HAVE_NON_SOLID_MASK)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_normal_OVER,
+                              scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
+                              uint32_t, uint8_t, uint32_t,
+                              NORMAL, FLAG_HAVE_NON_SOLID_MASK)
+
+static const pixman_fast_path_t sse2_fast_paths[] =
+{
+    /* PIXMAN_OP_OVER */
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
+    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
+    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
+    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
+    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
+    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
+    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
+
+    /* PIXMAN_OP_OVER_REVERSE */
+    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),
+
+    /* PIXMAN_OP_ADD */
+    PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
+    PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
+    PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
+    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
+    PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
+
+    /* PIXMAN_OP_SRC */
+    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
+    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
+    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
+    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
+    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
+    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
+    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
+    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
+    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
+    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
+    PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
+    PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),
+
+    /* PIXMAN_OP_IN */
+    PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
+    PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
+    PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),
+
+    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
+
+    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
+    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
+    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
+    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
+
+    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888),
+
+    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
+
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8_8888),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8_8888),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8_8888),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8_8888),
+
+    { PIXMAN_OP_NONE },
+};
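+
+/* Each PIXMAN_STD_FAST_PATH entry expands to an (op, src, mask, dest,
+ * func) record plus the flag bits the formats imply; "solid" matches
+ * any single-pixel repeating source and "null" means no mask.  The
+ * table is scanned in order at composite time, so more specific
+ * entries must come before more general ones.
+ */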
+
+static pixman_bool_t
+sse2_blt (pixman_implementation_t *imp,
+          uint32_t *               src_bits,
+          uint32_t *               dst_bits,
+          int                      src_stride,
+          int                      dst_stride,
+          int                      src_bpp,
+          int                      dst_bpp,
+          int                      src_x,
+          int                      src_y,
+          int                      dest_x,
+          int                      dest_y,
+          int                      width,
+          int                      height)
+{
+    if (!pixman_blt_sse2 (
+            src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
+            src_x, src_y, dest_x, dest_y, width, height))
+
+    {
+       return _pixman_implementation_blt (
+           imp->delegate,
+           src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
+           src_x, src_y, dest_x, dest_y, width, height);
+    }
+
+    return TRUE;
+}
+
+#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
+__attribute__((__force_align_arg_pointer__))
+#endif
+static pixman_bool_t
+sse2_fill (pixman_implementation_t *imp,
+           uint32_t *               bits,
+           int                      stride,
+           int                      bpp,
+           int                      x,
+           int                      y,
+           int                      width,
+           int                      height,
+           uint32_t xor)
+{
+    if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
+    {
+       return _pixman_implementation_fill (
+           imp->delegate, bits, stride, bpp, x, y, width, height, xor);
+    }
+
+    return TRUE;
+}
+
+static uint32_t *
+sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
+{
+    int w = iter->width;
+    __m128i ff000000 = mask_ff000000;
+    uint32_t *dst = iter->buffer;
+    uint32_t *src = (uint32_t *)iter->bits;
+
+    iter->bits += iter->stride;
+
+    while (w && ((unsigned long)dst) & 0x0f)
+    {
+       *dst++ = (*src++) | 0xff000000;
+       w--;
+    }
+
+    while (w >= 4)
+    {
+       save_128_aligned (
+           (__m128i *)dst, _mm_or_si128 (
+               load_128_unaligned ((__m128i *)src), ff000000));
+
+       dst += 4;
+       src += 4;
+       w -= 4;
+    }
+
+    while (w)
+    {
+       *dst++ = (*src++) | 0xff000000;
+       w--;
+    }
+
+    return iter->buffer;
+}
+
+static uint32_t *
+sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
+{
+    int w = iter->width;
+    uint32_t *dst = iter->buffer;
+    uint16_t *src = (uint16_t *)iter->bits;
+    __m128i ff000000 = mask_ff000000;
+
+    iter->bits += iter->stride;
+
+    while (w && ((unsigned long)dst) & 0x0f)
+    {
+       uint16_t s = *src++;
+
+       *dst++ = CONVERT_0565_TO_8888 (s);
+       w--;
+    }
+
+    while (w >= 8)
+    {
+       __m128i lo, hi, s;
+
+       s = _mm_loadu_si128 ((__m128i *)src);
+
+       lo = unpack_565_to_8888 (_mm_unpacklo_epi16 (s, _mm_setzero_si128 ()));
+       hi = unpack_565_to_8888 (_mm_unpackhi_epi16 (s, _mm_setzero_si128 ()));
+
+       save_128_aligned ((__m128i *)(dst + 0), _mm_or_si128 (lo, ff000000));
+       save_128_aligned ((__m128i *)(dst + 4), _mm_or_si128 (hi, ff000000));
+
+       dst += 8;
+       src += 8;
+       w -= 8;
+    }
+
+    while (w)
+    {
+       uint16_t s = *src++;
+
+       *dst++ = CONVERT_0565_TO_8888 (s);
+       w--;
+    }
+
+    return iter->buffer;
+}
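+
+/* The 565 -> 8888 widening replicates each field's top bits into its
+ * new low bits (red/blue: x << 3 | x >> 2, green: x << 2 | x >> 4),
+ * so 0x1f expands to 0xff rather than 0xf8; a full alpha byte is
+ * OR'ed in because r5g6b5 carries no alpha.
+ */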
+
+static uint32_t *
+sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
+{
+    int w = iter->width;
+    uint32_t *dst = iter->buffer;
+    uint8_t *src = iter->bits;
+    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
+
+    iter->bits += iter->stride;
+
+    while (w && (((unsigned long)dst) & 15))
+    {
+        *dst++ = *(src++) << 24;
+        w--;
+    }
+
+    while (w >= 16)
+    {
+       xmm0 = _mm_loadu_si128((__m128i *)src);
+
+       xmm1 = _mm_unpacklo_epi8  (_mm_setzero_si128(), xmm0);
+       xmm2 = _mm_unpackhi_epi8  (_mm_setzero_si128(), xmm0);
+       xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm1);
+       xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm1);
+       xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm2);
+       xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm2);
+
+       _mm_store_si128(((__m128i *)(dst +  0)), xmm3);
+       _mm_store_si128(((__m128i *)(dst +  4)), xmm4);
+       _mm_store_si128(((__m128i *)(dst +  8)), xmm5);
+       _mm_store_si128(((__m128i *)(dst + 12)), xmm6);
+
+       dst += 16;
+       src += 16;
+       w -= 16;
+    }
+
+    while (w)
+    {
+       *dst++ = *(src++) << 24;
+       w--;
+    }
+
+    return iter->buffer;
+}
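+
+/* The unpack ladder above is a wide (a << 24): interleaving zeros
+ * below each alpha byte twice leaves it in the top byte of its 32-bit
+ * lane, converting 16 a8 pixels to a8r8g8b8 (with r = g = b = 0) in
+ * four aligned stores.
+ */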
+
+typedef struct
+{
+    pixman_format_code_t       format;
+    pixman_iter_get_scanline_t get_scanline;
+} fetcher_info_t;
+
+static const fetcher_info_t fetchers[] =
+{
+    { PIXMAN_x8r8g8b8,         sse2_fetch_x8r8g8b8 },
+    { PIXMAN_r5g6b5,           sse2_fetch_r5g6b5 },
+    { PIXMAN_a8,               sse2_fetch_a8 },
+    { PIXMAN_null }
+};
+
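+/* Install one of the fetchers above when the source is an
+ * untransformed bits image in a supported format, the requested
+ * rectangle lies fully inside it, and a narrow (8 bits per channel)
+ * iterator was requested; otherwise defer to the delegate.
+ */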
+static void
+sse2_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
+{
+    pixman_image_t *image = iter->image;
+    int x = iter->x;
+    int y = iter->y;
+    int width = iter->width;
+    int height = iter->height;
+
+#define FLAGS                                                          \
+    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | FAST_PATH_BITS_IMAGE)
+
+    if ((iter->flags & ITER_NARROW)                            &&
+       (image->common.flags & FLAGS) == FLAGS                  &&
+       x >= 0 && y >= 0                                        &&
+       x + width <= image->bits.width                          &&
+       y + height <= image->bits.height)
+    {
+       const fetcher_info_t *f;
+
+       for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
+       {
+           if (image->common.extended_format_code == f->format)
+           {
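+               /* bits.rowstride is in 32-bit units; convert it to a
+                * byte stride before computing the start address
+                */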
+               uint8_t *b = (uint8_t *)image->bits.bits;
+               int s = image->bits.rowstride * 4;
+
+               iter->bits = b + s * y + x * PIXMAN_FORMAT_BPP (f->format) / 8;
+               iter->stride = s;
+
+               iter->get_scanline = f->get_scanline;
+               return;
+           }
+       }
+    }
+
+    imp->delegate->src_iter_init (imp->delegate, iter);
+}
+
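+/* On 32-bit x86 the ABI only guarantees 4-byte stack alignment, so
+ * make GCC realign the stack on entry before any 16-byte SSE2 spills
+ * can happen.
+ */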
+#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
+__attribute__((__force_align_arg_pointer__))
+#endif
+pixman_implementation_t *
+_pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
+{
+    pixman_implementation_t *imp =
+       _pixman_implementation_create (fallback, sse2_fast_paths);
+
+    /* SSE2 constants */
+    mask_565_r      = create_mask_2x32_128 (0x00f80000, 0x00f80000);
+    mask_565_g1     = create_mask_2x32_128 (0x00070000, 0x00070000);
+    mask_565_g2     = create_mask_2x32_128 (0x000000e0, 0x000000e0);
+    mask_565_b      = create_mask_2x32_128 (0x0000001f, 0x0000001f);
+    mask_red        = create_mask_2x32_128 (0x00f80000, 0x00f80000);
+    mask_green      = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
+    mask_blue       = create_mask_2x32_128 (0x000000f8, 0x000000f8);
+    mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
+    mask_565_fix_g  = create_mask_2x32_128 (0x0000c000, 0x0000c000);
+    mask_0080 = create_mask_16_128 (0x0080);
+    mask_00ff = create_mask_16_128 (0x00ff);
+    mask_0101 = create_mask_16_128 (0x0101);
+    mask_ffff = create_mask_16_128 (0xffff);
+    mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
+    mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
+
+    /* Set up function pointers */
+    imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
+    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
+    imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
+    imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
+    imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
+    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
+    imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
+    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
+    imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
+    imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
+
+    imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
+
+    imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
+    imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
+    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
+    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
+    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
+    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
+    imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
+
+    imp->blt = sse2_blt;
+    imp->fill = sse2_fill;
+
+    imp->src_iter_init = sse2_src_iter_init;
+
+    return imp;
+}