mmx: add add_0565_0565
[profile/ivi/pixman.git] / pixman / pixman-sse2.c
index 283e4c4..e217ca3 100644 (file)
 #include <config.h>
 #endif
 
-#include <mmintrin.h>
 #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
 #include <emmintrin.h> /* for SSE2 intrinsics */
 #include "pixman-private.h"
 #include "pixman-combine32.h"
-#include "pixman-fast-path.h"
-
-#if defined(_MSC_VER) && defined(_M_AMD64)
-/* Windows 64 doesn't allow MMX to be used, so
- * the pixman-x64-mmx-emulation.h file contains
- * implementations of those MMX intrinsics that
- * are used in the SSE2 implementation.
- */
-#   include "pixman-x64-mmx-emulation.h"
-#endif
-
-#ifdef USE_SSE2
-
-/* --------------------------------------------------------------------
- * Locals
- */
-
-static __m64 mask_x0080;
-static __m64 mask_x00ff;
-static __m64 mask_x0101;
-static __m64 mask_x_alpha;
-
-static __m64 mask_x565_rgb;
-static __m64 mask_x565_unpack;
+#include "pixman-inlines.h"
 
 static __m128i mask_0080;
 static __m128i mask_00ff;
@@ -77,9 +53,6 @@ static __m128i mask_blue;
 static __m128i mask_565_fix_rb;
 static __m128i mask_565_fix_g;
 
-/* ----------------------------------------------------------------------
- * SSE2 Inlines
- */
 static force_inline __m128i
 unpack_32_1x128 (uint32_t data)
 {
@@ -397,53 +370,18 @@ save_128_unaligned (__m128i* dst,
     _mm_storeu_si128 (dst, data);
 }
 
-/* ------------------------------------------------------------------
- * MMX inlines
- */
-
-static force_inline __m64
-load_32_1x64 (uint32_t data)
-{
-    return _mm_cvtsi32_si64 (data);
-}
-
 /* Place the 32-bit value `data` in the lowest 32 bits of a 128-bit
  * SSE2 register via _mm_cvtsi32_si128; that intrinsic zeroes the
  * remaining upper bits.
  */
 static force_inline __m128i
 load_32_1x128 (uint32_t data)
 {
     return _mm_cvtsi32_si128 (data);
 }
 
-static force_inline __m64
-unpack_32_1x64 (uint32_t data)
-{
-    return _mm_unpacklo_pi8 (load_32_1x64 (data), _mm_setzero_si64 ());
-}
-
-static force_inline __m64
-expand_alpha_1x64 (__m64 data)
-{
-    return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 3, 3, 3));
-}
-
-static force_inline __m64
-expand_alpha_rev_1x64 (__m64 data)
-{
-    return _mm_shuffle_pi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
-}
-
 /* Replicate the lowest 16-bit word of `data` into all four words of
  * the low 64 bits (_MM_SHUFFLE (0, 0, 0, 0) selects word 0 for every
  * slot); _mm_shufflelo_epi16 leaves the high 64 bits untouched.
  * NOTE(review): presumably word 0 holds the alpha of a "reversed"
  * unpacked pixel, as the name suggests -- confirm against callers.
  */
 static force_inline __m128i
 expand_alpha_rev_1x128 (__m128i data)
 {
     return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
 }
 
-static force_inline __m64
-expand_pixel_8_1x64 (uint8_t data)
-{
-    return _mm_shuffle_pi16 (
-       unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
-}
-
 static force_inline __m128i
 expand_pixel_8_1x128 (uint8_t data)
 {
@@ -451,15 +389,6 @@ expand_pixel_8_1x128 (uint8_t data)
        unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
 }
 
-static force_inline __m64
-pix_multiply_1x64 (__m64 data,
-                   __m64 alpha)
-{
-    return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (data, alpha),
-                                          mask_x0080),
-                           mask_x0101);
-}
-
 static force_inline __m128i
 pix_multiply_1x128 (__m128i data,
                    __m128i alpha)
@@ -469,18 +398,6 @@ pix_multiply_1x128 (__m128i data,
                            mask_0101);
 }
 
-static force_inline __m64
-pix_add_multiply_1x64 (__m64* src,
-                       __m64* alpha_dst,
-                       __m64* dst,
-                       __m64* alpha_src)
-{
-    __m64 t1 = pix_multiply_1x64 (*src, *alpha_dst);
-    __m64 t2 = pix_multiply_1x64 (*dst, *alpha_src);
-
-    return _mm_adds_pu8 (t1, t2);
-}
-
 static force_inline __m128i
 pix_add_multiply_1x128 (__m128i* src,
                        __m128i* alpha_dst,
@@ -493,50 +410,24 @@ pix_add_multiply_1x128 (__m128i* src,
     return _mm_adds_epu8 (t1, t2);
 }
 
-static force_inline __m64
-negate_1x64 (__m64 data)
-{
-    return _mm_xor_si64 (data, mask_x00ff);
-}
-
 /* XOR `data` with the file-level constant mask_00ff.
  * NOTE(review): mask_00ff's value is set outside this view; presumably
  * 0x00ff in every 16-bit lane, which would make this 255 - x for each
  * unpacked 8-bit channel -- confirm at its initialisation.
  */
 static force_inline __m128i
 negate_1x128 (__m128i data)
 {
     return _mm_xor_si128 (data, mask_00ff);
 }
 
-static force_inline __m64
-invert_colors_1x64 (__m64 data)
-{
-    return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
-}
-
 /* Swap 16-bit words 0 and 2 in the low 64 bits of `data`, keeping
  * words 1 and 3 in place (_MM_SHUFFLE (3, 0, 1, 2)); the high 64 bits
  * are passed through unchanged by _mm_shufflelo_epi16.
  * NOTE(review): on an unpacked pixel this looks like exchanging the
  * red and blue channels -- confirm the channel layout.
  */
 static force_inline __m128i
 invert_colors_1x128 (__m128i data)
 {
     return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
 }
 
-static force_inline __m64
-over_1x64 (__m64 src, __m64 alpha, __m64 dst)
-{
-    return _mm_adds_pu8 (src, pix_multiply_1x64 (dst, negate_1x64 (alpha)));
-}
-
 /* Return src + dst * negate (alpha): the product is formed by
  * pix_multiply_1x128 on the negate_1x128'd alpha, and the final sum is
  * a saturating unsigned 8-bit add (_mm_adds_epu8).
  * NOTE(review): this has the shape of the Porter-Duff OVER operator
  * for a premultiplied source -- confirm that `alpha` is expected to be
  * the source alpha already expanded across all channels.
  */
 static force_inline __m128i
 over_1x128 (__m128i src, __m128i alpha, __m128i dst)
 {
     return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
 }
 
-static force_inline __m64
-in_over_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst)
-{
-    return over_1x64 (pix_multiply_1x64 (*src, *mask),
-                      pix_multiply_1x64 (*alpha, *mask),
-                      *dst);
-}
-
 static force_inline __m128i
 in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
 {
@@ -545,17 +436,6 @@ in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
                       *dst);
 }
 
-static force_inline __m64
-over_rev_non_pre_1x64 (__m64 src, __m64 dst)
-{
-    __m64 alpha = expand_alpha_1x64 (src);
-
-    return over_1x64 (pix_multiply_1x64 (invert_colors_1x64 (src),
-                                         _mm_or_si64 (alpha, mask_x_alpha)),
-                      alpha,
-                      dst);
-}
-
 static force_inline __m128i
 over_rev_non_pre_1x128 (__m128i src, __m128i dst)
 {
@@ -568,50 +448,11 @@ over_rev_non_pre_1x128 (__m128i src, __m128i dst)
 }
 
 static force_inline uint32_t
-pack_1x64_32 (__m64 data)
-{
-    return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64 ()));
-}
-
-static force_inline uint32_t
 pack_1x128_32 (__m128i data)
 {
     /* Narrow the 16-bit words of `data` to unsigned bytes with
      * saturation (_mm_packus_epi16, zero register as second operand) and
      * return the low 32 bits of the packed result.
      */
     return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
 }
 
-/* Expand 16 bits positioned at @pos (0-3) of a mmx register into
- *
- *    00RR00GG00BB
- *
- * --- Expanding 565 in the low word ---
- *
- * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
- * m = m & (01f0003f001f);
- * m = m * (008404100840);
- * m = m >> 8;
- *
- * Note the trick here - the top word is shifted by another nibble to
- * avoid it bumping into the middle word
- */
-static force_inline __m64
-expand565_16_1x64 (uint16_t pixel)
-{
-    __m64 p;
-    __m64 t1, t2;
-
-    p = _mm_cvtsi32_si64 ((uint32_t) pixel);
-
-    t1 = _mm_slli_si64 (p, 36 - 11);
-    t2 = _mm_slli_si64 (p, 16 - 5);
-
-    p = _mm_or_si64 (t1, p);
-    p = _mm_or_si64 (t2, p);
-    p = _mm_and_si64 (p, mask_x565_rgb);
-    p = _mm_mullo_pi16 (p, mask_x565_unpack);
-
-    return _mm_srli_pi16 (p, 8);
-}
-
 static force_inline __m128i
 expand565_16_1x128 (uint16_t pixel)
 {
@@ -622,9 +463,6 @@ expand565_16_1x128 (uint16_t pixel)
     return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
 }
 
-/* ----------------------------------------------------------------------------
- * Compose Core transformations
- */
 static force_inline uint32_t
 core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
 {
@@ -860,10 +698,12 @@ core_combine_over_u_sse2_no_mask (uint32_t *        pd,
 }
 
 static force_inline void
-core_combine_over_u_sse2 (uint32_t*       pd,
-                          const uint32_t* ps,
-                          const uint32_t* pm,
-                          int             w)
+sse2_combine_over_u (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               pd,
+                     const uint32_t *         ps,
+                     const uint32_t *         pm,
+                     int                      w)
 {
     if (pm)
        core_combine_over_u_sse2_mask (pd, ps, pm, w);
@@ -871,11 +711,13 @@ core_combine_over_u_sse2 (uint32_t*       pd,
        core_combine_over_u_sse2_no_mask (pd, ps, w);
 }
 
-static force_inline void
-core_combine_over_reverse_u_sse2 (uint32_t*       pd,
-                                  const uint32_t* ps,
-                                  const uint32_t* pm,
-                                  int             w)
+static void
+sse2_combine_over_reverse_u (pixman_implementation_t *imp,
+                             pixman_op_t              op,
+                             uint32_t *               pd,
+                             const uint32_t *         ps,
+                             const uint32_t *         pm,
+                             int                      w)
 {
     uint32_t s, d;
 
@@ -959,11 +801,13 @@ core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst)
     return dst;
 }
 
-static force_inline void
-core_combine_in_u_sse2 (uint32_t*       pd,
-                        const uint32_t* ps,
-                        const uint32_t* pm,
-                        int             w)
+static void
+sse2_combine_in_u (pixman_implementation_t *imp,
+                   pixman_op_t              op,
+                   uint32_t *               pd,
+                   const uint32_t *         ps,
+                   const uint32_t *         pm,
+                   int                      w)
 {
     uint32_t s, d;
 
@@ -1018,11 +862,13 @@ core_combine_in_u_sse2 (uint32_t*       pd,
     }
 }
 
-static force_inline void
-core_combine_reverse_in_u_sse2 (uint32_t*       pd,
-                                const uint32_t* ps,
-                                const uint32_t *pm,
-                                int             w)
+static void
+sse2_combine_in_reverse_u (pixman_implementation_t *imp,
+                           pixman_op_t              op,
+                           uint32_t *               pd,
+                           const uint32_t *         ps,
+                           const uint32_t *         pm,
+                           int                      w)
 {
     uint32_t s, d;
 
@@ -1077,11 +923,13 @@ core_combine_reverse_in_u_sse2 (uint32_t*       pd,
     }
 }
 
-static force_inline void
-core_combine_reverse_out_u_sse2 (uint32_t*       pd,
-                                 const uint32_t* ps,
-                                 const uint32_t* pm,
-                                 int             w)
+static void
+sse2_combine_out_reverse_u (pixman_implementation_t *imp,
+                            pixman_op_t              op,
+                            uint32_t *               pd,
+                            const uint32_t *         ps,
+                            const uint32_t *         pm,
+                            int                      w)
 {
     while (w && ((unsigned long) pd & 15))
     {
@@ -1144,11 +992,13 @@ core_combine_reverse_out_u_sse2 (uint32_t*       pd,
     }
 }
 
-static force_inline void
-core_combine_out_u_sse2 (uint32_t*       pd,
-                         const uint32_t* ps,
-                         const uint32_t* pm,
-                         int             w)
+static void
+sse2_combine_out_u (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               pd,
+                    const uint32_t *         ps,
+                    const uint32_t *         pm,
+                    int                      w)
 {
     while (w && ((unsigned long) pd & 15))
     {
@@ -1222,11 +1072,13 @@ core_combine_atop_u_pixel_sse2 (uint32_t src,
     return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
 }
 
-static force_inline void
-core_combine_atop_u_sse2 (uint32_t*       pd,
-                          const uint32_t* ps,
-                          const uint32_t* pm,
-                          int             w)
+static void
+sse2_combine_atop_u (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               pd,
+                     const uint32_t *         ps,
+                     const uint32_t *         pm,
+                     int                      w)
 {
     uint32_t s, d;
 
@@ -1304,11 +1156,13 @@ core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
     return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
 }
 
-static force_inline void
-core_combine_reverse_atop_u_sse2 (uint32_t*       pd,
-                                  const uint32_t* ps,
-                                  const uint32_t* pm,
-                                  int             w)
+static void
+sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
+                             pixman_op_t              op,
+                             uint32_t *               pd,
+                             const uint32_t *         ps,
+                             const uint32_t *         pm,
+                             int                      w)
 {
     uint32_t s, d;
 
@@ -1386,11 +1240,13 @@ core_combine_xor_u_pixel_sse2 (uint32_t src,
     return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s));
 }
 
-static force_inline void
-core_combine_xor_u_sse2 (uint32_t*       dst,
-                         const uint32_t* src,
-                         const uint32_t *mask,
-                         int             width)
+static void
+sse2_combine_xor_u (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dst,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
 {
     int w = width;
     uint32_t s, d;
@@ -1462,10 +1318,12 @@ core_combine_xor_u_sse2 (uint32_t*       dst,
 }
 
 static force_inline void
-core_combine_add_u_sse2 (uint32_t*       dst,
-                         const uint32_t* src,
-                         const uint32_t* mask,
-                         int             width)
+sse2_combine_add_u (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dst,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
 {
     int w = width;
     uint32_t s, d;
@@ -1533,11 +1391,13 @@ core_combine_saturate_u_pixel_sse2 (uint32_t src,
     return pack_1x128_32 (_mm_adds_epu16 (md, ms));
 }
 
-static force_inline void
-core_combine_saturate_u_sse2 (uint32_t *      pd,
-                              const uint32_t *ps,
-                              const uint32_t *pm,
-                              int             w)
+static void
+sse2_combine_saturate_u (pixman_implementation_t *imp,
+                         pixman_op_t              op,
+                         uint32_t *               pd,
+                         const uint32_t *         ps,
+                         const uint32_t *         pm,
+                         int                      w)
 {
     uint32_t s, d;
 
@@ -1618,11 +1478,13 @@ core_combine_saturate_u_sse2 (uint32_t *      pd,
     }
 }
 
-static force_inline void
-core_combine_src_ca_sse2 (uint32_t*       pd,
-                          const uint32_t* ps,
-                          const uint32_t *pm,
-                          int             w)
+static void
+sse2_combine_src_ca (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               pd,
+                     const uint32_t *         ps,
+                     const uint32_t *         pm,
+                     int                      w)
 {
     uint32_t s, m;
 
@@ -1683,11 +1545,13 @@ core_combine_over_ca_pixel_sse2 (uint32_t src,
     return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst));
 }
 
-static force_inline void
-core_combine_over_ca_sse2 (uint32_t*       pd,
-                           const uint32_t* ps,
-                           const uint32_t *pm,
-                           int             w)
+static void
+sse2_combine_over_ca (pixman_implementation_t *imp,
+                      pixman_op_t              op,
+                      uint32_t *               pd,
+                      const uint32_t *         ps,
+                      const uint32_t *         pm,
+                      int                      w)
 {
     uint32_t s, m, d;
 
@@ -1757,11 +1621,13 @@ core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
                                        unpack_32_1x128 (mask))));
 }
 
-static force_inline void
-core_combine_over_reverse_ca_sse2 (uint32_t*       pd,
-                                   const uint32_t* ps,
-                                   const uint32_t *pm,
-                                   int             w)
+static void
+sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
+                              pixman_op_t              op,
+                              uint32_t *               pd,
+                              const uint32_t *         ps,
+                              const uint32_t *         pm,
+                              int                      w)
 {
     uint32_t s, m, d;
 
@@ -1820,11 +1686,13 @@ core_combine_over_reverse_ca_sse2 (uint32_t*       pd,
     }
 }
 
-static force_inline void
-core_combine_in_ca_sse2 (uint32_t *      pd,
-                         const uint32_t *ps,
-                         const uint32_t *pm,
-                         int             w)
+static void
+sse2_combine_in_ca (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               pd,
+                    const uint32_t *         ps,
+                    const uint32_t *         pm,
+                    int                      w)
 {
     uint32_t s, m, d;
 
@@ -1893,11 +1761,13 @@ core_combine_in_ca_sse2 (uint32_t *      pd,
     }
 }
 
-static force_inline void
-core_combine_in_reverse_ca_sse2 (uint32_t *      pd,
-                                 const uint32_t *ps,
-                                 const uint32_t *pm,
-                                 int             w)
+static void
+sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
+                            pixman_op_t              op,
+                            uint32_t *               pd,
+                            const uint32_t *         ps,
+                            const uint32_t *         pm,
+                            int                      w)
 {
     uint32_t s, m, d;
 
@@ -1964,11 +1834,13 @@ core_combine_in_reverse_ca_sse2 (uint32_t *      pd,
     }
 }
 
-static force_inline void
-core_combine_out_ca_sse2 (uint32_t *      pd,
-                          const uint32_t *ps,
-                          const uint32_t *pm,
-                          int             w)
+static void
+sse2_combine_out_ca (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               pd,
+                     const uint32_t *         ps,
+                     const uint32_t *         pm,
+                     int                      w)
 {
     uint32_t s, m, d;
 
@@ -2038,11 +1910,13 @@ core_combine_out_ca_sse2 (uint32_t *      pd,
     }
 }
 
-static force_inline void
-core_combine_out_reverse_ca_sse2 (uint32_t *      pd,
-                                  const uint32_t *ps,
-                                  const uint32_t *pm,
-                                  int             w)
+static void
+sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
+                             pixman_op_t              op,
+                             uint32_t *               pd,
+                             const uint32_t *         ps,
+                             const uint32_t *         pm,
+                             int                      w)
 {
     uint32_t s, m, d;
 
@@ -2132,11 +2006,13 @@ core_combine_atop_ca_pixel_sse2 (uint32_t src,
     return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
 }
 
-static force_inline void
-core_combine_atop_ca_sse2 (uint32_t *      pd,
-                           const uint32_t *ps,
-                           const uint32_t *pm,
-                           int             w)
+static void
+sse2_combine_atop_ca (pixman_implementation_t *imp,
+                      pixman_op_t              op,
+                      uint32_t *               pd,
+                      const uint32_t *         ps,
+                      const uint32_t *         pm,
+                      int                      w)
 {
     uint32_t s, m, d;
 
@@ -2223,11 +2099,13 @@ core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
     return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
 }
 
-static force_inline void
-core_combine_reverse_atop_ca_sse2 (uint32_t *      pd,
-                                   const uint32_t *ps,
-                                   const uint32_t *pm,
-                                   int             w)
+static void
+sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
+                              pixman_op_t              op,
+                              uint32_t *               pd,
+                              const uint32_t *         ps,
+                              const uint32_t *         pm,
+                              int                      w)
 {
     uint32_t s, m, d;
 
@@ -2317,11 +2195,13 @@ core_combine_xor_ca_pixel_sse2 (uint32_t src,
                                                 &alpha_src));
 }
 
-static force_inline void
-core_combine_xor_ca_sse2 (uint32_t *      pd,
-                          const uint32_t *ps,
-                          const uint32_t *pm,
-                          int             w)
+static void
+sse2_combine_xor_ca (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               pd,
+                     const uint32_t *         ps,
+                     const uint32_t *         pm,
+                     int                      w)
 {
     uint32_t s, m, d;
 
@@ -2393,11 +2273,13 @@ core_combine_xor_ca_sse2 (uint32_t *      pd,
     }
 }
 
-static force_inline void
-core_combine_add_ca_sse2 (uint32_t *      pd,
-                          const uint32_t *ps,
-                          const uint32_t *pm,
-                          int             w)
+static void
+sse2_combine_add_ca (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               pd,
+                     const uint32_t *         ps,
+                     const uint32_t *         pm,
+                     int                      w)
 {
     uint32_t s, m, d;
 
@@ -2432,351 +2314,55 @@ core_combine_add_ca_sse2 (uint32_t *      pd,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_src_lo, &xmm_src_hi);
 
-       save_128_aligned (
-           (__m128i*)pd, pack_2x128_128 (
-               _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
-               _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
-
-       ps += 4;
-       pd += 4;
-       pm += 4;
-       w -= 4;
-    }
-
-    while (w)
-    {
-       s = *ps++;
-       m = *pm++;
-       d = *pd;
-
-       *pd++ = pack_1x128_32 (
-           _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
-                                              unpack_32_1x128 (m)),
-                          unpack_32_1x128 (d)));
-       w--;
-    }
-}
-
-/* ---------------------------------------------------
- * fb_compose_setup_sSE2
- */
-static force_inline __m64
-create_mask_16_64 (uint16_t mask)
-{
-    return _mm_set1_pi16 (mask);
-}
-
-static force_inline __m128i
-create_mask_16_128 (uint16_t mask)
-{
-    return _mm_set1_epi16 (mask);
-}
-
-static force_inline __m64
-create_mask_2x32_64 (uint32_t mask0,
-                     uint32_t mask1)
-{
-    return _mm_set_pi32 (mask0, mask1);
-}
-
-/* Work around a code generation bug in Sun Studio 12. */
-#if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
-# define create_mask_2x32_128(mask0, mask1)                            \
-    (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
-#else
-static force_inline __m128i
-create_mask_2x32_128 (uint32_t mask0,
-                      uint32_t mask1)
-{
-    return _mm_set_epi32 (mask0, mask1, mask0, mask1);
-}
-#endif
-
-/* SSE2 code patch for fbcompose.c */
-
-static void
-sse2_combine_over_u (pixman_implementation_t *imp,
-                     pixman_op_t              op,
-                     uint32_t *               dst,
-                     const uint32_t *         src,
-                     const uint32_t *         mask,
-                     int                      width)
-{
-    core_combine_over_u_sse2 (dst, src, mask, width);
-    _mm_empty ();
-}
-
-static void
-sse2_combine_over_reverse_u (pixman_implementation_t *imp,
-                             pixman_op_t              op,
-                             uint32_t *               dst,
-                             const uint32_t *         src,
-                             const uint32_t *         mask,
-                             int                      width)
-{
-    core_combine_over_reverse_u_sse2 (dst, src, mask, width);
-    _mm_empty ();
-}
-
-static void
-sse2_combine_in_u (pixman_implementation_t *imp,
-                   pixman_op_t              op,
-                   uint32_t *               dst,
-                   const uint32_t *         src,
-                   const uint32_t *         mask,
-                   int                      width)
-{
-    core_combine_in_u_sse2 (dst, src, mask, width);
-    _mm_empty ();
-}
-
-static void
-sse2_combine_in_reverse_u (pixman_implementation_t *imp,
-                           pixman_op_t              op,
-                           uint32_t *               dst,
-                           const uint32_t *         src,
-                           const uint32_t *         mask,
-                           int                      width)
-{
-    core_combine_reverse_in_u_sse2 (dst, src, mask, width);
-    _mm_empty ();
-}
-
-static void
-sse2_combine_out_u (pixman_implementation_t *imp,
-                    pixman_op_t              op,
-                    uint32_t *               dst,
-                    const uint32_t *         src,
-                    const uint32_t *         mask,
-                    int                      width)
-{
-    core_combine_out_u_sse2 (dst, src, mask, width);
-    _mm_empty ();
-}
-
-static void
-sse2_combine_out_reverse_u (pixman_implementation_t *imp,
-                            pixman_op_t              op,
-                            uint32_t *               dst,
-                            const uint32_t *         src,
-                            const uint32_t *         mask,
-                            int                      width)
-{
-    core_combine_reverse_out_u_sse2 (dst, src, mask, width);
-    _mm_empty ();
-}
-
-static void
-sse2_combine_atop_u (pixman_implementation_t *imp,
-                     pixman_op_t              op,
-                     uint32_t *               dst,
-                     const uint32_t *         src,
-                     const uint32_t *         mask,
-                     int                      width)
-{
-    core_combine_atop_u_sse2 (dst, src, mask, width);
-    _mm_empty ();
-}
-
-static void
-sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
-                             pixman_op_t              op,
-                             uint32_t *               dst,
-                             const uint32_t *         src,
-                             const uint32_t *         mask,
-                             int                      width)
-{
-    core_combine_reverse_atop_u_sse2 (dst, src, mask, width);
-    _mm_empty ();
-}
-
-static void
-sse2_combine_xor_u (pixman_implementation_t *imp,
-                    pixman_op_t              op,
-                    uint32_t *               dst,
-                    const uint32_t *         src,
-                    const uint32_t *         mask,
-                    int                      width)
-{
-    core_combine_xor_u_sse2 (dst, src, mask, width);
-    _mm_empty ();
-}
-
-static void
-sse2_combine_add_u (pixman_implementation_t *imp,
-                    pixman_op_t              op,
-                    uint32_t *               dst,
-                    const uint32_t *         src,
-                    const uint32_t *         mask,
-                    int                      width)
-{
-    core_combine_add_u_sse2 (dst, src, mask, width);
-    _mm_empty ();
-}
-
-static void
-sse2_combine_saturate_u (pixman_implementation_t *imp,
-                         pixman_op_t              op,
-                         uint32_t *               dst,
-                         const uint32_t *         src,
-                         const uint32_t *         mask,
-                         int                      width)
-{
-    core_combine_saturate_u_sse2 (dst, src, mask, width);
-    _mm_empty ();
-}
-
-static void
-sse2_combine_src_ca (pixman_implementation_t *imp,
-                     pixman_op_t              op,
-                     uint32_t *               dst,
-                     const uint32_t *         src,
-                     const uint32_t *         mask,
-                     int                      width)
-{
-    core_combine_src_ca_sse2 (dst, src, mask, width);
-    _mm_empty ();
-}
-
-static void
-sse2_combine_over_ca (pixman_implementation_t *imp,
-                      pixman_op_t              op,
-                      uint32_t *               dst,
-                      const uint32_t *         src,
-                      const uint32_t *         mask,
-                      int                      width)
-{
-    core_combine_over_ca_sse2 (dst, src, mask, width);
-    _mm_empty ();
-}
-
-static void
-sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
-                              pixman_op_t              op,
-                              uint32_t *               dst,
-                              const uint32_t *         src,
-                              const uint32_t *         mask,
-                              int                      width)
-{
-    core_combine_over_reverse_ca_sse2 (dst, src, mask, width);
-    _mm_empty ();
-}
-
-static void
-sse2_combine_in_ca (pixman_implementation_t *imp,
-                    pixman_op_t              op,
-                    uint32_t *               dst,
-                    const uint32_t *         src,
-                    const uint32_t *         mask,
-                    int                      width)
-{
-    core_combine_in_ca_sse2 (dst, src, mask, width);
-    _mm_empty ();
-}
-
-static void
-sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
-                            pixman_op_t              op,
-                            uint32_t *               dst,
-                            const uint32_t *         src,
-                            const uint32_t *         mask,
-                            int                      width)
-{
-    core_combine_in_reverse_ca_sse2 (dst, src, mask, width);
-    _mm_empty ();
-}
-
-static void
-sse2_combine_out_ca (pixman_implementation_t *imp,
-                     pixman_op_t              op,
-                     uint32_t *               dst,
-                     const uint32_t *         src,
-                     const uint32_t *         mask,
-                     int                      width)
-{
-    core_combine_out_ca_sse2 (dst, src, mask, width);
-    _mm_empty ();
-}
+       save_128_aligned (
+           (__m128i*)pd, pack_2x128_128 (
+               _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
+               _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
 
-static void
-sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
-                             pixman_op_t              op,
-                             uint32_t *               dst,
-                             const uint32_t *         src,
-                             const uint32_t *         mask,
-                             int                      width)
-{
-    core_combine_out_reverse_ca_sse2 (dst, src, mask, width);
-    _mm_empty ();
-}
+       ps += 4;
+       pd += 4;
+       pm += 4;
+       w -= 4;
+    }
 
-static void
-sse2_combine_atop_ca (pixman_implementation_t *imp,
-                      pixman_op_t              op,
-                      uint32_t *               dst,
-                      const uint32_t *         src,
-                      const uint32_t *         mask,
-                      int                      width)
-{
-    core_combine_atop_ca_sse2 (dst, src, mask, width);
-    _mm_empty ();
-}
+    while (w)
+    {
+       s = *ps++;
+       m = *pm++;
+       d = *pd;
 
-static void
-sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
-                              pixman_op_t              op,
-                              uint32_t *               dst,
-                              const uint32_t *         src,
-                              const uint32_t *         mask,
-                              int                      width)
-{
-    core_combine_reverse_atop_ca_sse2 (dst, src, mask, width);
-    _mm_empty ();
+       *pd++ = pack_1x128_32 (
+           _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
+                                              unpack_32_1x128 (m)),
+                          unpack_32_1x128 (d)));
+       w--;
+    }
 }
 
-static void
-sse2_combine_xor_ca (pixman_implementation_t *imp,
-                     pixman_op_t              op,
-                     uint32_t *               dst,
-                     const uint32_t *         src,
-                     const uint32_t *         mask,
-                     int                      width)
+static force_inline __m128i
+create_mask_16_128 (uint16_t mask)
 {
-    core_combine_xor_ca_sse2 (dst, src, mask, width);
-    _mm_empty ();
+    return _mm_set1_epi16 (mask);
 }
 
-static void
-sse2_combine_add_ca (pixman_implementation_t *imp,
-                     pixman_op_t              op,
-                     uint32_t *               dst,
-                     const uint32_t *         src,
-                     const uint32_t *         mask,
-                     int                      width)
+/* Work around a code generation bug in Sun Studio 12. */
+#if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
+# define create_mask_2x32_128(mask0, mask1)                            \
+    (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
+#else
+static force_inline __m128i
+create_mask_2x32_128 (uint32_t mask0,
+                      uint32_t mask1)
 {
-    core_combine_add_ca_sse2 (dst, src, mask, width);
-    _mm_empty ();
+    return _mm_set_epi32 (mask0, mask1, mask0, mask1);
 }
-
-/* -------------------------------------------------------------------
- * composite_over_n_8888
- */
+#endif
 
 static void
 sse2_composite_over_n_8888 (pixman_implementation_t *imp,
-                            pixman_op_t              op,
-                            pixman_image_t *         src_image,
-                            pixman_image_t *         mask_image,
-                            pixman_image_t *         dst_image,
-                            int32_t                  src_x,
-                            int32_t                  src_y,
-                            int32_t                  mask_x,
-                            int32_t                  mask_y,
-                            int32_t                  dest_x,
-                            int32_t                  dest_y,
-                            int32_t                  width,
-                            int32_t                  height)
+                            pixman_composite_info_t *info)
 {
+    PIXMAN_COMPOSITE_ARGS (info);
     uint32_t src;
     uint32_t    *dst_line, *dst, d;
     int32_t w;
@@ -2784,13 +2370,13 @@ sse2_composite_over_n_8888 (pixman_implementation_t *imp,
     __m128i xmm_src, xmm_alpha;
     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
 
-    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
 
     if (src == 0)
        return;
 
     PIXMAN_IMAGE_GET_LINE (
-       dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+       dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
 
     xmm_src = expand_pixel_32_1x128 (src);
     xmm_alpha = expand_alpha_1x128 (xmm_src);
@@ -2839,27 +2425,13 @@ sse2_composite_over_n_8888 (pixman_implementation_t *imp,
        }
 
     }
-    _mm_empty ();
 }
 
-/* ---------------------------------------------------------------------
- * composite_over_n_0565
- */
 static void
 sse2_composite_over_n_0565 (pixman_implementation_t *imp,
-                            pixman_op_t              op,
-                            pixman_image_t *         src_image,
-                            pixman_image_t *         mask_image,
-                            pixman_image_t *         dst_image,
-                            int32_t                  src_x,
-                            int32_t                  src_y,
-                            int32_t                  mask_x,
-                            int32_t                  mask_y,
-                            int32_t                  dest_x,
-                            int32_t                  dest_y,
-                            int32_t                  width,
-                            int32_t                  height)
+                            pixman_composite_info_t *info)
 {
+    PIXMAN_COMPOSITE_ARGS (info);
     uint32_t src;
     uint16_t    *dst_line, *dst, d;
     int32_t w;
@@ -2867,13 +2439,13 @@ sse2_composite_over_n_0565 (pixman_implementation_t *imp,
     __m128i xmm_src, xmm_alpha;
     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
 
-    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
 
     if (src == 0)
        return;
 
     PIXMAN_IMAGE_GET_LINE (
-       dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+       dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
 
     xmm_src = expand_pixel_32_1x128 (src);
     xmm_alpha = expand_alpha_1x128 (xmm_src);
@@ -2928,55 +2500,38 @@ sse2_composite_over_n_0565 (pixman_implementation_t *imp,
        }
     }
 
-    _mm_empty ();
 }
 
-/* ------------------------------
- * composite_add_n_8888_8888_ca
- */
 static void
 sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
-                                  pixman_op_t              op,
-                                  pixman_image_t *         src_image,
-                                  pixman_image_t *         mask_image,
-                                  pixman_image_t *         dst_image,
-                                  int32_t                  src_x,
-                                  int32_t                  src_y,
-                                  int32_t                  mask_x,
-                                  int32_t                  mask_y,
-                                  int32_t                  dest_x,
-                                  int32_t                  dest_y,
-                                  int32_t                  width,
-                                  int32_t                  height)
+                                  pixman_composite_info_t *info)
 {
-    uint32_t src, srca;
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src;
     uint32_t    *dst_line, d;
     uint32_t    *mask_line, m;
     uint32_t pack_cmp;
     int dst_stride, mask_stride;
 
-    __m128i xmm_src, xmm_alpha;
+    __m128i xmm_src;
     __m128i xmm_dst;
     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
 
-    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+    __m128i mmx_src, mmx_mask, mmx_dest;
 
-    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
-    srca = src >> 24;
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
 
     if (src == 0)
        return;
 
     PIXMAN_IMAGE_GET_LINE (
-       dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+       dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
     PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
 
     xmm_src = _mm_unpacklo_epi8 (
        create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
-    xmm_alpha = expand_alpha_1x128 (xmm_src);
     mmx_src   = xmm_src;
-    mmx_alpha = xmm_alpha;
 
     while (height--)
     {
@@ -2999,7 +2554,8 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
                mmx_dest = unpack_32_1x128 (d);
 
                *pd = pack_1x128_32 (
-                   _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src), mmx_dest));
+                   _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
+                                  mmx_dest));
            }
 
            pd++;
@@ -3047,7 +2603,8 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
                mmx_dest = unpack_32_1x128 (d);
 
                *pd = pack_1x128_32 (
-                   _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src), mmx_dest));
+                   _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
+                                  mmx_dest));
            }
 
            pd++;
@@ -3055,28 +2612,13 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
        }
     }
 
-    _mm_empty ();
 }
 
-/* ---------------------------------------------------------------------------
- * composite_over_n_8888_8888_ca
- */
-
 static void
 sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
-                                    pixman_op_t              op,
-                                    pixman_image_t *         src_image,
-                                    pixman_image_t *         mask_image,
-                                    pixman_image_t *         dst_image,
-                                    int32_t                  src_x,
-                                    int32_t                  src_y,
-                                    int32_t                  mask_x,
-                                    int32_t                  mask_y,
-                                    int32_t                  dest_x,
-                                    int32_t                  dest_y,
-                                    int32_t                  width,
-                                    int32_t                  height)
+                                    pixman_composite_info_t *info)
 {
+    PIXMAN_COMPOSITE_ARGS (info);
     uint32_t src;
     uint32_t    *dst_line, d;
     uint32_t    *mask_line, m;
@@ -3089,13 +2631,13 @@ sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
 
     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
 
-    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
 
     if (src == 0)
        return;
 
     PIXMAN_IMAGE_GET_LINE (
-       dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+       dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
     PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
 
@@ -3183,28 +2725,13 @@ sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
        }
     }
 
-    _mm_empty ();
 }
 
-/*---------------------------------------------------------------------
- * composite_over_8888_n_8888
- */
-
 static void
 sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
-                                 pixman_op_t              op,
-                                 pixman_image_t *         src_image,
-                                 pixman_image_t *         mask_image,
-                                 pixman_image_t *         dst_image,
-                                 int32_t                  src_x,
-                                 int32_t                  src_y,
-                                 int32_t                  mask_x,
-                                 int32_t                  mask_y,
-                                 int32_t                  dest_x,
-                                 int32_t                  dest_y,
-                                 int32_t                  width,
-                                 int32_t                  height)
+                                 pixman_composite_info_t *info)
 {
+    PIXMAN_COMPOSITE_ARGS (info);
     uint32_t    *dst_line, *dst;
     uint32_t    *src_line, *src;
     uint32_t mask;
@@ -3217,7 +2744,7 @@ sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
     __m128i xmm_alpha_lo, xmm_alpha_hi;
 
     PIXMAN_IMAGE_GET_LINE (
-       dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+       dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
     PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
 
@@ -3302,28 +2829,13 @@ sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
        }
     }
 
-    _mm_empty ();
 }
 
-/*---------------------------------------------------------------------
- * composite_over_8888_n_8888
- */
-
 static void
 sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
-                             pixman_op_t              op,
-                             pixman_image_t *         src_image,
-                             pixman_image_t *         mask_image,
-                             pixman_image_t *         dst_image,
-                             int32_t                  src_x,
-                             int32_t                  src_y,
-                             int32_t                  mask_x,
-                             int32_t                  mask_y,
-                             int32_t                  dest_x,
-                             int32_t                  dest_y,
-                             int32_t                  width,
-                             int32_t                  height)
+                             pixman_composite_info_t *info)
 {
+    PIXMAN_COMPOSITE_ARGS (info);
     uint32_t    *dst_line, *dst;
     uint32_t    *src_line, *src;
     int32_t w;
@@ -3331,7 +2843,7 @@ sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
 
 
     PIXMAN_IMAGE_GET_LINE (
-       dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+       dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
     PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
 
@@ -3375,27 +2887,13 @@ sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
        }
     }
 
-    _mm_empty ();
 }
 
-/* ---------------------------------------------------------------------
- * composite_over_x888_n_8888
- */
 static void
 sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
-                                 pixman_op_t              op,
-                                 pixman_image_t *         src_image,
-                                 pixman_image_t *         mask_image,
-                                 pixman_image_t *         dst_image,
-                                 int32_t                  src_x,
-                                 int32_t                  src_y,
-                                 int32_t                  mask_x,
-                                 int32_t                  mask_y,
-                                 int32_t                  dest_x,
-                                 int32_t                  dest_y,
-                                 int32_t                  width,
-                                 int32_t                  height)
+                                 pixman_composite_info_t *info)
 {
+    PIXMAN_COMPOSITE_ARGS (info);
     uint32_t    *dst_line, *dst;
     uint32_t    *src_line, *src;
     uint32_t mask;
@@ -3407,7 +2905,7 @@ sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
 
     PIXMAN_IMAGE_GET_LINE (
-       dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+       dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
     PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
 
@@ -3480,33 +2978,19 @@ sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
        }
     }
 
-    _mm_empty ();
 }
 
-/* --------------------------------------------------------------------
- * composite_over_8888_8888
- */
 static void
 sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
-                               pixman_op_t              op,
-                               pixman_image_t *         src_image,
-                               pixman_image_t *         mask_image,
-                               pixman_image_t *         dst_image,
-                               int32_t                  src_x,
-                               int32_t                  src_y,
-                               int32_t                  mask_x,
-                               int32_t                  mask_y,
-                               int32_t                  dest_x,
-                               int32_t                  dest_y,
-                               int32_t                  width,
-                               int32_t                  height)
+                               pixman_composite_info_t *info)
 {
+    PIXMAN_COMPOSITE_ARGS (info);
     int dst_stride, src_stride;
     uint32_t    *dst_line, *dst;
     uint32_t    *src_line, *src;
 
     PIXMAN_IMAGE_GET_LINE (
-       dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+       dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
     PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
 
@@ -3515,17 +2999,13 @@ sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
 
     while (height--)
     {
-       core_combine_over_u_sse2 (dst, src, NULL, width);
+       sse2_combine_over_u (imp, op, dst, src, NULL, width);
 
        dst += dst_stride;
        src += src_stride;
     }
-    _mm_empty ();
 }
 
-/* ------------------------------------------------------------------
- * composite_over_8888_0565
- */
 static force_inline uint16_t
 composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
 {
@@ -3540,19 +3020,9 @@ composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
 
 static void
 sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
-                               pixman_op_t              op,
-                               pixman_image_t *         src_image,
-                               pixman_image_t *         mask_image,
-                               pixman_image_t *         dst_image,
-                               int32_t                  src_x,
-                               int32_t                  src_y,
-                               int32_t                  mask_x,
-                               int32_t                  mask_y,
-                               int32_t                  dest_x,
-                               int32_t                  dest_y,
-                               int32_t                  width,
-                               int32_t                  height)
+                               pixman_composite_info_t *info)
 {
+    PIXMAN_COMPOSITE_ARGS (info);
     uint16_t    *dst_line, *dst, d;
     uint32_t    *src_line, *src, s;
     int dst_stride, src_stride;
@@ -3563,19 +3033,10 @@ sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
 
     PIXMAN_IMAGE_GET_LINE (
-       dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+       dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
     PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
 
-#if 0
-    /* FIXME
-     *
-     * I copy the code from MMX one and keep the fixme.
-     * If it's a problem there, probably is a problem here.
-     */
-    assert (src_image->drawable == mask_image->drawable);
-#endif
-
     while (height--)
     {
        dst = dst_line;
@@ -3648,28 +3109,13 @@ sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
        }
     }
 
-    _mm_empty ();
 }
 
-/* -----------------------------------------------------------------
- * composite_over_n_8_8888
- */
-
 static void
 sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
-                              pixman_op_t              op,
-                              pixman_image_t *         src_image,
-                              pixman_image_t *         mask_image,
-                              pixman_image_t *         dst_image,
-                              int32_t                  src_x,
-                              int32_t                  src_y,
-                              int32_t                  mask_x,
-                              int32_t                  mask_y,
-                              int32_t                  dest_x,
-                              int32_t                  dest_y,
-                              int32_t                  width,
-                              int32_t                  height)
+                              pixman_composite_info_t *info)
 {
+    PIXMAN_COMPOSITE_ARGS (info);
     uint32_t src, srca;
     uint32_t *dst_line, *dst;
     uint8_t *mask_line, *mask;
@@ -3683,14 +3129,14 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
 
     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
 
-    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
 
     srca = src >> 24;
     if (src == 0)
        return;
 
     PIXMAN_IMAGE_GET_LINE (
-       dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+       dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
     PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
 
@@ -3784,14 +3230,9 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
        }
     }
 
-    _mm_empty ();
 }
 
-/* ----------------------------------------------------------------
- * composite_over_n_8_8888
- */
-
-pixman_bool_t
+static pixman_bool_t
 pixman_fill_sse2 (uint32_t *bits,
                   int       stride,
                   int       bpp,
@@ -3850,7 +3291,7 @@ pixman_fill_sse2 (uint32_t *bits,
        byte_line += stride;
        w = byte_width;
 
-       while (w >= 1 && ((unsigned long)d & 1))
+       if (w >= 1 && ((unsigned long)d & 1))
        {
            *(uint8_t *)d = data;
            w -= 1;
@@ -3938,25 +3379,14 @@ pixman_fill_sse2 (uint32_t *bits,
        }
     }
 
-    _mm_empty ();
     return TRUE;
 }
 
 static void
 sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
-                             pixman_op_t              op,
-                             pixman_image_t *         src_image,
-                             pixman_image_t *         mask_image,
-                             pixman_image_t *         dst_image,
-                             int32_t                  src_x,
-                             int32_t                  src_y,
-                             int32_t                  mask_x,
-                             int32_t                  mask_y,
-                             int32_t                  dest_x,
-                             int32_t                  dest_y,
-                             int32_t                  width,
-                             int32_t                  height)
+                             pixman_composite_info_t *info)
 {
+    PIXMAN_COMPOSITE_ARGS (info);
     uint32_t src, srca;
     uint32_t    *dst_line, *dst;
     uint8_t     *mask_line, *mask;
@@ -3967,19 +3397,19 @@ sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
     __m128i xmm_src, xmm_def;
     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
 
-    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
 
     srca = src >> 24;
     if (src == 0)
     {
-       pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride,
-                         PIXMAN_FORMAT_BPP (dst_image->bits.format),
+       pixman_fill_sse2 (dest_image->bits.bits, dest_image->bits.rowstride,
+                         PIXMAN_FORMAT_BPP (dest_image->bits.format),
                          dest_x, dest_y, width, height, 0);
        return;
     }
 
     PIXMAN_IMAGE_GET_LINE (
-       dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+       dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
     PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
 
@@ -4068,29 +3498,14 @@ sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
        }
     }
 
-    _mm_empty ();
 }
 
-/*-----------------------------------------------------------------------
- * composite_over_n_8_0565
- */
-
 static void
 sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
-                              pixman_op_t              op,
-                              pixman_image_t *         src_image,
-                              pixman_image_t *         mask_image,
-                              pixman_image_t *         dst_image,
-                              int32_t                  src_x,
-                              int32_t                  src_y,
-                              int32_t                  mask_x,
-                              int32_t                  mask_y,
-                              int32_t                  dest_x,
-                              int32_t                  dest_y,
-                              int32_t                  width,
-                              int32_t                  height)
+                              pixman_composite_info_t *info)
 {
-    uint32_t src, srca;
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src;
     uint16_t    *dst_line, *dst, d;
     uint8_t     *mask_line, *mask;
     int dst_stride, mask_stride;
@@ -4102,14 +3517,13 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
 
-    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
 
-    srca = src >> 24;
     if (src == 0)
        return;
 
     PIXMAN_IMAGE_GET_LINE (
-       dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+       dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
     PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
 
@@ -4220,28 +3634,13 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
        }
     }
 
-    _mm_empty ();
 }
 
-/* -----------------------------------------------------------------------
- * composite_over_pixbuf_0565
- */
-
 static void
 sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
-                                 pixman_op_t              op,
-                                 pixman_image_t *         src_image,
-                                 pixman_image_t *         mask_image,
-                                 pixman_image_t *         dst_image,
-                                 int32_t                  src_x,
-                                 int32_t                  src_y,
-                                 int32_t                  mask_x,
-                                 int32_t                  mask_y,
-                                 int32_t                  dest_x,
-                                 int32_t                  dest_y,
-                                 int32_t                  width,
-                                 int32_t                  height)
+                                 pixman_composite_info_t *info)
 {
+    PIXMAN_COMPOSITE_ARGS (info);
     uint16_t    *dst_line, *dst, d;
     uint32_t    *src_line, *src, s;
     int dst_stride, src_stride;
@@ -4253,19 +3652,10 @@ sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
 
     PIXMAN_IMAGE_GET_LINE (
-       dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+       dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
     PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
 
-#if 0
-    /* FIXME
-     *
-     * I copy the code from MMX one and keep the fixme.
-     * If it's a problem there, probably is a problem here.
-     */
-    assert (src_image->drawable == mask_image->drawable);
-#endif
-
     while (height--)
     {
        dst = dst_line;
@@ -4354,28 +3744,13 @@ sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
        }
     }
 
-    _mm_empty ();
 }
 
-/* -------------------------------------------------------------------------
- * composite_over_pixbuf_8888
- */
-
 static void
 sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
-                                 pixman_op_t              op,
-                                 pixman_image_t *         src_image,
-                                 pixman_image_t *         mask_image,
-                                 pixman_image_t *         dst_image,
-                                 int32_t                  src_x,
-                                 int32_t                  src_y,
-                                 int32_t                  mask_x,
-                                 int32_t                  mask_y,
-                                 int32_t                  dest_x,
-                                 int32_t                  dest_y,
-                                 int32_t                  width,
-                                 int32_t                  height)
+                                 pixman_composite_info_t *info)
 {
+    PIXMAN_COMPOSITE_ARGS (info);
     uint32_t    *dst_line, *dst, d;
     uint32_t    *src_line, *src, s;
     int dst_stride, src_stride;
@@ -4386,19 +3761,10 @@ sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
     __m128i xmm_dst_lo, xmm_dst_hi;
 
     PIXMAN_IMAGE_GET_LINE (
-       dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+       dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
     PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
 
-#if 0
-    /* FIXME
-     *
-     * I copy the code from MMX one and keep the fixme.
-     * If it's a problem there, probably is a problem here.
-     */
-    assert (src_image->drawable == mask_image->drawable);
-#endif
-
     while (height--)
     {
        dst = dst_line;
@@ -4467,28 +3833,13 @@ sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
        }
     }
 
-    _mm_empty ();
 }
 
-/* -------------------------------------------------------------------------------------------------
- * composite_over_n_8888_0565_ca
- */
-
 static void
 sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
-                                    pixman_op_t              op,
-                                    pixman_image_t *         src_image,
-                                    pixman_image_t *         mask_image,
-                                    pixman_image_t *         dst_image,
-                                    int32_t                  src_x,
-                                    int32_t                  src_y,
-                                    int32_t                  mask_x,
-                                    int32_t                  mask_y,
-                                    int32_t                  dest_x,
-                                    int32_t                  dest_y,
-                                    int32_t                  width,
-                                    int32_t                  height)
+                                    pixman_composite_info_t *info)
 {
+    PIXMAN_COMPOSITE_ARGS (info);
     uint32_t src;
     uint16_t    *dst_line, *dst, d;
     uint32_t    *mask_line, *mask, m;
@@ -4502,13 +3853,13 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
 
     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
 
-    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
 
     if (src == 0)
        return;
 
     PIXMAN_IMAGE_GET_LINE (
-       dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+       dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
     PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
 
@@ -4616,34 +3967,18 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
        }
     }
 
-    _mm_empty ();
 }
 
-/* -----------------------------------------------------------------------
- * composite_in_n_8_8
- */
-
 static void
 sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
-                         pixman_op_t              op,
-                         pixman_image_t *         src_image,
-                         pixman_image_t *         mask_image,
-                         pixman_image_t *         dst_image,
-                         int32_t                  src_x,
-                         int32_t                  src_y,
-                         int32_t                  mask_x,
-                         int32_t                  mask_y,
-                         int32_t                  dest_x,
-                         int32_t                  dest_y,
-                         int32_t                  width,
-                         int32_t                  height)
+                         pixman_composite_info_t *info)
 {
+    PIXMAN_COMPOSITE_ARGS (info);
     uint8_t     *dst_line, *dst;
     uint8_t     *mask_line, *mask;
     int dst_stride, mask_stride;
     uint32_t d, m;
     uint32_t src;
-    uint8_t sa;
     int32_t w;
 
     __m128i xmm_alpha;
@@ -4651,13 +3986,11 @@ sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
 
     PIXMAN_IMAGE_GET_LINE (
-       dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+       dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
     PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
 
-    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
-
-    sa = src >> 24;
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
 
     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
 
@@ -4720,28 +4053,13 @@ sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
        }
     }
 
-    _mm_empty ();
 }
 
-/* -----------------------------------------------------------------------
- * composite_in_n_8
- */
-
 static void
 sse2_composite_in_n_8 (pixman_implementation_t *imp,
-                      pixman_op_t              op,
-                      pixman_image_t *         src_image,
-                      pixman_image_t *         mask_image,
-                      pixman_image_t *         dst_image,
-                      int32_t                  src_x,
-                      int32_t                  src_y,
-                      int32_t                  mask_x,
-                      int32_t                  mask_y,
-                      int32_t                  dest_x,
-                      int32_t                  dest_y,
-                      int32_t                  width,
-                      int32_t                  height)
+                      pixman_composite_info_t *info)
 {
+    PIXMAN_COMPOSITE_ARGS (info);
     uint8_t     *dst_line, *dst;
     int dst_stride;
     uint32_t d;
@@ -4752,9 +4070,9 @@ sse2_composite_in_n_8 (pixman_implementation_t *imp,
     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
 
     PIXMAN_IMAGE_GET_LINE (
-       dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+       dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
 
-    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
 
     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
 
@@ -4765,7 +4083,7 @@ sse2_composite_in_n_8 (pixman_implementation_t *imp,
 
     if (src == 0x00)
     {
-       pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
+       pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
                     8, dest_x, dest_y, width, height, src);
 
        return;
@@ -4817,28 +4135,13 @@ sse2_composite_in_n_8 (pixman_implementation_t *imp,
        }
     }
 
-    _mm_empty ();
 }
 
-/* ---------------------------------------------------------------------------
- * composite_in_8_8
- */
-
 static void
 sse2_composite_in_8_8 (pixman_implementation_t *imp,
-                       pixman_op_t              op,
-                       pixman_image_t *         src_image,
-                       pixman_image_t *         mask_image,
-                       pixman_image_t *         dst_image,
-                       int32_t                  src_x,
-                       int32_t                  src_y,
-                       int32_t                  mask_x,
-                       int32_t                  mask_y,
-                       int32_t                  dest_x,
-                       int32_t                  dest_y,
-                       int32_t                  width,
-                       int32_t                  height)
+                       pixman_composite_info_t *info)
 {
+    PIXMAN_COMPOSITE_ARGS (info);
     uint8_t     *dst_line, *dst;
     uint8_t     *src_line, *src;
     int src_stride, dst_stride;
@@ -4849,7 +4152,7 @@ sse2_composite_in_8_8 (pixman_implementation_t *imp,
     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
 
     PIXMAN_IMAGE_GET_LINE (
-       dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+       dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
     PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
 
@@ -4903,34 +4206,18 @@ sse2_composite_in_8_8 (pixman_implementation_t *imp,
        }
     }
 
-    _mm_empty ();
 }
 
-/* -------------------------------------------------------------------------
- * composite_add_n_8_8
- */
-
 static void
 sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
-                         pixman_op_t              op,
-                         pixman_image_t *         src_image,
-                         pixman_image_t *         mask_image,
-                         pixman_image_t *         dst_image,
-                         int32_t                  src_x,
-                         int32_t                  src_y,
-                         int32_t                  mask_x,
-                         int32_t                  mask_y,
-                         int32_t                  dest_x,
-                         int32_t                  dest_y,
-                         int32_t                  width,
-                         int32_t                  height)
+                         pixman_composite_info_t *info)
 {
+    PIXMAN_COMPOSITE_ARGS (info);
     uint8_t     *dst_line, *dst;
     uint8_t     *mask_line, *mask;
     int dst_stride, mask_stride;
     int32_t w;
     uint32_t src;
-    uint8_t sa;
     uint32_t m, d;
 
     __m128i xmm_alpha;
@@ -4938,13 +4225,11 @@ sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
 
     PIXMAN_IMAGE_GET_LINE (
-       dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+       dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
     PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
 
-    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
-
-    sa = src >> 24;
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
 
     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
 
@@ -5007,28 +4292,13 @@ sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
        }
     }
 
-    _mm_empty ();
 }
 
-/* -------------------------------------------------------------------------
- * composite_add_n_8_8
- */
-
 static void
 sse2_composite_add_n_8 (pixman_implementation_t *imp,
-                       pixman_op_t              op,
-                       pixman_image_t *         src_image,
-                       pixman_image_t *         mask_image,
-                       pixman_image_t *         dst_image,
-                       int32_t                  src_x,
-                       int32_t                  src_y,
-                       int32_t                  mask_x,
-                       int32_t                  mask_y,
-                       int32_t                  dest_x,
-                       int32_t                  dest_y,
-                       int32_t                  width,
-                       int32_t                  height)
+                       pixman_composite_info_t *info)
 {
+    PIXMAN_COMPOSITE_ARGS (info);
     uint8_t     *dst_line, *dst;
     int dst_stride;
     int32_t w;
@@ -5037,9 +4307,9 @@ sse2_composite_add_n_8 (pixman_implementation_t *imp,
     __m128i xmm_src;
 
     PIXMAN_IMAGE_GET_LINE (
-       dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+       dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
 
-    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
 
     src >>= 24;
 
@@ -5048,7 +4318,7 @@ sse2_composite_add_n_8 (pixman_implementation_t *imp,
 
     if (src == 0xff)
     {
-       pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
+       pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
                     8, dest_x, dest_y, width, height, 0xff);
 
        return;
@@ -5095,28 +4365,13 @@ sse2_composite_add_n_8 (pixman_implementation_t *imp,
        }
     }
 
-    _mm_empty ();
 }
 
-/* ----------------------------------------------------------------------
- * composite_add_8_8
- */
-
 static void
 sse2_composite_add_8_8 (pixman_implementation_t *imp,
-                       pixman_op_t              op,
-                       pixman_image_t *         src_image,
-                       pixman_image_t *         mask_image,
-                       pixman_image_t *         dst_image,
-                       int32_t                  src_x,
-                       int32_t                  src_y,
-                       int32_t                  mask_x,
-                       int32_t                  mask_y,
-                       int32_t                  dest_x,
-                       int32_t                  dest_y,
-                       int32_t                  width,
-                       int32_t                  height)
+                       pixman_composite_info_t *info)
 {
+    PIXMAN_COMPOSITE_ARGS (info);
     uint8_t     *dst_line, *dst;
     uint8_t     *src_line, *src;
     int dst_stride, src_stride;
@@ -5126,7 +4381,7 @@ sse2_composite_add_8_8 (pixman_implementation_t *imp,
     PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
     PIXMAN_IMAGE_GET_LINE (
-       dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+       dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
 
     while (height--)
     {
@@ -5145,7 +4400,8 @@ sse2_composite_add_8_8 (pixman_implementation_t *imp,
            w--;
        }
 
-       core_combine_add_u_sse2 ((uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
+       sse2_combine_add_u (imp, op,
+                           (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
 
        /* Small tail */
        dst += w & 0xfffc;
@@ -5161,27 +4417,13 @@ sse2_composite_add_8_8 (pixman_implementation_t *imp,
        }
     }
 
-    _mm_empty ();
 }
 
-/* ---------------------------------------------------------------------
- * composite_add_8888_8888
- */
 static void
 sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
-                              pixman_op_t              op,
-                              pixman_image_t *         src_image,
-                              pixman_image_t *         mask_image,
-                              pixman_image_t *         dst_image,
-                              int32_t                  src_x,
-                              int32_t                  src_y,
-                              int32_t                  mask_x,
-                              int32_t                  mask_y,
-                              int32_t                  dest_x,
-                              int32_t                  dest_y,
-                              int32_t                  width,
-                              int32_t                  height)
+                              pixman_composite_info_t *info)
 {
+    PIXMAN_COMPOSITE_ARGS (info);
     uint32_t    *dst_line, *dst;
     uint32_t    *src_line, *src;
     int dst_stride, src_stride;
@@ -5189,7 +4431,7 @@ sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
     PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
     PIXMAN_IMAGE_GET_LINE (
-       dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+       dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
 
     while (height--)
     {
@@ -5198,16 +4440,11 @@ sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
        src = src_line;
        src_line += src_stride;
 
-       core_combine_add_u_sse2 (dst, src, NULL, width);
+       sse2_combine_add_u (imp, op, dst, src, NULL, width);
     }
 
-    _mm_empty ();
 }
 
-/* -------------------------------------------------------------------------------------------------
- * sse2_composite_copy_area
- */
-
 static pixman_bool_t
 pixman_blt_sse2 (uint32_t *src_bits,
                  uint32_t *dst_bits,
@@ -5217,8 +4454,8 @@ pixman_blt_sse2 (uint32_t *src_bits,
                  int       dst_bpp,
                  int       src_x,
                  int       src_y,
-                 int       dst_x,
-                 int       dst_y,
+                 int       dest_x,
+                 int       dest_y,
                  int       width,
                  int       height)
 {
@@ -5234,7 +4471,7 @@ pixman_blt_sse2 (uint32_t *src_bits,
        src_stride = src_stride * (int) sizeof (uint32_t) / 2;
        dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
        src_bytes =(uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
-       dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
+       dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
        byte_width = 2 * width;
        src_stride *= 2;
        dst_stride *= 2;
@@ -5244,7 +4481,7 @@ pixman_blt_sse2 (uint32_t *src_bits,
        src_stride = src_stride * (int) sizeof (uint32_t) / 4;
        dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
        src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
-       dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
+       dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
        byte_width = 4 * width;
        src_stride *= 4;
        dst_stride *= 4;
@@ -5326,50 +4563,29 @@ pixman_blt_sse2 (uint32_t *src_bits,
        }
     }
 
-    _mm_empty ();
 
     return TRUE;
 }
 
+/*
+ * Composite fast path that copies a width x height rectangle from
+ * src_image to dest_image by delegating to pixman_blt_sse2 ().
+ *
+ * The bits pointer, rowstride and bits-per-pixel of each image are passed
+ * through unchanged together with the source and destination origins.
+ * The pixman_bool_t result of pixman_blt_sse2 () is discarded.
+ *
+ * NOTE(review): PIXMAN_COMPOSITE_ARGS is defined outside this chunk;
+ * presumably it unpacks 'info' into the src_image, dest_image, src_x,
+ * src_y, dest_x, dest_y, width and height locals used below - confirm.
+ */
 static void
 sse2_composite_copy_area (pixman_implementation_t *imp,
-                          pixman_op_t              op,
-                          pixman_image_t *         src_image,
-                          pixman_image_t *         mask_image,
-                          pixman_image_t *         dst_image,
-                          int32_t                  src_x,
-                          int32_t                  src_y,
-                          int32_t                  mask_x,
-                          int32_t                  mask_y,
-                          int32_t                  dest_x,
-                          int32_t                  dest_y,
-                          int32_t                  width,
-                          int32_t                  height)
+                          pixman_composite_info_t *info)
 {
+    PIXMAN_COMPOSITE_ARGS (info);
     pixman_blt_sse2 (src_image->bits.bits,
-                     dst_image->bits.bits,
+                     dest_image->bits.bits,
                      src_image->bits.rowstride,
-                     dst_image->bits.rowstride,
+                     dest_image->bits.rowstride,
                      PIXMAN_FORMAT_BPP (src_image->bits.format),
-                     PIXMAN_FORMAT_BPP (dst_image->bits.format),
+                     PIXMAN_FORMAT_BPP (dest_image->bits.format),
                      src_x, src_y, dest_x, dest_y, width, height);
 }
 
 static void
 sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
-                                 pixman_op_t              op,
-                                 pixman_image_t *         src_image,
-                                 pixman_image_t *         mask_image,
-                                 pixman_image_t *         dst_image,
-                                 int32_t                  src_x,
-                                 int32_t                  src_y,
-                                 int32_t                  mask_x,
-                                 int32_t                  mask_y,
-                                 int32_t                  dest_x,
-                                 int32_t                  dest_y,
-                                 int32_t                  width,
-                                 int32_t                  height)
+                                 pixman_composite_info_t *info)
 {
+    PIXMAN_COMPOSITE_ARGS (info);
     uint32_t    *src, *src_line, s;
     uint32_t    *dst, *dst_line, d;
     uint8_t         *mask, *mask_line;
@@ -5383,7 +4599,7 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
 
     PIXMAN_IMAGE_GET_LINE (
-       dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+       dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
     PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
     PIXMAN_IMAGE_GET_LINE (
@@ -5422,7 +4638,8 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
         while (w >= 4)
         {
             m = *(uint32_t*) mask;
-            xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);
+            xmm_src = _mm_or_si128 (
+               load_128_unaligned ((__m128i*)src), mask_ff000000);
 
             if (m == 0xffffffff)
             {
@@ -5438,9 +4655,12 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
 
-                expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+                expand_alpha_rev_2x128 (
+                   xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
 
-                in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
+                in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
+                              &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi,
+                              &xmm_dst_lo, &xmm_dst_hi);
 
                 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
             }
@@ -5484,24 +4704,13 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
         }
     }
 
-    _mm_empty ();
 }
 
 static void
 sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
-                                 pixman_op_t              op,
-                                 pixman_image_t *         src_image,
-                                 pixman_image_t *         mask_image,
-                                 pixman_image_t *         dst_image,
-                                 int32_t                  src_x,
-                                 int32_t                  src_y,
-                                 int32_t                  mask_x,
-                                 int32_t                  mask_y,
-                                 int32_t                  dest_x,
-                                 int32_t                  dest_y,
-                                 int32_t                  width,
-                                 int32_t                  height)
+                                 pixman_composite_info_t *info)
 {
+    PIXMAN_COMPOSITE_ARGS (info);
     uint32_t    *src, *src_line, s;
     uint32_t    *dst, *dst_line, d;
     uint8_t         *mask, *mask_line;
@@ -5514,7 +4723,7 @@ sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
 
     PIXMAN_IMAGE_GET_LINE (
-       dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+       dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
     PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
     PIXMAN_IMAGE_GET_LINE (
@@ -5638,24 +4847,13 @@ sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
         }
     }
 
-    _mm_empty ();
 }
 
 static void
 sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
-                                   pixman_op_t              op,
-                                   pixman_image_t *         src_image,
-                                   pixman_image_t *         mask_image,
-                                   pixman_image_t *         dst_image,
-                                   int32_t                  src_x,
-                                   int32_t                  src_y,
-                                   int32_t                  mask_x,
-                                   int32_t                  mask_y,
-                                   int32_t                  dest_x,
-                                   int32_t                  dest_y,
-                                   int32_t                  width,
-                                   int32_t                  height)
+                                   pixman_composite_info_t *info)
 {
+    PIXMAN_COMPOSITE_ARGS (info);
     uint32_t src;
     uint32_t    *dst_line, *dst;
     __m128i xmm_src;
@@ -5664,13 +4862,13 @@ sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
     int dst_stride;
     int32_t w;
 
-    src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
 
     if (src == 0)
        return;
 
     PIXMAN_IMAGE_GET_LINE (
-       dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+       dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
 
     xmm_src = expand_pixel_32_1x128 (src);
 
@@ -5730,24 +4928,13 @@ sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
 
     }
 
-    _mm_empty ();
 }
 
 static void
 sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
-                                   pixman_op_t              op,
-                                   pixman_image_t *         src_image,
-                                   pixman_image_t *         mask_image,
-                                   pixman_image_t *         dst_image,
-                                   int32_t                  src_x,
-                                   int32_t                  src_y,
-                                   int32_t                  mask_x,
-                                   int32_t                  mask_y,
-                                   int32_t                  dest_x,
-                                   int32_t                  dest_y,
-                                   int32_t                  width,
-                                   int32_t                  height)
+                                   pixman_composite_info_t *info)
 {
+    PIXMAN_COMPOSITE_ARGS (info);
     uint32_t    *src, *src_line, s;
     uint32_t    *dst, *dst_line, d;
     uint32_t    *mask, *mask_line;
@@ -5760,7 +4947,7 @@ sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
 
     PIXMAN_IMAGE_GET_LINE (
-       dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+       dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
     PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
     PIXMAN_IMAGE_GET_LINE (
@@ -5882,10 +5069,9 @@ sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
         }
     }
 
-    _mm_empty ();
 }
 
-/* A variant of 'core_combine_over_u_sse2' with minor tweaks */
+/* A variant of 'sse2_combine_over_u' with minor tweaks */
 static force_inline void
 scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t*       pd,
                                              const uint32_t* ps,
@@ -5977,7 +5163,6 @@ scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t*       pd,
 
        w--;
     }
-    _mm_empty ();
 }
 
 FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
@@ -6090,7 +5275,6 @@ scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
        w--;
     }
 
-    _mm_empty ();
 }
 
 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
@@ -6103,6 +5287,379 @@ FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
                              scaled_nearest_scanline_sse2_8888_n_8888_OVER,
                              uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
 
+/*
+ * Declares the SSE2 locals shared by the bilinear scanline helpers.
+ * Expects 'wt', 'wb', 'vx' and 'unit_x' to be in scope at the point of use.
+ * The expansion has no trailing semicolon; the caller supplies it.
+ *
+ *   xmm_wt, xmm_wb - 'wt' / 'wb' replicated into all eight 16-bit lanes
+ *   xmm_xorc       - 0xff in the four low lanes, 0 in the four high lanes
+ *   xmm_addc       - 1 in the four low lanes, 0 in the four high lanes
+ *   xmm_ux         - 'unit_x' replicated into all eight 16-bit lanes
+ *   xmm_zero       - all zero bits
+ *   xmm_x          - 'vx' replicated into all eight 16-bit lanes; this is
+ *                    the only non-const variable and is advanced per pixel
+ *
+ * _mm_set_epi16 takes 16-bit arguments (highest lane first), so only the
+ * low 16 bits of 'vx' and 'unit_x' end up in xmm_x and xmm_ux.
+ */
+#define BILINEAR_DECLARE_VARIABLES                                             \
+    const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);     \
+    const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);     \
+    const __m128i xmm_xorc = _mm_set_epi16 (0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff);\
+    const __m128i xmm_addc = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1);           \
+    const __m128i xmm_ux = _mm_set_epi16 (unit_x, unit_x, unit_x, unit_x,      \
+                                         unit_x, unit_x, unit_x, unit_x);      \
+    const __m128i xmm_zero = _mm_setzero_si128 ();                             \
+    __m128i xmm_x = _mm_set_epi16 (vx, vx, vx, vx, vx, vx, vx, vx)
+
+/*
+ * Computes one bilinearly filtered 32-bit pixel and stores it in 'pix'.
+ * Requires BILINEAR_DECLARE_VARIABLES plus 'src_top', 'src_bottom', 'vx'
+ * and 'unit_x' in scope.
+ *
+ * Steps:
+ *   1. Fetch the 2x2 block at column pixman_fixed_to_int (vx) and the
+ *      column after it from both rows.  The top row lands in the upper
+ *      64 bits of 'a', the bottom row in the lower 64 bits.
+ *   2. Widen every channel to 16 bits and form top * wt + bottom * wb.
+ *      Lanes 0-3 then hold the left column, lanes 4-7 the right column.
+ *   3. Build the horizontal weights from bits 8-15 of each xmm_x lane
+ *      (call that value f): the low lanes get (f ^ 0xff) + 1 = 256 - f,
+ *      the high lanes get f.
+ *   4. Multiply 16 x 16 -> 32 bits with the mullo/mulhi pair, add the left
+ *      and right products, shift right by 16 and pack down to four bytes.
+ *
+ * Side effects: 'vx' is advanced by 'unit_x' and xmm_x by xmm_ux.
+ *
+ * NOTE(review): the final shift by 16 suggests wt + wb is expected to be
+ * 256 - confirm against the code that computes the weights.
+ * NOTE(review): element pixman_fixed_to_int (vx) + 1 is read from both
+ * rows unconditionally; presumably the caller guarantees it is readable.
+ */
+#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix)                                    \
+do {                                                                           \
+    __m128i xmm_wh, xmm_lo, xmm_hi, a;                                         \
+    /* fetch 2x2 pixel block into sse2 register */                             \
+    uint32_t tl = src_top [pixman_fixed_to_int (vx)];                          \
+    uint32_t tr = src_top [pixman_fixed_to_int (vx) + 1];                      \
+    uint32_t bl = src_bottom [pixman_fixed_to_int (vx)];                       \
+    uint32_t br = src_bottom [pixman_fixed_to_int (vx) + 1];                   \
+    a = _mm_set_epi32 (tr, tl, br, bl);                                                \
+    vx += unit_x;                                                              \
+    /* vertical interpolation */                                               \
+    a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpackhi_epi8 (a, xmm_zero),       \
+                                       xmm_wt),                                \
+                      _mm_mullo_epi16 (_mm_unpacklo_epi8 (a, xmm_zero),        \
+                                       xmm_wb));                               \
+    /* calculate horizontal weights */                                         \
+    xmm_wh = _mm_add_epi16 (xmm_addc,                                          \
+                           _mm_xor_si128 (xmm_xorc,                            \
+                                          _mm_srli_epi16 (xmm_x, 8)));         \
+    xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);                                     \
+    /* horizontal interpolation */                                             \
+    xmm_lo = _mm_mullo_epi16 (a, xmm_wh);                                      \
+    xmm_hi = _mm_mulhi_epu16 (a, xmm_wh);                                      \
+    a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi),                    \
+                      _mm_unpackhi_epi16 (xmm_lo, xmm_hi));                    \
+    /* shift and pack the result */                                            \
+    a = _mm_srli_epi32 (a, 16);                                                        \
+    a = _mm_packs_epi32 (a, a);                                                        \
+    a = _mm_packus_epi16 (a, a);                                               \
+    pix = _mm_cvtsi128_si32 (a);                                               \
+} while (0)
+
+/*
+ * Advances the source position by one destination pixel without fetching
+ * or filtering anything: 'vx' moves by 'unit_x' and xmm_x by xmm_ux, the
+ * same two updates BILINEAR_INTERPOLATE_ONE_PIXEL performs.
+ */
+#define BILINEAR_SKIP_ONE_PIXEL()                                              \
+do {                                                                           \
+    vx += unit_x;                                                              \
+    xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);                                     \
+} while(0)
+
+/*
+ * Writes 'w' bilinearly filtered pixels to 'dst', overwriting whatever was
+ * there (no blending with the destination).
+ *
+ * dst                 - output scanline, written sequentially
+ * src_top, src_bottom - the two source rows being interpolated
+ * w                   - number of destination pixels to produce
+ * wt, wb              - vertical weights for the top and bottom row
+ * vx, unit_x          - fixed-point source x position and per-pixel step
+ *
+ * 'mask', 'max_vx' and 'zero_src' are not referenced in this body.
+ * NOTE(review): they are presumably present only to match the signature
+ * FAST_BILINEAR_MAINLOOP_COMMON expects - confirm.
+ */
+static force_inline void
+scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t *       dst,
+                                            const uint32_t * mask,
+                                            const uint32_t * src_top,
+                                            const uint32_t * src_bottom,
+                                            int32_t          w,
+                                            int              wt,
+                                            int              wb,
+                                            pixman_fixed_t   vx,
+                                            pixman_fixed_t   unit_x,
+                                            pixman_fixed_t   max_vx,
+                                            pixman_bool_t    zero_src)
+{
+    BILINEAR_DECLARE_VARIABLES;
+    uint32_t pix1, pix2, pix3, pix4;
+
+    /* Main loop: four pixels per iteration. */
+    while ((w -= 4) >= 0)
+    {
+       BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+       BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
+       BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
+       BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
+       *dst++ = pix1;
+       *dst++ = pix2;
+       *dst++ = pix3;
+       *dst++ = pix4;
+    }
+
+    /*
+     * For a non-negative input width, w is now in [-4, -1].  Subtracting
+     * multiples of four leaves the two low bits untouched, so (w & 3) is
+     * still the number of pixels left to produce.
+     */
+    if (w & 2)
+    {
+       BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+       BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
+       *dst++ = pix1;
+       *dst++ = pix2;
+    }
+
+    if (w & 1)
+    {
+       BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+       *dst = pix1;
+    }
+
+}
+
+/*
+ * Instantiate the bilinear main loop around
+ * scaled_bilinear_scanline_sse2_8888_8888_SRC once per repeat-mode
+ * argument: COVER, PAD, NONE and NORMAL.  All four use uint32_t for the
+ * three type arguments and pass FLAG_NONE.
+ *
+ * NOTE(review): FAST_BILINEAR_MAINLOOP_COMMON is defined outside this
+ * chunk; presumably each invocation emits one composite function named
+ * after its first argument - confirm against the macro definition.
+ */
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC,
+                              scaled_bilinear_scanline_sse2_8888_8888_SRC,
+                              uint32_t, uint32_t, uint32_t,
+                              COVER, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC,
+                              scaled_bilinear_scanline_sse2_8888_8888_SRC,
+                              uint32_t, uint32_t, uint32_t,
+                              PAD, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC,
+                              scaled_bilinear_scanline_sse2_8888_8888_SRC,
+                              uint32_t, uint32_t, uint32_t,
+                              NONE, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_SRC,
+                              scaled_bilinear_scanline_sse2_8888_8888_SRC,
+                              uint32_t, uint32_t, uint32_t,
+                              NORMAL, FLAG_NONE)
+
+/*
+ * Produces 'w' bilinearly filtered source pixels and combines each one
+ * with the pixel already stored in 'dst'.
+ *
+ * dst                 - destination scanline, read and written in place
+ * src_top, src_bottom - the two source rows being interpolated
+ * w                   - number of destination pixels to process
+ * wt, wb              - vertical weights for the top and bottom row
+ * vx, unit_x          - fixed-point source x position and per-pixel step
+ *
+ * 'mask', 'max_vx' and 'zero_src' are not referenced in this body.
+ *
+ * The work is split into three phases: single pixels until 'dst' reaches a
+ * 16-byte boundary, groups of four pixels using aligned 128-bit loads and
+ * stores, then single pixels for the remainder.
+ *
+ * The per-pixel and per-vector combining is done by helpers defined
+ * elsewhere in this file (core_combine_over_u_pixel_sse2, over_2x128,
+ * is_zero, is_opaque, ...).
+ */
+static force_inline void
+scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t *       dst,
+                                             const uint32_t * mask,
+                                             const uint32_t * src_top,
+                                             const uint32_t * src_bottom,
+                                             int32_t          w,
+                                             int              wt,
+                                             int              wb,
+                                             pixman_fixed_t   vx,
+                                             pixman_fixed_t   unit_x,
+                                             pixman_fixed_t   max_vx,
+                                             pixman_bool_t    zero_src)
+{
+    BILINEAR_DECLARE_VARIABLES;
+    uint32_t pix1, pix2, pix3, pix4;
+
+    /*
+     * Head: one pixel at a time until dst is 16-byte aligned.  The pointer
+     * is converted through uintptr_t rather than unsigned long because
+     * unsigned long is narrower than a pointer on LLP64 targets.
+     */
+    while (w && ((uintptr_t)dst & 15))
+    {
+       BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+
+       /* An all-zero source pixel leaves the destination untouched. */
+       if (pix1)
+       {
+           pix2 = *dst;
+           *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
+       }
+
+       w--;
+       dst++;
+    }
+
+    /* Body: four pixels per iteration; dst is aligned from here on. */
+    while (w  >= 4)
+    {
+       __m128i xmm_src;
+       __m128i xmm_src_hi, xmm_src_lo, xmm_dst_hi, xmm_dst_lo;
+       __m128i xmm_alpha_hi, xmm_alpha_lo;
+
+       BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+       BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
+       BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
+       BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
+
+       /* _mm_set_epi32 takes the highest lane first, so pix1 is lane 0. */
+       xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
+
+       /* Skip the group entirely when is_zero () reports nothing to draw. */
+       if (!is_zero (xmm_src))
+       {
+           if (is_opaque (xmm_src))
+           {
+               /* No destination read needed: store the source directly. */
+               save_128_aligned ((__m128i *)dst, xmm_src);
+           }
+           else
+           {
+               __m128i xmm_dst = load_128_aligned ((__m128i *)dst);
+
+               unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+               unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+               expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
+               over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi,
+                           &xmm_dst_lo, &xmm_dst_hi);
+
+               save_128_aligned ((__m128i *)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+           }
+       }
+
+       w -= 4;
+       dst += 4;
+    }
+
+    /* Tail: the remaining zero to three pixels, one at a time. */
+    while (w)
+    {
+       BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+
+       if (pix1)
+       {
+           pix2 = *dst;
+           *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
+       }
+
+       w--;
+       dst++;
+    }
+}
+
+/*
+ * Instantiate the bilinear main loop around
+ * scaled_bilinear_scanline_sse2_8888_8888_OVER once per repeat-mode
+ * argument: COVER, PAD, NONE and NORMAL.  All four use uint32_t for the
+ * three type arguments and pass FLAG_NONE.
+ *
+ * NOTE(review): FAST_BILINEAR_MAINLOOP_COMMON is defined outside this
+ * chunk; presumably each invocation emits one composite function named
+ * after its first argument - confirm against the macro definition.
+ */
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_OVER,
+                              scaled_bilinear_scanline_sse2_8888_8888_OVER,
+                              uint32_t, uint32_t, uint32_t,
+                              COVER, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_OVER,
+                              scaled_bilinear_scanline_sse2_8888_8888_OVER,
+                              uint32_t, uint32_t, uint32_t,
+                              PAD, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_OVER,
+                              scaled_bilinear_scanline_sse2_8888_8888_OVER,
+                              uint32_t, uint32_t, uint32_t,
+                              NONE, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_OVER,
+                              scaled_bilinear_scanline_sse2_8888_8888_OVER,
+                              uint32_t, uint32_t, uint32_t,
+                              NORMAL, FLAG_NONE)
+/* Scanline worker for the 8888_8_8888 OVER bilinear path: combines w pixels produced by BILINEAR_INTERPOLATE_ONE_PIXEL with an 8-bit mask and stores them to dst. */
+static force_inline void
+scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t *       dst,  /* destination scanline; read and overwritten in place */
+                                               const uint8_t  * mask,  /* one mask byte is consumed per destination pixel */
+                                               const uint32_t * src_top,  /* src_top .. zero_src are never named in this body; presumably consumed by the BILINEAR_* macros (defined elsewhere) or unused -- TODO confirm */
+                                               const uint32_t * src_bottom,
+                                               int32_t          w,  /* number of pixels still to produce; counted down to 0 */
+                                               int              wt,
+                                               int              wb,
+                                               pixman_fixed_t   vx,
+                                               pixman_fixed_t   unit_x,
+                                               pixman_fixed_t   max_vx,
+                                               pixman_bool_t    zero_src)
+{
+    BILINEAR_DECLARE_VARIABLES;
+    uint32_t pix1, pix2, pix3, pix4;
+    uint32_t m;
+
+    while (w && ((unsigned long)dst & 15))  /* head: single pixels until dst is 16-byte aligned or w runs out */
+    {  /* NOTE(review): unsigned long can be narrower than a pointer (e.g. LLP64); only the low 4 bits are tested here, so the result is unaffected */
+       uint32_t sa;
+
+       m = (uint32_t) *mask++;  /* consume one mask byte */
+
+       if (m)
+       {
+           BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+           sa = pix1 >> 24;  /* top byte of the pixel, used as the source alpha below */
+
+           if (sa == 0xff && m == 0xff)
+           {
+               *dst = pix1;  /* full mask and top byte 0xff: store the source pixel unchanged */
+           }
+           else
+           {
+               __m128i ms, md, ma, msa;
+
+               pix2 = *dst;  /* current destination pixel */
+               ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+               ms = unpack_32_1x128 (pix1);
+               md = unpack_32_1x128 (pix2);
+
+               msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
+
+               *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));  /* presumably (src IN mask) OVER dst -- helper is defined outside this chunk */
+           }
+       }
+       else
+       {
+           BILINEAR_SKIP_ONE_PIXEL ();  /* zero mask: dst is left untouched; the macro presumably advances the source position -- TODO confirm */
+       }
+
+       w--;
+       dst++;
+    }
+
+    while (w >= 4)  /* main loop: four pixels per iteration, using the aligned 128-bit load/store helpers on dst */
+    {
+       __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
+       __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+       __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+
+       m = *(uint32_t*)mask;  /* four mask bytes at once; NOTE(review): mask alignment is not checked here (only dst was aligned above) */
+
+       if (m)  /* at least one of the four mask bytes is non-zero */
+       {
+           BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+           BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
+           BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
+           BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
+
+           xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);  /* pix1 goes into the lowest 32-bit lane */
+
+           if (m == 0xffffffff && is_opaque (xmm_src))  /* all four mask bytes are 0xff and is_opaque() holds: plain store */
+           {
+               save_128_aligned ((__m128i *)dst, xmm_src);
+           }
+           else
+           {
+               xmm_dst = load_128_aligned ((__m128i *)dst);
+
+               xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());  /* presumably leaves one mask byte per 32-bit lane -- TODO confirm against unpack_32_1x128 */
+
+               unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+               unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+               unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+               expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
+               expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);  /* mask halves are expanded in place */
+
+               in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
+                              &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);  /* result is left in xmm_dst_lo / xmm_dst_hi, which are packed and stored next */
+
+               save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+           }
+       }
+       else
+       {
+           BILINEAR_SKIP_ONE_PIXEL ();  /* all four mask bytes are zero: dst untouched, one skip per pixel */
+           BILINEAR_SKIP_ONE_PIXEL ();
+           BILINEAR_SKIP_ONE_PIXEL ();
+           BILINEAR_SKIP_ONE_PIXEL ();
+       }
+
+       w -= 4;
+       dst += 4;
+       mask += 4;  /* the 32-bit mask read above did not advance the pointer */
+    }
+
+    while (w)  /* tail: the remaining w < 4 pixels, same per-pixel logic as the head loop */
+    {
+       uint32_t sa;
+
+       m = (uint32_t) *mask++;
+
+       if (m)
+       {
+           BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+           sa = pix1 >> 24;
+
+           if (sa == 0xff && m == 0xff)
+           {
+               *dst = pix1;
+           }
+           else
+           {
+               __m128i ms, md, ma, msa;
+
+               pix2 = *dst;
+               ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+               ms = unpack_32_1x128 (pix1);
+               md = unpack_32_1x128 (pix2);
+
+               msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
+
+               *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
+           }
+       }
+       else
+       {
+           BILINEAR_SKIP_ONE_PIXEL ();
+       }
+
+       w--;
+       dst++;
+    }
+}
+
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_cover_OVER,  /* four instantiations follow, one each for COVER, PAD, NONE and NORMAL; the macro is defined outside this chunk */
+                              scaled_bilinear_scanline_sse2_8888_8_8888_OVER,  /* scanline worker handed to the macro (same one for all four) */
+                              uint32_t, uint8_t, uint32_t,  /* the worker takes uint32_t src rows, a uint8_t mask and a uint32_t dst; presumably listed here as src, mask, dst -- TODO confirm */
+                              COVER, FLAG_HAVE_NON_SOLID_MASK)  /* presumably the source repeat mode, then flags; all four pass FLAG_HAVE_NON_SOLID_MASK */
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_pad_OVER,
+                              scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
+                              uint32_t, uint8_t, uint32_t,
+                              PAD, FLAG_HAVE_NON_SOLID_MASK)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_none_OVER,
+                              scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
+                              uint32_t, uint8_t, uint32_t,
+                              NONE, FLAG_HAVE_NON_SOLID_MASK)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_normal_OVER,
+                              scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
+                              uint32_t, uint8_t, uint32_t,
+                              NORMAL, FLAG_HAVE_NON_SOLID_MASK)
+
 static const pixman_fast_path_t sse2_fast_paths[] =
 {
     /* PIXMAN_OP_OVER */
@@ -6204,6 +5761,20 @@ static const pixman_fast_path_t sse2_fast_paths[] =
     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
 
+    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888),
+
+    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
+
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8_8888),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8_8888),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8_8888),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8_8888),
+
     { PIXMAN_OP_NONE },
 };
 
@@ -6217,20 +5788,20 @@ sse2_blt (pixman_implementation_t *imp,
           int                      dst_bpp,
           int                      src_x,
           int                      src_y,
-          int                      dst_x,
-          int                      dst_y,
+          int                      dest_x,
+          int                      dest_y,
           int                      width,
           int                      height)
 {
     if (!pixman_blt_sse2 (
             src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
-            src_x, src_y, dst_x, dst_y, width, height))
+            src_x, src_y, dest_x, dest_y, width, height))
 
     {
        return _pixman_implementation_blt (
            imp->delegate,
            src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
-           src_x, src_y, dst_x, dst_y, width, height);
+           src_x, src_y, dest_x, dest_y, width, height);
     }
 
     return TRUE;
@@ -6402,19 +5973,21 @@ static const fetcher_info_t fetchers[] =
 };
 
 static void
-sse2_src_iter_init (pixman_implementation_t *imp,
-                   pixman_iter_t *iter,
-                   pixman_image_t *image,
-                   int x, int y, int width, int height,
-                   uint8_t *buffer, iter_flags_t flags)
+sse2_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
 {
+    pixman_image_t *image = iter->image;
+    int x = iter->x;
+    int y = iter->y;
+    int width = iter->width;
+    int height = iter->height;
+
 #define FLAGS                                                          \
-    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM)
+    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | FAST_PATH_BITS_IMAGE)
 
-    if ((flags & ITER_NARROW)                          &&
-       (image->common.flags & FLAGS) == FLAGS          &&
-       x >= 0 && y >= 0                                &&
-       x + width <= image->bits.width                  &&
+    if ((iter->flags & ITER_NARROW)                            &&
+       (image->common.flags & FLAGS) == FLAGS                  &&
+       x >= 0 && y >= 0                                        &&
+       x + width <= image->bits.width                          &&
        y + height <= image->bits.height)
     {
        const fetcher_info_t *f;
@@ -6426,10 +5999,8 @@ sse2_src_iter_init (pixman_implementation_t *imp,
                uint8_t *b = (uint8_t *)image->bits.bits;
                int s = image->bits.rowstride * 4;
 
-               iter->bits = b + s * y + x * PIXMAN_FORMAT_BPP (f->format) / 8;
+               iter->bits = b + s * iter->y + x * PIXMAN_FORMAT_BPP (f->format) / 8;
                iter->stride = s;
-               iter->width = width;
-               iter->buffer = (uint32_t *)buffer;
 
                iter->get_scanline = f->get_scanline;
                return;
@@ -6437,8 +6008,7 @@ sse2_src_iter_init (pixman_implementation_t *imp,
        }
     }
 
-    _pixman_implementation_src_iter_init (
-       imp->delegate, iter, image, x, y, width, height, buffer, flags);
+    imp->delegate->src_iter_init (imp->delegate, iter);
 }
 
 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
@@ -6466,20 +6036,7 @@ _pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
     mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
     mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
 
-    /* MMX constants */
-    mask_x565_rgb = create_mask_2x32_64 (0x000001f0, 0x003f001f);
-    mask_x565_unpack = create_mask_2x32_64 (0x00000084, 0x04100840);
-
-    mask_x0080 = create_mask_16_64 (0x0080);
-    mask_x00ff = create_mask_16_64 (0x00ff);
-    mask_x0101 = create_mask_16_64 (0x0101);
-    mask_x_alpha = create_mask_2x32_64 (0x00ff0000, 0x00000000);
-
-    _mm_empty ();
-
     /* Set up function pointers */
-
-    /* SSE code patch for fbcompose.c */
     imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
     imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
     imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
@@ -6512,5 +6069,3 @@ _pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
 
     return imp;
 }
-
-#endif /* USE_SSE2 */