Reformat and reindent pixman-sse2.c
author Søren Sandmann Pedersen <sandmann@redhat.com>
Mon, 13 Jul 2009 01:36:32 +0000 (21:36 -0400)
committer Søren Sandmann Pedersen <sandmann@redhat.com>
Mon, 13 Jul 2009 23:55:34 +0000 (19:55 -0400)
pixman/pixman-sse2.c

index 334990d..cb3daf2 100644
@@ -23,7 +23,7 @@
  *
  * Author:  Rodrigo Kumpera (kumpera@gmail.com)
  *          André Tupinambá (andrelrt@gmail.com)
- * 
+ *
  * Based on work by Owen Taylor and Søren Sandmann
  */
 #ifdef HAVE_CONFIG_H
@@ -38,7 +38,7 @@
 
 #ifdef USE_SSE2
 
-/* -------------------------------------------------------------------------------------------------
+/* --------------------------------------------------------------------
  * Locals
  */
 
@@ -67,13 +67,13 @@ static __m128i mask_blue;
 static __m128i mask_565_fix_rb;
 static __m128i mask_565_fix_g;
 
-/* -------------------------------------------------------------------------------------------------
+/* ----------------------------------------------------------------------
  * SSE2 Inlines
  */
 static force_inline __m128i
 unpack_32_1x128 (uint32_t data)
 {
-    return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128());
+    return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
 }
 
 static force_inline void
@@ -87,7 +87,7 @@ static force_inline __m128i
 unpack_565_to_8888 (__m128i lo)
 {
     __m128i r, g, b, rb, t;
-    
+
     r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
     g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
     b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
@@ -100,12 +100,16 @@ unpack_565_to_8888 (__m128i lo)
     t  = _mm_and_si128 (g, mask_565_fix_g);
     t  = _mm_srli_epi32 (t, 6);
     g  = _mm_or_si128 (g, t);
-    
+
     return _mm_or_si128 (rb, g);
 }
 
 static force_inline void
-unpack_565_128_4x128 (__m128i data, __m128i* data0, __m128i* data1, __m128i* data2, __m128i* data3)
+unpack_565_128_4x128 (__m128i  data,
+                      __m128i* data0,
+                      __m128i* data1,
+                      __m128i* data2,
+                      __m128i* data3)
 {
     __m128i lo, hi;
 
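
For reference, the r5g6b5 to x8r8g8b8 expansion that unpack_565_to_8888 performs on four pixels at once boils down to the scalar sketch below: the shifts match the ones above, and the mask_565_fix_rb / mask_565_fix_g steps are the bit replication that widens a 5- or 6-bit field into a full 8-bit one. (Illustrative only; expand_565_to_8888_scalar is not a pixman function.)

#include <stdint.h>

static inline uint32_t
expand_565_to_8888_scalar (uint16_t p)
{
    uint32_t r5 = (p >> 11) & 0x1f;
    uint32_t g6 = (p >>  5) & 0x3f;
    uint32_t b5 =  p        & 0x1f;

    /* replicate the top bits into the freshly opened low bits,
     * so 0x1f expands to 0xff and 0x00 stays 0x00
     */
    uint32_t r8 = (r5 << 3) | (r5 >> 2);
    uint32_t g8 = (g6 << 2) | (g6 >> 4);
    uint32_t b8 = (b5 << 3) | (b5 >> 2);

    /* the alpha byte is left at zero, as in the vector version */
    return (r8 << 16) | (g8 << 8) | b8;
}
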
@@ -122,7 +126,9 @@ unpack_565_128_4x128 (__m128i data, __m128i* data0, __m128i* data1, __m128i* dat
 static force_inline uint16_t
 pack_565_32_16 (uint32_t pixel)
 {
-    return (uint16_t) (((pixel>>8) & 0xf800) | ((pixel>>5) & 0x07e0) | ((pixel>>3) & 0x001f));
+    return (uint16_t) (((pixel >> 8) & 0xf800) |
+                      ((pixel >> 5) & 0x07e0) |
+                      ((pixel >> 3) & 0x001f));
 }
 
 static force_inline __m128i
@@ -137,12 +143,12 @@ pack_565_2x128_128 (__m128i lo, __m128i hi)
     __m128i data;
     __m128i r, g1, g2, b;
 
-    data = pack_2x128_128 ( lo, hi );
+    data = pack_2x128_128 (lo, hi);
 
-    r  = _mm_and_si128 (data , mask_565_r);
-    g1 = _mm_and_si128 (_mm_slli_epi32 (data , 3), mask_565_g1);
-    g2 = _mm_and_si128 (_mm_srli_epi32 (data , 5), mask_565_g2);
-    b  = _mm_and_si128 (_mm_srli_epi32 (data , 3), mask_565_b);
+    r  = _mm_and_si128 (data, mask_565_r);
+    g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
+    g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
+    b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);
 
     return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
 }
@@ -150,64 +156,82 @@ pack_565_2x128_128 (__m128i lo, __m128i hi)
 static force_inline __m128i
 pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
 {
-    return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1), pack_565_2x128_128 (*xmm2, *xmm3));
+    return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
+                            pack_565_2x128_128 (*xmm2, *xmm3));
 }
 
 static force_inline int
 is_opaque (__m128i x)
 {
     __m128i ffs = _mm_cmpeq_epi8 (x, x);
+
     return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
 }
 
 static force_inline int
 is_zero (__m128i x)
 {
-    return _mm_movemask_epi8 (_mm_cmpeq_epi8 (x, _mm_setzero_si128())) == 0xffff;
+    return _mm_movemask_epi8 (
+       _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
 }
 
 static force_inline int
 is_transparent (__m128i x)
 {
-    return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, _mm_setzero_si128())) & 0x8888) == 0x8888;
+    return (_mm_movemask_epi8 (
+               _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
 }
 
 static force_inline __m128i
 expand_pixel_32_1x128 (uint32_t data)
 {
-    return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE(1, 0, 1, 0));
+    return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
 }
 
 static force_inline __m128i
 expand_alpha_1x128 (__m128i data)
 {
-    return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
+    return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
+                                                    _MM_SHUFFLE (3, 3, 3, 3)),
+                               _MM_SHUFFLE (3, 3, 3, 3));
 }
 
 static force_inline void
-expand_alpha_2x128 (__m128i data_lo, __m128i data_hi, __m128i* alpha_lo, __m128i* alpha_hi)
+expand_alpha_2x128 (__m128i  data_lo,
+                    __m128i  data_hi,
+                    __m128i* alpha_lo,
+                    __m128i* alpha_hi)
 {
     __m128i lo, hi;
 
-    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE(3, 3, 3, 3));
-    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE(3, 3, 3, 3));
-    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE(3, 3, 3, 3));
-    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE(3, 3, 3, 3));
+    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
+    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
+
+    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
+    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
 }
 
 static force_inline void
-expand_alpha_rev_2x128 (__m128i data_lo, __m128i data_hi, __m128i* alpha_lo, __m128i* alpha_hi)
+expand_alpha_rev_2x128 (__m128i  data_lo,
+                        __m128i  data_hi,
+                        __m128i* alpha_lo,
+                        __m128i* alpha_hi)
 {
     __m128i lo, hi;
 
-    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE(0, 0, 0, 0));
-    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE(0, 0, 0, 0));
-    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE(0, 0, 0, 0));
-    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE(0, 0, 0, 0));
+    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
+    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
+    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
+    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
 }
 
 static force_inline void
-pix_multiply_2x128 (__m128i* data_lo, __m128i* data_hi, __m128i* alpha_lo, __m128i* alpha_hi, __m128i* ret_lo, __m128i* ret_hi)
+pix_multiply_2x128 (__m128i* data_lo,
+                    __m128i* data_hi,
+                    __m128i* alpha_lo,
+                    __m128i* alpha_hi,
+                    __m128i* ret_lo,
+                    __m128i* ret_hi)
 {
     __m128i lo, hi;
 
@@ -220,9 +244,16 @@ pix_multiply_2x128 (__m128i* data_lo, __m128i* data_hi, __m128i* alpha_lo, __m12
 }
 
 static force_inline void
-pix_add_multiply_2x128 (__m128i* src_lo, __m128i* src_hi, __m128i* alpha_dst_lo, __m128i* alpha_dst_hi,
-                      __m128i* dst_lo, __m128i* dst_hi, __m128i* alpha_src_lo, __m128i* alpha_src_hi,
-                      __m128i* ret_lo, __m128i* ret_hi)
+pix_add_multiply_2x128 (__m128i* src_lo,
+                        __m128i* src_hi,
+                        __m128i* alpha_dst_lo,
+                        __m128i* alpha_dst_hi,
+                        __m128i* dst_lo,
+                        __m128i* dst_hi,
+                        __m128i* alpha_src_lo,
+                        __m128i* alpha_src_hi,
+                        __m128i* ret_lo,
+                        __m128i* ret_hi)
 {
     __m128i lo, hi;
     __m128i mul_lo, mul_hi;
@@ -240,25 +271,36 @@ pix_add_multiply_2x128 (__m128i* src_lo, __m128i* src_hi, __m128i* alpha_dst_lo,
 }
 
 static force_inline void
-negate_2x128 (__m128i data_lo, __m128i data_hi, __m128i* neg_lo, __m128i* neg_hi)
+negate_2x128 (__m128i  data_lo,
+              __m128i  data_hi,
+              __m128i* neg_lo,
+              __m128i* neg_hi)
 {
     *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
     *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
 }
 
 static force_inline void
-invert_colors_2x128 (__m128i data_lo, __m128i data_hi, __m128i* inv_lo, __m128i* inv_hi)
+invert_colors_2x128 (__m128i  data_lo,
+                     __m128i  data_hi,
+                     __m128i* inv_lo,
+                     __m128i* inv_hi)
 {
     __m128i lo, hi;
 
-    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE(3, 0, 1, 2));
-    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE(3, 0, 1, 2));
-    *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE(3, 0, 1, 2));
-    *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE(3, 0, 1, 2));
+    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
+    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
+    *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
+    *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
 }
 
 static force_inline void
-over_2x128 (__m128i* src_lo, __m128i* src_hi, __m128i* alpha_lo, __m128i* alpha_hi, __m128i* dst_lo, __m128i* dst_hi)
+over_2x128 (__m128i* src_lo,
+            __m128i* src_hi,
+            __m128i* alpha_lo,
+            __m128i* alpha_hi,
+            __m128i* dst_lo,
+            __m128i* dst_hi)
 {
     __m128i t1, t2;
 
@@ -271,7 +313,10 @@ over_2x128 (__m128i* src_lo, __m128i* src_hi, __m128i* alpha_lo, __m128i* alpha_
 }
 
 static force_inline void
-over_rev_non_pre_2x128 (__m128i src_lo, __m128i src_hi, __m128i* dst_lo, __m128i* dst_hi)
+over_rev_non_pre_2x128 (__m128i  src_lo,
+                        __m128i  src_hi,
+                        __m128i* dst_lo,
+                        __m128i* dst_hi)
 {
     __m128i lo, hi;
     __m128i alpha_lo, alpha_hi;
@@ -289,13 +334,19 @@ over_rev_non_pre_2x128 (__m128i src_lo, __m128i src_hi, __m128i* dst_lo, __m128i
 }
 
 static force_inline void
-in_over_2x128 (__m128i* src_lo,  __m128i* src_hi,  __m128i*  alpha_lo, __m128i*  alpha_hi,
-              __m128i* mask_lo, __m128i* mask_hi, __m128i* dst_lo,   __m128i* dst_hi)
+in_over_2x128 (__m128i* src_lo,
+               __m128i* src_hi,
+               __m128i* alpha_lo,
+               __m128i* alpha_hi,
+               __m128i* mask_lo,
+               __m128i* mask_hi,
+               __m128i* dst_lo,
+               __m128i* dst_hi)
 {
     __m128i s_lo, s_hi;
     __m128i a_lo, a_hi;
 
-    pix_multiply_2x128 (  src_lo,   src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
+    pix_multiply_2x128 (src_lo,   src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
     pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
 
     over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
@@ -327,57 +378,64 @@ load_128_unaligned (const __m128i* src)
     return _mm_loadu_si128 (src);
 }
 
-/* save 4 pixels using Write Combining memory on a 16-byte boundary aligned address */
+/* save 4 pixels using Write Combining memory on a 16-byte
+ * boundary aligned address
+ */
 static force_inline void
-save_128_write_combining (__m128i* dst, __m128i data)
+save_128_write_combining (__m128i* dst,
+                          __m128i  data)
 {
     _mm_stream_si128 (dst, data);
 }
 
 /* save 4 pixels on a 16-byte boundary aligned address */
 static force_inline void
-save_128_aligned (__m128i* dst, __m128i data)
+save_128_aligned (__m128i* dst,
+                  __m128i  data)
 {
     _mm_store_si128 (dst, data);
 }
 
 /* save 4 pixels on a unaligned address */
 static force_inline void
-save_128_unaligned (__m128i* dst, __m128i data)
+save_128_unaligned (__m128i* dst,
+                    __m128i  data)
 {
     _mm_storeu_si128 (dst, data);
 }
 
-/* -------------------------------------------------------------------------------------------------
+/* ------------------------------------------------------------------
  * MMX inlines
  */
 
 static force_inline __m64
 unpack_32_1x64 (uint32_t data)
 {
-    return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (data), _mm_setzero_si64());
+    return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (data), _mm_setzero_si64 ());
 }
 
 static force_inline __m64
 expand_alpha_1x64 (__m64 data)
 {
-    return _mm_shuffle_pi16 (data, _MM_SHUFFLE(3, 3, 3, 3));
+    return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 3, 3, 3));
 }
 
 static force_inline __m64
 expand_alpha_rev_1x64 (__m64 data)
 {
-    return _mm_shuffle_pi16 (data, _MM_SHUFFLE(0, 0, 0, 0));
+    return _mm_shuffle_pi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
 }
 
 static force_inline __m64
 expand_pixel_8_1x64 (uint8_t data)
 {
-    return _mm_shuffle_pi16 (unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE(0, 0, 0, 0));
+    return _mm_shuffle_pi16 (
+       unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
 }
 
 static force_inline __m64
-pix_multiply_1x64 (__m64 data, __m64 alpha)
+pix_multiply_1x64 (__m64 data,
+                   __m64 alpha)
 {
     return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (data, alpha),
                                           mask_x0080),
@@ -385,12 +443,16 @@ pix_multiply_1x64 (__m64 data, __m64 alpha)
 }
 
 static force_inline __m64
-pix_add_multiply_1x64 (__m64* src, __m64* alpha_dst, __m64* dst, __m64* alpha_src)
+pix_add_multiply_1x64 (__m64* src,
+                       __m64* alpha_dst,
+                       __m64* dst,
+                       __m64* alpha_src)
 {
-    return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (*src, *alpha_dst),
-                                                         mask_x0080),
-                                          _mm_mullo_pi16 (*dst, *alpha_src)),
-                           mask_x0101);
+    return _mm_mulhi_pu16 (
+       _mm_adds_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (*src, *alpha_dst),
+                                     mask_x0080),
+                      _mm_mullo_pi16 (*dst, *alpha_src)),
+       mask_x0101);
 }
 
 static force_inline __m64
@@ -402,7 +464,7 @@ negate_1x64 (__m64 data)
 static force_inline __m64
 invert_colors_1x64 (__m64 data)
 {
-    return _mm_shuffle_pi16 (data, _MM_SHUFFLE(3, 0, 1, 2));
+    return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
 }
 
 static force_inline __m64
@@ -425,15 +487,15 @@ over_rev_non_pre_1x64 (__m64 src, __m64 dst)
     __m64 alpha = expand_alpha_1x64 (src);
 
     return over_1x64 (pix_multiply_1x64 (invert_colors_1x64 (src),
-                                        _mm_or_si64 (alpha, mask_x_alpha)),
+                                         _mm_or_si64 (alpha, mask_x_alpha)),
                       alpha,
                       dst);
 }
 
 static force_inline uint32_t
-pack_1x64_32( __m64 data )
+pack_1x64_32 (__m64 data)
 {
-    return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64()));
+    return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64 ()));
 }
 
 /* Expand 16 bits positioned at @pos (0-3) of a mmx register into
@@ -469,25 +531,26 @@ expand565_16_1x64 (uint16_t pixel)
     return _mm_srli_pi16 (p, 8);
 }
 
-/* -------------------------------------------------------------------------------------------------
+/* ----------------------------------------------------------------------------
  * Compose Core transformations
  */
 static force_inline uint32_t
 core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
 {
-    uint8_t     a;
-    __m64       ms;
+    uint8_t a;
+    __m64 ms;
 
     a = src >> 24;
 
     if (a == 0xff)
     {
-        return src;
+       return src;
     }
     else if (src)
     {
-        ms = unpack_32_1x64 (src);
-        return pack_1x64_32 (over_1x64 (ms, expand_alpha_1x64 (ms), unpack_32_1x64 (dst)));
+       ms = unpack_32_1x64 (src);
+       return pack_1x64_32 (
+           over_1x64 (ms, expand_alpha_1x64 (ms), unpack_32_1x64 (dst)));
     }
 
     return dst;
@@ -504,7 +567,7 @@ combine1 (const uint32_t *ps, const uint32_t *pm)
 
        mm = unpack_32_1x64 (*pm);
        mm = expand_alpha_1x64 (mm);
-       
+
        ms = unpack_32_1x64 (s);
        ms = pix_multiply_1x64 (ms, mm);
 
@@ -520,7 +583,7 @@ combine4 (const __m128i *ps, const __m128i *pm)
     __m128i xmm_src_lo, xmm_src_hi;
     __m128i xmm_msk_lo, xmm_msk_hi;
     __m128i s;
-    
+
     if (pm)
     {
        xmm_msk_lo = load_128_unaligned (pm);
@@ -528,18 +591,20 @@ combine4 (const __m128i *ps, const __m128i *pm)
        if (is_transparent (xmm_msk_lo))
            return _mm_setzero_si128 ();
     }
-    
+
     s = load_128_unaligned (ps);
-       
+
     if (pm)
     {
        unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);
-       
+
        expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);
-       
-       pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_msk_lo, &xmm_msk_hi, &xmm_src_lo, &xmm_src_hi);
-       
+
+       pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+                           &xmm_msk_lo, &xmm_msk_hi,
+                           &xmm_src_lo, &xmm_src_hi);
+
        s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
     }
 
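
combine1 and combine4 implement the unified-mask source fetch used by all the *_u combiners: when a mask is present only its alpha channel is used, and it scales every channel of the source; combine4 additionally returns zero early when all four mask pixels are transparent. A scalar model of combine1 (combine1_model is a hypothetical name, not pixman API):

#include <stdint.h>

static inline uint32_t
combine1_model (const uint32_t *ps, const uint32_t *pm)
{
    uint32_t s = *ps;

    if (pm)
    {
        uint32_t m = *pm >> 24;   /* unified mask: only its alpha is used */
        uint32_t out = 0;
        int i;

        for (i = 0; i < 32; i += 8)
        {
            /* c * m / 255 with rounding, one channel at a time */
            uint32_t t = ((s >> i) & 0xff) * m + 0x80;

            out |= (((t + (t >> 8)) >> 8) & 0xff) << i;
        }
        s = out;
    }

    return s;
}
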
@@ -547,7 +612,10 @@ combine4 (const __m128i *ps, const __m128i *pm)
 }
 
 static force_inline void
-core_combine_over_u_sse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w)
+core_combine_over_u_sse2 (uint32_t*       pd,
+                          const uint32_t* ps,
+                          const uint32_t* pm,
+                          int             w)
 {
     uint32_t s, d;
 
@@ -564,14 +632,14 @@ core_combine_over_u_sse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm,
     while (w &&
            ((unsigned long)pd & 15))
     {
-        d = *pd;
-        s = combine1 (ps, pm);
+       d = *pd;
+       s = combine1 (ps, pm);
 
-        *pd++ = core_combine_over_u_pixel_sse2 (s, d);
+       *pd++ = core_combine_over_u_pixel_sse2 (s, d);
        ps++;
        if (pm)
            pm++;
-        w--;
+       w--;
     }
 
     /* call prefetch hint to optimize cache load*/
@@ -581,55 +649,65 @@ core_combine_over_u_sse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm,
 
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
+       /* fill cache line with next memory */
+       cache_prefetch_next ((__m128i*)ps);
+       cache_prefetch_next ((__m128i*)pd);
        cache_prefetch_next ((__m128i*)pm);
 
-        /* I'm loading unaligned because I'm not sure about the address alignment. */
-        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
-
-        if (is_opaque (xmm_src_hi))
-        {
-            save_128_aligned ((__m128i*)pd, xmm_src_hi);
-        }
-        else if (!is_zero (xmm_src_hi))
-        {
-            xmm_dst_hi = load_128_aligned ((__m128i*) pd);
-
-            unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
-            unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-
-            expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
-
-            over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi, &xmm_dst_lo, &xmm_dst_hi);
-
-            /* rebuid the 4 pixel data and save*/
-            save_128_aligned ((__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-        }
-
-        w -= 4;
-        ps += 4;
-        pd += 4;
+       /* I'm loading unaligned because I'm not sure about
+        * the address alignment.
+        */
+       xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
+
+       if (is_opaque (xmm_src_hi))
+       {
+           save_128_aligned ((__m128i*)pd, xmm_src_hi);
+       }
+       else if (!is_zero (xmm_src_hi))
+       {
+           xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+
+           unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+           unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+           expand_alpha_2x128 (
+               xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
+
+           over_2x128 (&xmm_src_lo, &xmm_src_hi,
+                       &xmm_alpha_lo, &xmm_alpha_hi,
+                       &xmm_dst_lo, &xmm_dst_hi);
+
+           /* rebuild the 4 pixel data and save */
+           save_128_aligned ((__m128i*)pd,
+                             pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+       }
+
+       w -= 4;
+       ps += 4;
+       pd += 4;
        if (pm)
            pm += 4;
     }
 
     while (w)
     {
-        d = *pd;
-        s = combine1 (ps, pm);
+       d = *pd;
+       s = combine1 (ps, pm);
 
-        *pd++ = core_combine_over_u_pixel_sse2 (s, d);
+       *pd++ = core_combine_over_u_pixel_sse2 (s, d);
        ps++;
        if (pm)
            pm++;
-        w--;
+
+       w--;
     }
 }
 
 static force_inline void
-core_combine_over_reverse_u_sse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w)
+core_combine_over_reverse_u_sse2 (uint32_t*       pd,
+                                  const uint32_t* ps,
+                                  const uint32_t* pm,
+                                  int             w)
 {
     uint32_t s, d;
 
@@ -646,11 +724,11 @@ core_combine_over_reverse_u_sse2 (uint32_t* pd, const uint32_t* ps, const uint32
     while (w &&
            ((unsigned long)pd & 15))
     {
-        d = *pd;
-        s = combine1 (ps, pm);
+       d = *pd;
+       s = combine1 (ps, pm);
 
-        *pd++ = core_combine_over_u_pixel_sse2 (d, s);
-        w--;
+       *pd++ = core_combine_over_u_pixel_sse2 (d, s);
+       w--;
        ps++;
        if (pm)
            pm++;
@@ -663,40 +741,47 @@ core_combine_over_reverse_u_sse2 (uint32_t* pd, const uint32_t* ps, const uint32
 
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
+       /* fill cache line with next memory */
+       cache_prefetch_next ((__m128i*)ps);
+       cache_prefetch_next ((__m128i*)pd);
        cache_prefetch_next ((__m128i*)pm);
 
-        /* I'm loading unaligned because I'm not sure about the address alignment. */
-        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
-        xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+       /* I'm loading unaligned because I'm not sure
+        * about the address alignment.
+        */
+       xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
+       xmm_dst_hi = load_128_aligned ((__m128i*) pd);
 
-        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
-        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+       unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+       unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
 
-        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_alpha_lo, &xmm_alpha_hi);
+       expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+                           &xmm_alpha_lo, &xmm_alpha_hi);
 
-        over_2x128 (&xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_lo, &xmm_alpha_hi, &xmm_src_lo, &xmm_src_hi);
+       over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+                   &xmm_alpha_lo, &xmm_alpha_hi,
+                   &xmm_src_lo, &xmm_src_hi);
 
-        /* rebuid the 4 pixel data and save*/
-        save_128_aligned ((__m128i*)pd, pack_2x128_128 (xmm_src_lo, xmm_src_hi));
+       /* rebuild the 4 pixel data and save */
+       save_128_aligned ((__m128i*)pd,
+                         pack_2x128_128 (xmm_src_lo, xmm_src_hi));
+
+       w -= 4;
+       ps += 4;
+       pd += 4;
 
-        w -= 4;
-        ps += 4;
-        pd += 4;
        if (pm)
            pm += 4;
     }
 
     while (w)
     {
-        d = *pd;
-        s = combine1 (ps, pm);
+       d = *pd;
+       s = combine1 (ps, pm);
 
-        *pd++ = core_combine_over_u_pixel_sse2 (d, s);
+       *pd++ = core_combine_over_u_pixel_sse2 (d, s);
        ps++;
-        w--;
+       w--;
        if (pm)
            pm++;
     }
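
core_combine_over_reverse_u_sse2 is the same walk as core_combine_over_u_sse2 with the operands swapped in the per-pixel call ("d OVER s", so the existing destination stays on top). Both follow the loop shape used throughout this file, sketched below with a hypothetical combine_skeleton; the mask handling and prefetch hints are omitted for brevity.

#include <stdint.h>

static void
combine_skeleton (uint32_t *pd, const uint32_t *ps, int w,
                  uint32_t (*combine_pixel) (uint32_t s, uint32_t d))
{
    while (w && ((uintptr_t) pd & 15))   /* head: run scalar until pd is 16-byte aligned */
    {
        *pd = combine_pixel (*ps++, *pd);
        pd++;
        w--;
    }

    while (w >= 4)                       /* body: 4 pixels per iteration */
    {
        /* ... 128-bit work equivalent to four combine_pixel calls,
         * ending in save_128_aligned ((__m128i*)pd, ...) ...
         */
        ps += 4;
        pd += 4;
        w -= 4;
    }

    while (w)                            /* tail: the last 0-3 pixels */
    {
        *pd = combine_pixel (*ps++, *pd);
        pd++;
        w--;
    }
}
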
@@ -709,18 +794,23 @@ core_combine_in_u_pixelsse2 (uint32_t src, uint32_t dst)
 
     if (maska == 0)
     {
-        return 0;
+       return 0;
     }
     else if (maska != 0xff)
     {
-        return pack_1x64_32(pix_multiply_1x64 (unpack_32_1x64 (dst), expand_alpha_1x64 (unpack_32_1x64 (src))));
+       return pack_1x64_32 (
+           pix_multiply_1x64 (unpack_32_1x64 (dst),
+                              expand_alpha_1x64 (unpack_32_1x64 (src))));
     }
 
     return dst;
 }
 
 static force_inline void
-core_combine_in_u_sse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w)
+core_combine_in_u_sse2 (uint32_t*       pd,
+                        const uint32_t* ps,
+                        const uint32_t* pm,
+                        int             w)
 {
     uint32_t s, d;
 
@@ -734,11 +824,11 @@ core_combine_in_u_sse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, in
 
     while (w && ((unsigned long) pd & 15))
     {
-        s = combine1 (ps, pm);
-        d = *pd;
+       s = combine1 (ps, pm);
+       d = *pd;
 
-        *pd++ = core_combine_in_u_pixelsse2 (d, s);
-        w--;
+       *pd++ = core_combine_in_u_pixelsse2 (d, s);
+       w--;
        ps++;
        if (pm)
            pm++;
@@ -751,36 +841,39 @@ core_combine_in_u_sse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, in
 
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
+       /* fill cache line with next memory */
+       cache_prefetch_next ((__m128i*)ps);
+       cache_prefetch_next ((__m128i*)pd);
        cache_prefetch_next ((__m128i*)pm);
 
-        xmm_dst_hi = load_128_aligned ((__m128i*) pd);
-        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
+       xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+       xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
 
-        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+       unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+       expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
 
-        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
-        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_dst_lo, &xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+       unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+       pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+                           &xmm_dst_lo, &xmm_dst_hi,
+                           &xmm_dst_lo, &xmm_dst_hi);
 
-        save_128_aligned ((__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+       save_128_aligned ((__m128i*)pd,
+                         pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
 
-        ps += 4;
-        pd += 4;
-        w -= 4;
+       ps += 4;
+       pd += 4;
+       w -= 4;
        if (pm)
            pm += 4;
     }
 
     while (w)
     {
-        s = combine1 (ps, pm);
-        d = *pd;
+       s = combine1 (ps, pm);
+       d = *pd;
 
-        *pd++ = core_combine_in_u_pixelsse2 (d, s);
-        w--;
+       *pd++ = core_combine_in_u_pixelsse2 (d, s);
+       w--;
        ps++;
        if (pm)
            pm++;
@@ -788,7 +881,10 @@ core_combine_in_u_sse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, in
 }
 
 static force_inline void
-core_combine_reverse_in_u_sse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, int w)
+core_combine_reverse_in_u_sse2 (uint32_t*       pd,
+                                const uint32_t* ps,
+                                const uint32_t *pm,
+                                int             w)
 {
     uint32_t s, d;
 
@@ -802,12 +898,12 @@ core_combine_reverse_in_u_sse2 (uint32_t* pd, const uint32_t* ps, const uint32_t
 
     while (w && ((unsigned long) pd & 15))
     {
-        s = combine1 (ps, pm);
-        d = *pd;
+       s = combine1 (ps, pm);
+       d = *pd;
 
-        *pd++ = core_combine_in_u_pixelsse2 (s, d);
+       *pd++ = core_combine_in_u_pixelsse2 (s, d);
        ps++;
-        w--;
+       w--;
        if (pm)
            pm++;
     }
@@ -819,36 +915,39 @@ core_combine_reverse_in_u_sse2 (uint32_t* pd, const uint32_t* ps, const uint32_t
 
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
+       /* fill cache line with next memory */
+       cache_prefetch_next ((__m128i*)ps);
+       cache_prefetch_next ((__m128i*)pd);
        cache_prefetch_next ((__m128i*)pm);
 
-        xmm_dst_hi = load_128_aligned ((__m128i*) pd);
-        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
+       xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+       xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
 
-        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
-        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+       unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+       expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
 
-        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, &xmm_src_lo, &xmm_src_hi, &xmm_dst_lo, &xmm_dst_hi);
+       unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+       pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+                           &xmm_src_lo, &xmm_src_hi,
+                           &xmm_dst_lo, &xmm_dst_hi);
 
-        save_128_aligned ((__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+       save_128_aligned (
+           (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
 
-        ps += 4;
-        pd += 4;
-        w -= 4;
+       ps += 4;
+       pd += 4;
+       w -= 4;
        if (pm)
            pm += 4;
     }
 
     while (w)
     {
-        s = combine1 (ps, pm);
-        d = *pd;
+       s = combine1 (ps, pm);
+       d = *pd;
 
-        *pd++ = core_combine_in_u_pixelsse2 (s, d);
-        w--;
+       *pd++ = core_combine_in_u_pixelsse2 (s, d);
+       w--;
        ps++;
        if (pm)
            pm++;
@@ -856,7 +955,10 @@ core_combine_reverse_in_u_sse2 (uint32_t* pd, const uint32_t* ps, const uint32_t
 }
 
 static force_inline void
-core_combine_reverse_out_u_sse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w)
+core_combine_reverse_out_u_sse2 (uint32_t*       pd,
+                                 const uint32_t* ps,
+                                 const uint32_t* pm,
+                                 int             w)
 {
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
@@ -865,14 +967,18 @@ core_combine_reverse_out_u_sse2 (uint32_t* pd, const uint32_t* ps, const uint32_
 
     while (w && ((unsigned long) pd & 15))
     {
-        uint32_t s = combine1 (ps, pm);
-        uint32_t d = *pd;
+       uint32_t s = combine1 (ps, pm);
+       uint32_t d = *pd;
 
-        *pd++ = pack_1x64_32 (pix_multiply_1x64 (unpack_32_1x64 (d), negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (s)))));
+       *pd++ = pack_1x64_32 (
+           pix_multiply_1x64 (
+               unpack_32_1x64 (d), negate_1x64 (
+                   expand_alpha_1x64 (unpack_32_1x64 (s)))));
+       
        if (pm)
            pm++;
        ps++;
-        w--;
+       w--;
     }
 
     /* call prefetch hint to optimize cache load*/
@@ -882,49 +988,59 @@ core_combine_reverse_out_u_sse2 (uint32_t* pd, const uint32_t* ps, const uint32_
 
     while (w >= 4)
     {
-        __m128i xmm_src_lo, xmm_src_hi;
-        __m128i xmm_dst_lo, xmm_dst_hi;
+       __m128i xmm_src_lo, xmm_src_hi;
+       __m128i xmm_dst_lo, xmm_dst_hi;
 
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
+       /* fill cache line with next memory */
+       cache_prefetch_next ((__m128i*)ps);
+       cache_prefetch_next ((__m128i*)pd);
        cache_prefetch_next ((__m128i*)pm);
 
-        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
-        xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+       xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
+       xmm_dst_hi = load_128_aligned ((__m128i*) pd);
 
-        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
-        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+       unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+       unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
 
-        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
-        negate_2x128      (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+       expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+       negate_2x128       (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
 
-        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, &xmm_src_lo, &xmm_src_hi, &xmm_dst_lo, &xmm_dst_hi);
+       pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+                           &xmm_src_lo, &xmm_src_hi,
+                           &xmm_dst_lo, &xmm_dst_hi);
 
-        save_128_aligned ((__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+       save_128_aligned (
+           (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
 
-        ps += 4;
-        pd += 4;
+       ps += 4;
+       pd += 4;
        if (pm)
            pm += 4;
-        w -= 4;
+
+       w -= 4;
     }
 
     while (w)
     {
-        uint32_t s = combine1 (ps, pm);
-        uint32_t d = *pd;
+       uint32_t s = combine1 (ps, pm);
+       uint32_t d = *pd;
 
-        *pd++ = pack_1x64_32 (pix_multiply_1x64 (unpack_32_1x64 (d), negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (s)))));
+       *pd++ = pack_1x64_32 (
+           pix_multiply_1x64 (
+               unpack_32_1x64 (d), negate_1x64 (
+                   expand_alpha_1x64 (unpack_32_1x64 (s)))));
        ps++;
        if (pm)
            pm++;
-        w--;
+       w--;
     }
 }
 
 static force_inline void
-core_combine_out_u_sse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w)
+core_combine_out_u_sse2 (uint32_t*       pd,
+                         const uint32_t* ps,
+                         const uint32_t* pm,
+                         int             w)
 {
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
@@ -933,11 +1049,14 @@ core_combine_out_u_sse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, i
 
     while (w && ((unsigned long) pd & 15))
     {
-        uint32_t s = combine1 (ps, pm);
-        uint32_t d = *pd;
-
-        *pd++ = pack_1x64_32 (pix_multiply_1x64 (unpack_32_1x64 (s), negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
-        w--;
+       uint32_t s = combine1 (ps, pm);
+       uint32_t d = *pd;
+
+       *pd++ = pack_1x64_32 (
+           pix_multiply_1x64 (
+               unpack_32_1x64 (s), negate_1x64 (
+                   expand_alpha_1x64 (unpack_32_1x64 (d)))));
+       w--;
        ps++;
        if (pm)
            pm++;
@@ -950,41 +1069,47 @@ core_combine_out_u_sse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, i
 
     while (w >= 4)
     {
-        __m128i xmm_src_lo, xmm_src_hi;
-        __m128i xmm_dst_lo, xmm_dst_hi;
+       __m128i xmm_src_lo, xmm_src_hi;
+       __m128i xmm_dst_lo, xmm_dst_hi;
 
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
+       /* fill cache line with next memory */
+       cache_prefetch_next ((__m128i*)ps);
+       cache_prefetch_next ((__m128i*)pd);
        cache_prefetch_next ((__m128i*)pm);
 
-        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
-        xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+       xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
+       xmm_dst_hi = load_128_aligned ((__m128i*) pd);
 
-        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
-        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+       unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+       unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
 
-        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-        negate_2x128      (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+       expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+       negate_2x128       (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
 
-        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_dst_lo, &xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+       pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+                           &xmm_dst_lo, &xmm_dst_hi,
+                           &xmm_dst_lo, &xmm_dst_hi);
 
-        save_128_aligned ((__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+       save_128_aligned (
+           (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
 
-        ps += 4;
-        pd += 4;
-        w -= 4;
+       ps += 4;
+       pd += 4;
+       w -= 4;
        if (pm)
            pm += 4;
     }
 
     while (w)
     {
-        uint32_t s = combine1 (ps, pm);
-        uint32_t d = *pd;
-
-        *pd++ = pack_1x64_32 (pix_multiply_1x64 (unpack_32_1x64 (s), negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
-        w--;
+       uint32_t s = combine1 (ps, pm);
+       uint32_t d = *pd;
+
+       *pd++ = pack_1x64_32 (
+           pix_multiply_1x64 (
+               unpack_32_1x64 (s), negate_1x64 (
+                   expand_alpha_1x64 (unpack_32_1x64 (d)))));
+       w--;
        ps++;
        if (pm)
            pm++;
@@ -992,7 +1117,8 @@ core_combine_out_u_sse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, i
 }
 
 static force_inline uint32_t
-core_combine_atop_u_pixel_sse2 (uint32_t src, uint32_t dst)
+core_combine_atop_u_pixel_sse2 (uint32_t src,
+                                uint32_t dst)
 {
     __m64 s = unpack_32_1x64 (src);
     __m64 d = unpack_32_1x64 (dst);
@@ -1004,7 +1130,10 @@ core_combine_atop_u_pixel_sse2 (uint32_t src, uint32_t dst)
 }
 
 static force_inline void
-core_combine_atop_u_sse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w)
+core_combine_atop_u_sse2 (uint32_t*       pd,
+                          const uint32_t* ps,
+                          const uint32_t* pm,
+                          int             w)
 {
     uint32_t s, d;
 
@@ -1020,11 +1149,11 @@ core_combine_atop_u_sse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm,
 
     while (w && ((unsigned long) pd & 15))
     {
-        s = combine1 (ps, pm);
-        d = *pd;
+       s = combine1 (ps, pm);
+       d = *pd;
 
-        *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
-        w--;
+       *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
+       w--;
        ps++;
        if (pm)
            pm++;
@@ -1037,42 +1166,47 @@ core_combine_atop_u_sse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm,
 
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
+       /* fill cache line with next memory */
+       cache_prefetch_next ((__m128i*)ps);
+       cache_prefetch_next ((__m128i*)pd);
        cache_prefetch_next ((__m128i*)pm);
 
-        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
-        xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+       xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
+       xmm_dst_hi = load_128_aligned ((__m128i*) pd);
 
-        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
-        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+       unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+       unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
 
-        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi);
-        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+       expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+                           &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+       expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+                           &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
 
-        negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+       negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
+                     &xmm_alpha_src_lo, &xmm_alpha_src_hi);
 
-        pix_add_multiply_2x128 ( &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
-                               &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
-                               &xmm_dst_lo, &xmm_dst_hi );
+       pix_add_multiply_2x128 (
+           &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
+           &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
+           &xmm_dst_lo, &xmm_dst_hi);
 
-        save_128_aligned ((__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+       save_128_aligned (
+           (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
 
-        ps += 4;
-        pd += 4;
-        w -= 4;
+       ps += 4;
+       pd += 4;
+       w -= 4;
        if (pm)
            pm += 4;
     }
 
     while (w)
     {
-        s = combine1 (ps, pm);
-        d = *pd;
+       s = combine1 (ps, pm);
+       d = *pd;
 
-        *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
-        w--;
+       *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
+       w--;
        ps++;
        if (pm)
            pm++;
@@ -1080,7 +1214,8 @@ core_combine_atop_u_sse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm,
 }
 
 static force_inline uint32_t
-core_combine_reverse_atop_u_pixel_sse2 (uint32_t src, uint32_t dst)
+core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
+                                        uint32_t dst)
 {
     __m64 s = unpack_32_1x64 (src);
     __m64 d = unpack_32_1x64 (dst);
@@ -1092,7 +1227,10 @@ core_combine_reverse_atop_u_pixel_sse2 (uint32_t src, uint32_t dst)
 }
 
 static force_inline void
-core_combine_reverse_atop_u_sse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w)
+core_combine_reverse_atop_u_sse2 (uint32_t*       pd,
+                                  const uint32_t* ps,
+                                  const uint32_t* pm,
+                                  int             w)
 {
     uint32_t s, d;
 
@@ -1108,12 +1246,12 @@ core_combine_reverse_atop_u_sse2 (uint32_t* pd, const uint32_t* ps, const uint32
 
     while (w && ((unsigned long) pd & 15))
     {
-        s = combine1 (ps, pm);
-        d = *pd;
+       s = combine1 (ps, pm);
+       d = *pd;
 
-        *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
+       *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
        ps++;
-        w--;
+       w--;
        if (pm)
            pm++;
     }
@@ -1125,50 +1263,56 @@ core_combine_reverse_atop_u_sse2 (uint32_t* pd, const uint32_t* ps, const uint32
 
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
+       /* fill cache line with next memory */
+       cache_prefetch_next ((__m128i*)ps);
+       cache_prefetch_next ((__m128i*)pd);
        cache_prefetch_next ((__m128i*)pm);
 
-        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
-        xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+       xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
+       xmm_dst_hi = load_128_aligned ((__m128i*) pd);
 
-        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
-        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+       unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+       unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
 
-        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi);
-        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+       expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+                           &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+       expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+                           &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
 
-        negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+       negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
+                     &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
 
-        pix_add_multiply_2x128 ( &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
-                               &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
-                               &xmm_dst_lo, &xmm_dst_hi );
+       pix_add_multiply_2x128 (
+           &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
+           &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
+           &xmm_dst_lo, &xmm_dst_hi);
 
-        save_128_aligned ((__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+       save_128_aligned (
+           (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
 
-        ps += 4;
-        pd += 4;
-        w -= 4;
+       ps += 4;
+       pd += 4;
+       w -= 4;
        if (pm)
            pm += 4;
     }
 
     while (w)
     {
-        s = combine1 (ps, pm);
-        d = *pd;
+       s = combine1 (ps, pm);
+       d = *pd;
 
-        *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
+       *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
        ps++;
-        w--;
+       w--;
        if (pm)
            pm++;
     }
 }
 
 static force_inline uint32_t
-core_combine_xor_u_pixel_sse2 (uint32_t src, uint32_t dst)
+core_combine_xor_u_pixel_sse2 (uint32_t src,
+                               uint32_t dst)
 {
     __m64 s = unpack_32_1x64 (src);
     __m64 d = unpack_32_1x64 (dst);
@@ -1180,14 +1324,17 @@ core_combine_xor_u_pixel_sse2 (uint32_t src, uint32_t dst)
 }
 
 static force_inline void
-core_combine_xor_u_sse2 (uint32_t* dst, const uint32_t* src, const uint32_t *mask, int width)
+core_combine_xor_u_sse2 (uint32_t*       dst,
+                         const uint32_t* src,
+                         const uint32_t *mask,
+                         int             width)
 {
     int w = width;
     uint32_t s, d;
     uint32_t* pd = dst;
     const uint32_t* ps = src;
     const uint32_t* pm = mask;
-    
+
     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
@@ -1200,11 +1347,11 @@ core_combine_xor_u_sse2 (uint32_t* dst, const uint32_t* src, const uint32_t *mas
 
     while (w && ((unsigned long) pd & 15))
     {
-        s = combine1 (ps, pm);
-        d = *pd;
+       s = combine1 (ps, pm);
+       d = *pd;
 
-        *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
-        w--;
+       *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
+       w--;
        ps++;
        if (pm)
            pm++;
@@ -1217,43 +1364,49 @@ core_combine_xor_u_sse2 (uint32_t* dst, const uint32_t* src, const uint32_t *mas
 
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
+       /* fill cache line with next memory */
+       cache_prefetch_next ((__m128i*)ps);
+       cache_prefetch_next ((__m128i*)pd);
        cache_prefetch_next ((__m128i*)pm);
 
-        xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
-        xmm_dst = load_128_aligned ((__m128i*) pd);
+       xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
+       xmm_dst = load_128_aligned ((__m128i*) pd);
 
-        unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
-        unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+       unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+       unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
 
-        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi);
-        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+       expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+                           &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+       expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+                           &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
 
-        negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi);
-        negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+       negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
+                     &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+       negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
+                     &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
 
-        pix_add_multiply_2x128 ( &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
-                               &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
-                               &xmm_dst_lo, &xmm_dst_hi );
+       pix_add_multiply_2x128 (
+           &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
+           &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
+           &xmm_dst_lo, &xmm_dst_hi);
 
-        save_128_aligned ((__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+       save_128_aligned (
+           (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
 
-        ps += 4;
-        pd += 4;
-        w -= 4;
+       ps += 4;
+       pd += 4;
+       w -= 4;
        if (pm)
            pm += 4;
     }
 
     while (w)
     {
-        s = combine1 (ps, pm);
-        d = *pd;
+       s = combine1 (ps, pm);
+       d = *pd;
 
-        *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
-        w--;
+       *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
+       w--;
        ps++;
        if (pm)
            pm++;
@@ -1261,10 +1414,13 @@ core_combine_xor_u_sse2 (uint32_t* dst, const uint32_t* src, const uint32_t *mas
 }
 
 static force_inline void
-core_combine_add_u_sse2 (uint32_t* dst, const uint32_t* src, const uint32_t* mask, int width)
+core_combine_add_u_sse2 (uint32_t*       dst,
+                         const uint32_t* src,
+                         const uint32_t* mask,
+                         int             width)
 {
     int w = width;
-    uint32_t s,d;
+    uint32_t s, d;
     uint32_t* pd = dst;
     const uint32_t* ps = src;
     const uint32_t* pm = mask;
@@ -1276,13 +1432,15 @@ core_combine_add_u_sse2 (uint32_t* dst, const uint32_t* src, const uint32_t* mas
 
     while (w && (unsigned long)pd & 15)
     {
-        s = combine1 (ps, pm);
-        d = *pd;
+       s = combine1 (ps, pm);
+       d = *pd;
+
        ps++;
        if (pm)
            pm++;
-        *pd++ = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
-        w--;
+       *pd++ = _mm_cvtsi64_si32 (
+           _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
+       w--;
     }
 
     /* call prefetch hint to optimize cache load*/
@@ -1293,36 +1451,40 @@ core_combine_add_u_sse2 (uint32_t* dst, const uint32_t* src, const uint32_t* mas
     while (w >= 4)
     {
        __m128i s;
-       
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
+
+       /* fill cache line with next memory */
+       cache_prefetch_next ((__m128i*)ps);
+       cache_prefetch_next ((__m128i*)pd);
        cache_prefetch_next ((__m128i*)pm);
 
-       s = combine4((__m128i*)ps,(__m128i*)pm);
-       
-        save_128_aligned( (__m128i*)pd,
-                        _mm_adds_epu8( s, load_128_aligned  ((__m128i*)pd)) );
-        pd += 4;
-        ps += 4;
+       s = combine4 ((__m128i*)ps, (__m128i*)pm);
+
+       save_128_aligned (
+           (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned  ((__m128i*)pd)));
+
+       pd += 4;
+       ps += 4;
        if (pm)
            pm += 4;
-        w -= 4;
+       w -= 4;
     }
 
     while (w--)
     {
-        s = combine1 (ps, pm);
-        d = *pd;
+       s = combine1 (ps, pm);
+       d = *pd;
+
        ps++;
-        *pd++ = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
+       *pd++ = _mm_cvtsi64_si32 (
+           _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
        if (pm)
            pm++;
     }
 }
 
 static force_inline uint32_t
-core_combine_saturate_u_pixel_sse2 (uint32_t src, uint32_t dst)
+core_combine_saturate_u_pixel_sse2 (uint32_t src,
+                                    uint32_t dst)
 {
     __m64 ms = unpack_32_1x64 (src);
     __m64 md = unpack_32_1x64 (dst);
@@ -1331,16 +1493,20 @@ core_combine_saturate_u_pixel_sse2 (uint32_t src, uint32_t dst)
 
     if (sa > da)
     {
-        ms = pix_multiply_1x64 (ms, expand_alpha_1x64 (unpack_32_1x64 (DIV_UN8(da, sa) << 24)));
+       ms = pix_multiply_1x64 (
+           ms, expand_alpha_1x64 (unpack_32_1x64 (DIV_UN8 (da, sa) << 24)));
     }
 
     return pack_1x64_32 (_mm_adds_pu16 (md, ms));
 }
 
 static force_inline void
-core_combine_saturate_u_sse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w)
+core_combine_saturate_u_sse2 (uint32_t *      pd,
+                              const uint32_t *ps,
+                              const uint32_t *pm,
+                              int             w)
 {
-    uint32_t s,d;
+    uint32_t s, d;
 
     uint32_t pack_cmp;
     __m128i xmm_src, xmm_dst;
@@ -1352,10 +1518,11 @@ core_combine_saturate_u_sse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *
 
     while (w && (unsigned long)pd & 15)
     {
-        s = combine1 (ps, pm);
-        d = *pd;
-        *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
-        w--;
+       s = combine1 (ps, pm);
+       d = *pd;
+
+       *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
+       w--;
        ps++;
        if (pm)
            pm++;
@@ -1368,62 +1535,65 @@ core_combine_saturate_u_sse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *
 
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
+       /* fill cache line with next memory */
+       cache_prefetch_next ((__m128i*)ps);
+       cache_prefetch_next ((__m128i*)pd);
        cache_prefetch_next ((__m128i*)pm);
 
-        xmm_dst = load_128_aligned  ((__m128i*)pd);
-        xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
+       xmm_dst = load_128_aligned  ((__m128i*)pd);
+       xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
 
-        pack_cmp = _mm_movemask_epi8 (_mm_cmpgt_epi32 (_mm_srli_epi32 (xmm_src, 24),
-                                                      _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));
+       pack_cmp = _mm_movemask_epi8 (
+           _mm_cmpgt_epi32 (
+               _mm_srli_epi32 (xmm_src, 24),
+               _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));
 
-        /* if some alpha src is grater than respective ~alpha dst */
-        if (pack_cmp)
-        {
-            s = combine1 (ps++, pm);
-            d = *pd;
-            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
+       /* if some alpha src is greater than respective ~alpha dst */
+       if (pack_cmp)
+       {
+           s = combine1 (ps++, pm);
+           d = *pd;
+           *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;
 
-            s = combine1 (ps++, pm);
-            d = *pd;
-            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
+           s = combine1 (ps++, pm);
+           d = *pd;
+           *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;
 
-            s = combine1 (ps++, pm);
-            d = *pd;
-            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
+           s = combine1 (ps++, pm);
+           d = *pd;
+           *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;
 
-            s = combine1 (ps++, pm);
-            d = *pd;
-            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
+           s = combine1 (ps++, pm);
+           d = *pd;
+           *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;
-        }
-        else
-        {
-            save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));
+       }
+       else
+       {
+           save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));
 
-            pd += 4;
-            ps += 4;
+           pd += 4;
+           ps += 4;
            if (pm)
                pm += 4;
-        }
+       }
 
-        w -= 4;
+       w -= 4;
     }
 
     while (w--)
     {
-        s = combine1 (ps, pm);
-        d = *pd;
-        *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
+       s = combine1 (ps, pm);
+       d = *pd;
+
+       *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
        ps++;
        if (pm)
            pm++;
@@ -1431,7 +1601,10 @@ core_combine_saturate_u_sse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *
 }
 
 static force_inline void
-core_combine_src_ca_sse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, int w)
+core_combine_src_ca_sse2 (uint32_t*       pd,
+                          const uint32_t* ps,
+                          const uint32_t *pm,
+                          int             w)
 {
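+    /* component-alpha SRC: per channel, dest = src * mask */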
     uint32_t s, m;
 
@@ -1446,10 +1619,11 @@ core_combine_src_ca_sse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm,
 
     while (w && (unsigned long)pd & 15)
     {
-        s = *ps++;
-        m = *pm++;
-        *pd++ = pack_1x64_32 (pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
-        w--;
+       s = *ps++;
+       m = *pm++;
+       *pd++ = pack_1x64_32 (
+           pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
+       w--;
     }
 
     /* call prefetch hint to optimize cache load*/
@@ -1459,38 +1633,44 @@ core_combine_src_ca_sse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm,
 
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
+       /* fill cache line with next memory */
+       cache_prefetch_next ((__m128i*)ps);
+       cache_prefetch_next ((__m128i*)pd);
+       cache_prefetch_next ((__m128i*)pm);
 
-        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
-        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+       xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+       xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
 
-        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
-        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+       unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+       unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
 
-        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
+       pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+                           &xmm_mask_lo, &xmm_mask_hi,
+                           &xmm_dst_lo, &xmm_dst_hi);
 
-        save_128_aligned( (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+       save_128_aligned (
+           (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
 
-        ps += 4;
-        pd += 4;
-        pm += 4;
-        w -= 4;
+       ps += 4;
+       pd += 4;
+       pm += 4;
+       w -= 4;
     }
 
     while (w)
     {
-        s = *ps++;
-        m = *pm++;
-        *pd++ = pack_1x64_32 (pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
-        w--;
+       s = *ps++;
+       m = *pm++;
+       *pd++ = pack_1x64_32 (
+           pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
+       w--;
     }
 }
 
 static force_inline uint32_t
-core_combine_over_ca_pixel_sse2 (uint32_t src, uint32_t mask, uint32_t dst)
+core_combine_over_ca_pixel_sse2 (uint32_t src,
+                                 uint32_t mask,
+                                 uint32_t dst)
 {
     __m64 s = unpack_32_1x64 (src);
     __m64 expAlpha = expand_alpha_1x64 (s);
@@ -1501,7 +1681,10 @@ core_combine_over_ca_pixel_sse2 (uint32_t src, uint32_t mask, uint32_t dst)
 }
 
 static force_inline void
-core_combine_over_ca_sse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, int w)
+core_combine_over_ca_sse2 (uint32_t*       pd,
+                           const uint32_t* ps,
+                           const uint32_t *pm,
+                           int             w)
 {
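+    /* component-alpha OVER: dest = src*mask + dest*(1 - src.alpha*mask) */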
     uint32_t s, m, d;
 
@@ -1517,12 +1700,12 @@ core_combine_over_ca_sse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm,
 
     while (w && (unsigned long)pd & 15)
     {
-        s = *ps++;
-        m = *pm++;
-        d = *pd;
+       s = *ps++;
+       m = *pm++;
+       d = *pd;
 
-        *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
-        w--;
+       *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
+       w--;
     }
 
     /* call prefetch hint to optimize cache load*/
@@ -1532,52 +1715,65 @@ core_combine_over_ca_sse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm,
 
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
+       /* fill cache line with next memory */
+       cache_prefetch_next ((__m128i*)ps);
+       cache_prefetch_next ((__m128i*)pd);
+       cache_prefetch_next ((__m128i*)pm);
 
-        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
-        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
-        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+       xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+       xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+       xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
 
-        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
-        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+       unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+       unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+       unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
 
-        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
+       expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+                           &xmm_alpha_lo, &xmm_alpha_hi);
 
-        in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
+       in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
+                      &xmm_alpha_lo, &xmm_alpha_hi,
+                      &xmm_mask_lo, &xmm_mask_hi,
+                      &xmm_dst_lo, &xmm_dst_hi);
 
-        save_128_aligned( (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+       save_128_aligned (
+           (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
 
-        ps += 4;
-        pd += 4;
-        pm += 4;
-        w -= 4;
+       ps += 4;
+       pd += 4;
+       pm += 4;
+       w -= 4;
     }
 
     while (w)
     {
-        s = *ps++;
-        m = *pm++;
-        d = *pd;
+       s = *ps++;
+       m = *pm++;
+       d = *pd;
 
-        *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
-        w--;
+       *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
+       w--;
     }
 }
 
 static force_inline uint32_t
-core_combine_over_reverse_ca_pixel_sse2 (uint32_t src, uint32_t mask, uint32_t dst)
+core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
+                                         uint32_t mask,
+                                         uint32_t dst)
 {
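+    /* component-alpha OVER_REVERSE: dest = dest + src*mask*(1 - dest.alpha) */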
     __m64 d = unpack_32_1x64 (dst);
 
-       return pack_1x64_32(over_1x64 (d, expand_alpha_1x64 (d), pix_multiply_1x64 (unpack_32_1x64 (src), unpack_32_1x64 (mask))));
+    return pack_1x64_32 (
+       over_1x64 (d, expand_alpha_1x64 (d),
+                  pix_multiply_1x64 (unpack_32_1x64 (src),
+                                     unpack_32_1x64 (mask))));
 }
 
 static force_inline void
-core_combine_over_reverse_ca_sse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, int w)
+core_combine_over_reverse_ca_sse2 (uint32_t*       pd,
+                                   const uint32_t* ps,
+                                   const uint32_t *pm,
+                                   int             w)
 {
     uint32_t s, m, d;
 
@@ -1593,12 +1789,12 @@ core_combine_over_reverse_ca_sse2 (uint32_t* pd, const uint32_t* ps, const uint3
 
     while (w && (unsigned long)pd & 15)
     {
-        s = *ps++;
-        m = *pm++;
-        d = *pd;
+       s = *ps++;
+       m = *pm++;
+       d = *pd;
 
-        *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
-        w--;
+       *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
+       w--;
     }
 
     /* call prefetch hint to optimize cache load*/
@@ -1608,45 +1804,54 @@ core_combine_over_reverse_ca_sse2 (uint32_t* pd, const uint32_t* ps, const uint3
 
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
+       /* fill cache line with next memory */
+       cache_prefetch_next ((__m128i*)ps);
+       cache_prefetch_next ((__m128i*)pd);
+       cache_prefetch_next ((__m128i*)pm);
 
-        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
-        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
-        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+       xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+       xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+       xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
 
-        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
-        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+       unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+       unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+       unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
 
-        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_alpha_lo, &xmm_alpha_hi);
-        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_mask_lo, &xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+       expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+                           &xmm_alpha_lo, &xmm_alpha_hi);
+       pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+                           &xmm_mask_lo, &xmm_mask_hi,
+                           &xmm_mask_lo, &xmm_mask_hi);
 
-        over_2x128 (&xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_lo, &xmm_alpha_hi, &xmm_mask_lo, &xmm_mask_hi);
+       over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+                   &xmm_alpha_lo, &xmm_alpha_hi,
+                   &xmm_mask_lo, &xmm_mask_hi);
 
-        save_128_aligned( (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
+       save_128_aligned (
+           (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
 
-        ps += 4;
-        pd += 4;
-        pm += 4;
-        w -= 4;
+       ps += 4;
+       pd += 4;
+       pm += 4;
+       w -= 4;
     }
 
     while (w)
     {
-        s = *ps++;
-        m = *pm++;
-        d = *pd;
+       s = *ps++;
+       m = *pm++;
+       d = *pd;
 
-        *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
-        w--;
+       *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
+       w--;
     }
 }
 
 static force_inline void
-core_combine_in_ca_sse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w)
+core_combine_in_ca_sse2 (uint32_t *      pd,
+                         const uint32_t *ps,
+                         const uint32_t *pm,
+                         int             w)
 {
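+    /* component-alpha IN: dest = src * mask * dest.alpha */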
     uint32_t s, m, d;
 
@@ -1662,13 +1867,16 @@ core_combine_in_ca_sse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, i
 
     while (w && (unsigned long)pd & 15)
     {
-        s = *ps++;
-        m = *pm++;
-        d = *pd;
+       s = *ps++;
+       m = *pm++;
+       d = *pd;
 
-        *pd++ = pack_1x64_32 (pix_multiply_1x64 (pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
-                                                expand_alpha_1x64 (unpack_32_1x64 (d))));
-        w--;
+       *pd++ = pack_1x64_32 (
+           pix_multiply_1x64 (
+               pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
+               expand_alpha_1x64 (unpack_32_1x64 (d))));
+
+       w--;
     }
 
     /* call prefetch hint to optimize cache load*/
@@ -1678,46 +1886,60 @@ core_combine_in_ca_sse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, i
 
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
+       /* fill cache line with next memory */
+       cache_prefetch_next ((__m128i*)ps);
+       cache_prefetch_next ((__m128i*)pd);
+       cache_prefetch_next ((__m128i*)pm);
 
-        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
-        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
-        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+       xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+       xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+       xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
 
-        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
-        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+       unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+       unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+       unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
 
-        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_alpha_lo, &xmm_alpha_hi);
-        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
+       expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+                           &xmm_alpha_lo, &xmm_alpha_hi);
 
-        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_lo, &xmm_alpha_hi, &xmm_dst_lo, &xmm_dst_hi);
+       pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+                           &xmm_mask_lo, &xmm_mask_hi,
+                           &xmm_dst_lo, &xmm_dst_hi);
 
-        save_128_aligned( (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+       pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+                           &xmm_alpha_lo, &xmm_alpha_hi,
+                           &xmm_dst_lo, &xmm_dst_hi);
 
-        ps += 4;
-        pd += 4;
-        pm += 4;
-        w -= 4;
+       save_128_aligned (
+           (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+       ps += 4;
+       pd += 4;
+       pm += 4;
+       w -= 4;
     }
 
     while (w)
     {
-        s = *ps++;
-        m = *pm++;
-        d = *pd;
+       s = *ps++;
+       m = *pm++;
+       d = *pd;
+
+       *pd++ = pack_1x64_32 (
+           pix_multiply_1x64 (
+               pix_multiply_1x64 (
+                   unpack_32_1x64 (s), unpack_32_1x64 (m)),
+               expand_alpha_1x64 (unpack_32_1x64 (d))));
 
-        *pd++ = pack_1x64_32 (pix_multiply_1x64 (pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
-                                                expand_alpha_1x64 (unpack_32_1x64 (d))));
-        w--;
+       w--;
     }
 }
 
 static force_inline void
-core_combine_in_reverse_ca_sse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w)
+core_combine_in_reverse_ca_sse2 (uint32_t *      pd,
+                                 const uint32_t *ps,
+                                 const uint32_t *pm,
+                                 int             w)
 {
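+    /* component-alpha IN_REVERSE: dest = dest * mask * src.alpha */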
     uint32_t s, m, d;
 
@@ -1733,14 +1955,16 @@ core_combine_in_reverse_ca_sse2 (uint32_t *pd, const uint32_t *ps, const uint32_
 
     while (w && (unsigned long)pd & 15)
     {
-        s = *ps++;
-        m = *pm++;
-        d = *pd;
-
-        *pd++ = pack_1x64_32 (pix_multiply_1x64 (unpack_32_1x64 (d),
-                                                pix_multiply_1x64 (unpack_32_1x64 (m),
-                                                                  expand_alpha_1x64 (unpack_32_1x64 (s)))));
-        w--;
+       s = *ps++;
+       m = *pm++;
+       d = *pd;
+
+       *pd++ = pack_1x64_32 (
+           pix_multiply_1x64 (
+               unpack_32_1x64 (d),
+               pix_multiply_1x64 (unpack_32_1x64 (m),
+                                  expand_alpha_1x64 (unpack_32_1x64 (s)))));
+       w--;
     }
 
     /* call prefetch hint to optimize cache load*/
@@ -1750,47 +1974,58 @@ core_combine_in_reverse_ca_sse2 (uint32_t *pd, const uint32_t *ps, const uint32_
 
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
+       /* fill cache line with next memory */
+       cache_prefetch_next ((__m128i*)ps);
+       cache_prefetch_next ((__m128i*)pd);
+       cache_prefetch_next ((__m128i*)pm);
 
-        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
-        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
-        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+       xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+       xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+       xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
 
-        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
-        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+       unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+       unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+       unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
 
-        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
-        pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, &xmm_alpha_lo, &xmm_alpha_hi, &xmm_alpha_lo, &xmm_alpha_hi);
+       expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+                           &xmm_alpha_lo, &xmm_alpha_hi);
+       pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
+                           &xmm_alpha_lo, &xmm_alpha_hi,
+                           &xmm_alpha_lo, &xmm_alpha_hi);
 
-        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_lo, &xmm_alpha_hi, &xmm_dst_lo, &xmm_dst_hi);
+       pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+                           &xmm_alpha_lo, &xmm_alpha_hi,
+                           &xmm_dst_lo, &xmm_dst_hi);
 
-        save_128_aligned( (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+       save_128_aligned (
+           (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
 
-        ps += 4;
-        pd += 4;
-        pm += 4;
-        w -= 4;
+       ps += 4;
+       pd += 4;
+       pm += 4;
+       w -= 4;
     }
 
     while (w)
     {
-        s = *ps++;
-        m = *pm++;
-        d = *pd;
-
-        *pd++ = pack_1x64_32 (pix_multiply_1x64 (unpack_32_1x64 (d),
-                                                pix_multiply_1x64 (unpack_32_1x64 (m),
-                                                                  expand_alpha_1x64 (unpack_32_1x64 (s)))));
-        w--;
+       s = *ps++;
+       m = *pm++;
+       d = *pd;
+
+       *pd++ = pack_1x64_32 (
+           pix_multiply_1x64 (
+               unpack_32_1x64 (d),
+               pix_multiply_1x64 (unpack_32_1x64 (m),
+                                  expand_alpha_1x64 (unpack_32_1x64 (s)))));
+       w--;
     }
 }
 
 static force_inline void
-core_combine_out_ca_sse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w)
+core_combine_out_ca_sse2 (uint32_t *      pd,
+                          const uint32_t *ps,
+                          const uint32_t *pm,
+                          int             w)
 {
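+    /* component-alpha OUT: dest = src * mask * (1 - dest.alpha) */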
     uint32_t s, m, d;
 
@@ -1806,13 +2041,16 @@ core_combine_out_ca_sse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm,
 
     while (w && (unsigned long)pd & 15)
     {
-        s = *ps++;
-        m = *pm++;
-        d = *pd;
-
-        *pd++ = pack_1x64_32 (pix_multiply_1x64 (pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
-                                                negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
-        w--;
+       s = *ps++;
+       m = *pm++;
+       d = *pd;
+
+       *pd++ = pack_1x64_32 (
+           pix_multiply_1x64 (
+               pix_multiply_1x64 (
+                   unpack_32_1x64 (s), unpack_32_1x64 (m)),
+               negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
+       w--;
     }
 
     /* call prefetch hint to optimize cache load*/
@@ -1822,47 +2060,61 @@ core_combine_out_ca_sse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm,
 
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
-
-        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
-        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
-        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
-
-        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
-        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
-        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_alpha_lo, &xmm_alpha_hi);
-        negate_2x128 (xmm_alpha_lo, xmm_alpha_hi, &xmm_alpha_lo, &xmm_alpha_hi);
-
-        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
-        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_lo, &xmm_alpha_hi, &xmm_dst_lo, &xmm_dst_hi);
-
-        save_128_aligned( (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+       /* fill cache line with next memory */
+       cache_prefetch_next ((__m128i*)ps);
+       cache_prefetch_next ((__m128i*)pd);
+       cache_prefetch_next ((__m128i*)pm);
 
-        ps += 4;
-        pd += 4;
-        pm += 4;
-        w -= 4;
+       xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+       xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+       xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+       unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+       unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+       unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+       expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+                           &xmm_alpha_lo, &xmm_alpha_hi);
+       negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
+                     &xmm_alpha_lo, &xmm_alpha_hi);
+
+       pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+                           &xmm_mask_lo, &xmm_mask_hi,
+                           &xmm_dst_lo, &xmm_dst_hi);
+       pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+                           &xmm_alpha_lo, &xmm_alpha_hi,
+                           &xmm_dst_lo, &xmm_dst_hi);
+
+       save_128_aligned (
+           (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+       ps += 4;
+       pd += 4;
+       pm += 4;
+       w -= 4;
     }
 
     while (w)
     {
-        s = *ps++;
-        m = *pm++;
-        d = *pd;
+       s = *ps++;
+       m = *pm++;
+       d = *pd;
+
+       *pd++ = pack_1x64_32 (
+           pix_multiply_1x64 (
+               pix_multiply_1x64 (
+                   unpack_32_1x64 (s), unpack_32_1x64 (m)),
+               negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
 
-        *pd++ = pack_1x64_32 (pix_multiply_1x64 (pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
-                                                negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
-        w--;
+       w--;
     }
 }
 
 static force_inline void
-core_combine_out_reverse_ca_sse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w)
+core_combine_out_reverse_ca_sse2 (uint32_t *      pd,
+                                  const uint32_t *ps,
+                                  const uint32_t *pm,
+                                  int             w)
 {
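+    /* component-alpha OUT_REVERSE: dest = dest * (1 - mask * src.alpha) */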
     uint32_t s, m, d;
 
@@ -1878,14 +2130,17 @@ core_combine_out_reverse_ca_sse2 (uint32_t *pd, const uint32_t *ps, const uint32
 
     while (w && (unsigned long)pd & 15)
     {
-        s = *ps++;
-        m = *pm++;
-        d = *pd;
-
-        *pd++ = pack_1x64_32 (pix_multiply_1x64 (unpack_32_1x64 (d),
-                                                negate_1x64 (pix_multiply_1x64 (unpack_32_1x64 (m),
-                                                                               expand_alpha_1x64 (unpack_32_1x64 (s))))));
-        w--;
+       s = *ps++;
+       m = *pm++;
+       d = *pd;
+
+       *pd++ = pack_1x64_32 (
+           pix_multiply_1x64 (
+               unpack_32_1x64 (d),
+               negate_1x64 (pix_multiply_1x64 (
+                                unpack_32_1x64 (m),
+                                expand_alpha_1x64 (unpack_32_1x64 (s))))));
+       w--;
     }
 
     /* call prefetch hint to optimize cache load*/
@@ -1895,50 +2150,62 @@ core_combine_out_reverse_ca_sse2 (uint32_t *pd, const uint32_t *ps, const uint32
 
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
+       /* fill cache line with next memory */
+       cache_prefetch_next ((__m128i*)ps);
+       cache_prefetch_next ((__m128i*)pd);
+       cache_prefetch_next ((__m128i*)pm);
 
-        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
-        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
-        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+       xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+       xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+       xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
 
-        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
-        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+       unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+       unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+       unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
 
-        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
+       expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+                           &xmm_alpha_lo, &xmm_alpha_hi);
 
-        pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, &xmm_alpha_lo, &xmm_alpha_hi, &xmm_mask_lo, &xmm_mask_hi);
+       pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
+                           &xmm_alpha_lo, &xmm_alpha_hi,
+                           &xmm_mask_lo, &xmm_mask_hi);
 
-        negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+       negate_2x128 (xmm_mask_lo, xmm_mask_hi,
+                     &xmm_mask_lo, &xmm_mask_hi);
 
-        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
+       pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+                           &xmm_mask_lo, &xmm_mask_hi,
+                           &xmm_dst_lo, &xmm_dst_hi);
 
-        save_128_aligned( (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+       save_128_aligned (
+           (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
 
-        ps += 4;
-        pd += 4;
-        pm += 4;
-        w -= 4;
+       ps += 4;
+       pd += 4;
+       pm += 4;
+       w -= 4;
     }
 
     while (w)
     {
-        s = *ps++;
-        m = *pm++;
-        d = *pd;
-
-        *pd++ = pack_1x64_32 (pix_multiply_1x64 (unpack_32_1x64 (d),
-                                                negate_1x64 (pix_multiply_1x64 (unpack_32_1x64 (m),
-                                                                               expand_alpha_1x64 (unpack_32_1x64 (s))))));
-        w--;
+       s = *ps++;
+       m = *pm++;
+       d = *pd;
+
+       *pd++ = pack_1x64_32 (
+           pix_multiply_1x64 (
+               unpack_32_1x64 (d),
+               negate_1x64 (pix_multiply_1x64 (
+                                unpack_32_1x64 (m),
+                                expand_alpha_1x64 (unpack_32_1x64 (s))))));
+       w--;
     }
 }
 
 static force_inline uint32_t
-core_combine_atop_ca_pixel_sse2 (uint32_t src, uint32_t mask, uint32_t dst)
+core_combine_atop_ca_pixel_sse2 (uint32_t src,
+                                 uint32_t mask,
+                                 uint32_t dst)
 {
     __m64 m = unpack_32_1x64 (mask);
     __m64 s = unpack_32_1x64 (src);
@@ -1953,7 +2220,10 @@ core_combine_atop_ca_pixel_sse2 (uint32_t src, uint32_t mask, uint32_t dst)
 }
 
 static force_inline void
-core_combine_atop_ca_sse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w)
+core_combine_atop_ca_sse2 (uint32_t *      pd,
+                           const uint32_t *ps,
+                           const uint32_t *pm,
+                           int             w)
 {
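+    /* component-alpha ATOP:
+     * dest = src*mask*dest.alpha + dest*(1 - mask*src.alpha) */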
     uint32_t s, m, d;
 
@@ -1970,12 +2240,12 @@ core_combine_atop_ca_sse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm,
 
     while (w && (unsigned long)pd & 15)
     {
-        s = *ps++;
-        m = *pm++;
-        d = *pd;
+       s = *ps++;
+       m = *pm++;
+       d = *pd;
 
-        *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
-        w--;
+       *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
+       w--;
     }
 
     /* call prefetch hint to optimize cache load*/
@@ -1985,52 +2255,62 @@ core_combine_atop_ca_sse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm,
 
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
+       /* fill cache line with next memory */
+       cache_prefetch_next ((__m128i*)ps);
+       cache_prefetch_next ((__m128i*)pd);
+       cache_prefetch_next ((__m128i*)pm);
 
-        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
-        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
-        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+       xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+       xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+       xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
 
-        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
-        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+       unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+       unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+       unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
 
-        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi);
-        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+       expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+                           &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+       expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+                           &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
 
-        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_mask_lo, &xmm_mask_hi, &xmm_src_lo, &xmm_src_hi);
-        pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi, &xmm_mask_lo, &xmm_mask_hi);
+       pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+                           &xmm_mask_lo, &xmm_mask_hi,
+                           &xmm_src_lo, &xmm_src_hi);
+       pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
+                           &xmm_alpha_src_lo, &xmm_alpha_src_hi,
+                           &xmm_mask_lo, &xmm_mask_hi);
 
-        negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+       negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
 
-        pix_add_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
-                              &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
-                              &xmm_dst_lo, &xmm_dst_hi);
+       pix_add_multiply_2x128 (
+           &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
+           &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
+           &xmm_dst_lo, &xmm_dst_hi);
 
-        save_128_aligned( (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+       save_128_aligned (
+           (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
 
-        ps += 4;
-        pd += 4;
-        pm += 4;
-        w -= 4;
+       ps += 4;
+       pd += 4;
+       pm += 4;
+       w -= 4;
     }
 
     while (w)
     {
-        s = *ps++;
-        m = *pm++;
-        d = *pd;
+       s = *ps++;
+       m = *pm++;
+       d = *pd;
 
-        *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
-        w--;
+       *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
+       w--;
     }
 }
 
 static force_inline uint32_t
-core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src, uint32_t mask, uint32_t dst)
+core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
+                                         uint32_t mask,
+                                         uint32_t dst)
 {
     __m64 m = unpack_32_1x64 (mask);
     __m64 s = unpack_32_1x64 (src);
@@ -2046,7 +2326,10 @@ core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src, uint32_t mask, uint32_t d
 }
 
 static force_inline void
-core_combine_reverse_atop_ca_sse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w)
+core_combine_reverse_atop_ca_sse2 (uint32_t *      pd,
+                                   const uint32_t *ps,
+                                   const uint32_t *pm,
+                                   int             w)
 {
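+    /* component-alpha ATOP_REVERSE:
+     * dest = src*mask*(1 - dest.alpha) + dest*mask*src.alpha */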
     uint32_t s, m, d;
 
@@ -2063,12 +2346,12 @@ core_combine_reverse_atop_ca_sse2 (uint32_t *pd, const uint32_t *ps, const uint3
 
     while (w && (unsigned long)pd & 15)
     {
-        s = *ps++;
-        m = *pm++;
-        d = *pd;
+       s = *ps++;
+       m = *pm++;
+       d = *pd;
 
-        *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
-        w--;
+       *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
+       w--;
     }
 
     /* call prefetch hint to optimize cache load*/
@@ -2078,69 +2361,84 @@ core_combine_reverse_atop_ca_sse2 (uint32_t *pd, const uint32_t *ps, const uint3
 
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
-
-        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
-        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
-        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
-
-        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
-        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
-        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi);
-        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
-
-        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_mask_lo, &xmm_mask_hi, &xmm_src_lo, &xmm_src_hi);
-        pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi, &xmm_mask_lo, &xmm_mask_hi);
-
-        negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
-
-        pix_add_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
-                              &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
-                              &xmm_dst_lo, &xmm_dst_hi);
-
-        save_128_aligned( (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+       /* fill cache line with next memory */
+       cache_prefetch_next ((__m128i*)ps);
+       cache_prefetch_next ((__m128i*)pd);
+       cache_prefetch_next ((__m128i*)pm);
 
-        ps += 4;
-        pd += 4;
-        pm += 4;
-        w -= 4;
+       xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+       xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+       xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+       unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+       unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+       unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+       expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+                           &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+       expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+                           &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+       pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+                           &xmm_mask_lo, &xmm_mask_hi,
+                           &xmm_src_lo, &xmm_src_hi);
+       pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
+                           &xmm_alpha_src_lo, &xmm_alpha_src_hi,
+                           &xmm_mask_lo, &xmm_mask_hi);
+
+       negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
+                     &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+       pix_add_multiply_2x128 (
+           &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
+           &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
+           &xmm_dst_lo, &xmm_dst_hi);
+
+       save_128_aligned (
+           (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+       ps += 4;
+       pd += 4;
+       pm += 4;
+       w -= 4;
     }
 
     while (w)
     {
-        s = *ps++;
-        m = *pm++;
-        d = *pd;
+       s = *ps++;
+       m = *pm++;
+       d = *pd;
 
-        *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
-        w--;
+       *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
+       w--;
     }
 }
 
 static force_inline uint32_t
-core_combine_xor_ca_pixel_sse2 (uint32_t src, uint32_t mask, uint32_t dst)
+core_combine_xor_ca_pixel_sse2 (uint32_t src,
+                                uint32_t mask,
+                                uint32_t dst)
 {
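+    /* component-alpha XOR:
+     * dest = src*mask*(1 - dest.alpha) + dest*(1 - mask*src.alpha) */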
     __m64 a = unpack_32_1x64 (mask);
     __m64 s = unpack_32_1x64 (src);
     __m64 d = unpack_32_1x64 (dst);
 
-    __m64 alpha_dst = negate_1x64 (pix_multiply_1x64 (a, expand_alpha_1x64 (s)));
+    __m64 alpha_dst = negate_1x64 (pix_multiply_1x64 (
+                                      a, expand_alpha_1x64 (s)));
     __m64 dest      = pix_multiply_1x64 (s, a);
     __m64 alpha_src = negate_1x64 (expand_alpha_1x64 (d));
 
     return pack_1x64_32 (pix_add_multiply_1x64 (&d,
-                                              &alpha_dst,
-                                              &dest,
-                                              &alpha_src));
+                                                &alpha_dst,
+                                                &dest,
+                                                &alpha_src));
 }
 
 static force_inline void
-core_combine_xor_ca_sse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w)
+core_combine_xor_ca_sse2 (uint32_t *      pd,
+                          const uint32_t *ps,
+                          const uint32_t *pm,
+                          int             w)
 {
     uint32_t s, m, d;
 
@@ -2157,12 +2455,12 @@ core_combine_xor_ca_sse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm,
 
     while (w && (unsigned long)pd & 15)
     {
-        s = *ps++;
-        m = *pm++;
-        d = *pd;
+       s = *ps++;
+       m = *pm++;
+       d = *pd;
 
-        *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
-        w--;
+       *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
+       w--;
     }
 
     /* call prefetch hint to optimize cache load*/
@@ -2172,53 +2470,66 @@ core_combine_xor_ca_sse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm,
 
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
-
-        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
-        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
-        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
-
-        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
-        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
-        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi);
-        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
-
-        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_mask_lo, &xmm_mask_hi, &xmm_src_lo, &xmm_src_hi);
-        pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi, &xmm_mask_lo, &xmm_mask_hi);
-
-        negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
-        negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
-        pix_add_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
-                              &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
-                              &xmm_dst_lo, &xmm_dst_hi);
-
-        save_128_aligned( (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+       /* fill cache line with next memory */
+       cache_prefetch_next ((__m128i*)ps);
+       cache_prefetch_next ((__m128i*)pd);
+       cache_prefetch_next ((__m128i*)pm);
 
-        ps += 4;
-        pd += 4;
-        pm += 4;
-        w -= 4;
+       xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+       xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+       xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+       unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+       unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+       unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+       expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+                           &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+       expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+                           &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+       pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+                           &xmm_mask_lo, &xmm_mask_hi,
+                           &xmm_src_lo, &xmm_src_hi);
+       pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
+                           &xmm_alpha_src_lo, &xmm_alpha_src_hi,
+                           &xmm_mask_lo, &xmm_mask_hi);
+
+       negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
+                     &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+       negate_2x128 (xmm_mask_lo, xmm_mask_hi,
+                     &xmm_mask_lo, &xmm_mask_hi);
+
+       pix_add_multiply_2x128 (
+           &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
+           &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
+           &xmm_dst_lo, &xmm_dst_hi);
+
+       save_128_aligned (
+           (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+       ps += 4;
+       pd += 4;
+       pm += 4;
+       w -= 4;
     }
 
     while (w)
     {
-        s = *ps++;
-        m = *pm++;
-        d = *pd;
+       s = *ps++;
+       m = *pm++;
+       d = *pd;
 
-        *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
-        w--;
+       *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
+       w--;
     }
 }
 
 static force_inline void
-core_combine_add_ca_sse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w)
+core_combine_add_ca_sse2 (uint32_t *      pd,
+                          const uint32_t *ps,
+                          const uint32_t *pm,
+                          int             w)
 {
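+    /* component-alpha ADD: dest = clamp (src*mask + dest) */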
     uint32_t s, m, d;
 
@@ -2233,14 +2544,15 @@ core_combine_add_ca_sse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm,
 
     while (w && (unsigned long)pd & 15)
     {
-        s = *ps++;
-        m = *pm++;
-        d = *pd;
-
-        *pd++ = pack_1x64_32 (_mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
-                                                              unpack_32_1x64 (m)),
-                                            unpack_32_1x64 (d)));
-        w--;
+       s = *ps++;
+       m = *pm++;
+       d = *pd;
+
+       *pd++ = pack_1x64_32 (
+           _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
+                                            unpack_32_1x64 (m)),
+                         unpack_32_1x64 (d)));
+       w--;
     }
 
     /* call prefetch hint to optimize cache load*/
@@ -2250,44 +2562,49 @@ core_combine_add_ca_sse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm,
 
     while (w >= 4)
     {
-        /* fill cache line with next memory */
-        cache_prefetch_next ((__m128i*)ps);
-        cache_prefetch_next ((__m128i*)pd);
-        cache_prefetch_next ((__m128i*)pm);
+       /* fill cache line with next memory */
+       cache_prefetch_next ((__m128i*)ps);
+       cache_prefetch_next ((__m128i*)pd);
+       cache_prefetch_next ((__m128i*)pm);
 
-        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
-        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
-        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+       xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+       xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+       xmm_dst_hi = load_128_aligned ((__m128i*)pd);
 
-        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
-        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+       unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+       unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+       unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
 
-        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_mask_lo, &xmm_mask_hi, &xmm_src_lo, &xmm_src_hi);
+       pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+                           &xmm_mask_lo, &xmm_mask_hi,
+                           &xmm_src_lo, &xmm_src_hi);
 
-        save_128_aligned( (__m128i*)pd, pack_2x128_128 (_mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
-                                                      _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
+       save_128_aligned (
+           (__m128i*)pd, pack_2x128_128 (
+               _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
+               _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
 
-        ps += 4;
-        pd += 4;
-        pm += 4;
-        w -= 4;
+       ps += 4;
+       pd += 4;
+       pm += 4;
+       w -= 4;
     }
 
     while (w)
     {
-        s = *ps++;
-        m = *pm++;
-        d = *pd;
-
-        *pd++ = pack_1x64_32 (_mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
-                                                              unpack_32_1x64 (m)),
-                                            unpack_32_1x64 (d)));
-        w--;
+       s = *ps++;
+       m = *pm++;
+       d = *pd;
+
+       *pd++ = pack_1x64_32 (
+           _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
+                                            unpack_32_1x64 (m)),
+                         unpack_32_1x64 (d)));
+       w--;
     }
 }
 
-/* -------------------------------------------------------------------------------------------------
+/* ---------------------------------------------------
 * fb_compose_setup_SSE2
  */
 static force_inline __m64
@@ -2303,13 +2620,15 @@ create_mask_16_128 (uint16_t mask)
 }
 
 static force_inline __m64
-create_mask_2x32_64 (uint32_t mask0, uint32_t mask1)
+create_mask_2x32_64 (uint32_t mask0,
+                     uint32_t mask1)
 {
     return _mm_set_pi32 (mask0, mask1);
 }
 
 static force_inline __m128i
-create_mask_2x32_128 (uint32_t mask0, uint32_t mask1)
+create_mask_2x32_128 (uint32_t mask0,
+                      uint32_t mask1)
 {
     return _mm_set_epi32 (mask0, mask1, mask0, mask1);
 }
@@ -2317,382 +2636,484 @@ create_mask_2x32_128 (uint32_t mask0, uint32_t mask1)
 /* SSE2 code patch for fbcompose.c */
 
 static void
-sse2_combine_over_u (pixman_implementation_t *imp, pixman_op_t op,
-                 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
+sse2_combine_over_u (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               dst,
+                     const uint32_t *         src,
+                     const uint32_t *         mask,
+                     int                      width)
 {
     core_combine_over_u_sse2 (dst, src, mask, width);
-    _mm_empty();
+    _mm_empty ();
 }
 
 static void
-sse2_combine_over_reverse_u (pixman_implementation_t *imp, pixman_op_t op,
-                        uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
+sse2_combine_over_reverse_u (pixman_implementation_t *imp,
+                             pixman_op_t              op,
+                             uint32_t *               dst,
+                             const uint32_t *         src,
+                             const uint32_t *         mask,
+                             int                      width)
 {
     core_combine_over_reverse_u_sse2 (dst, src, mask, width);
-    _mm_empty();
+    _mm_empty ();
 }
 
 static void
-sse2_combine_in_u (pixman_implementation_t *imp, pixman_op_t op,
-               uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
+sse2_combine_in_u (pixman_implementation_t *imp,
+                   pixman_op_t              op,
+                   uint32_t *               dst,
+                   const uint32_t *         src,
+                   const uint32_t *         mask,
+                   int                      width)
 {
     core_combine_in_u_sse2 (dst, src, mask, width);
-    _mm_empty();
+    _mm_empty ();
 }
 
 static void
-sse2_combine_in_reverse_u (pixman_implementation_t *imp, pixman_op_t op,
-                      uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
+sse2_combine_in_reverse_u (pixman_implementation_t *imp,
+                           pixman_op_t              op,
+                           uint32_t *               dst,
+                           const uint32_t *         src,
+                           const uint32_t *         mask,
+                           int                      width)
 {
     core_combine_reverse_in_u_sse2 (dst, src, mask, width);
-    _mm_empty();
+    _mm_empty ();
 }
 
 static void
-sse2_combine_out_u (pixman_implementation_t *imp, pixman_op_t op,
-                uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
+sse2_combine_out_u (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dst,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
 {
     core_combine_out_u_sse2 (dst, src, mask, width);
-    _mm_empty();
+    _mm_empty ();
 }
 
 static void
-sse2_combine_out_reverse_u (pixman_implementation_t *imp, pixman_op_t op,
-                       uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
+sse2_combine_out_reverse_u (pixman_implementation_t *imp,
+                            pixman_op_t              op,
+                            uint32_t *               dst,
+                            const uint32_t *         src,
+                            const uint32_t *         mask,
+                            int                      width)
 {
     core_combine_reverse_out_u_sse2 (dst, src, mask, width);
-    _mm_empty();
+    _mm_empty ();
 }
 
 static void
-sse2_combine_atop_u (pixman_implementation_t *imp, pixman_op_t op,
-                 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
+sse2_combine_atop_u (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               dst,
+                     const uint32_t *         src,
+                     const uint32_t *         mask,
+                     int                      width)
 {
     core_combine_atop_u_sse2 (dst, src, mask, width);
-    _mm_empty();
+    _mm_empty ();
 }
 
 static void
-sse2_combine_atop_reverse_u (pixman_implementation_t *imp, pixman_op_t op,
-                        uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
+sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
+                             pixman_op_t              op,
+                             uint32_t *               dst,
+                             const uint32_t *         src,
+                             const uint32_t *         mask,
+                             int                      width)
 {
     core_combine_reverse_atop_u_sse2 (dst, src, mask, width);
-    _mm_empty();
+    _mm_empty ();
 }
 
 static void
-sse2_combine_xor_u (pixman_implementation_t *imp, pixman_op_t op,
-                uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
+sse2_combine_xor_u (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dst,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
 {
     core_combine_xor_u_sse2 (dst, src, mask, width);
-    _mm_empty();
+    _mm_empty ();
 }
 
 static void
-sse2_combine_add_u (pixman_implementation_t *imp, pixman_op_t op,
-                uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
+sse2_combine_add_u (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dst,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
 {
     core_combine_add_u_sse2 (dst, src, mask, width);
-    _mm_empty();
+    _mm_empty ();
 }
 
 static void
-sse2_combine_saturate_u (pixman_implementation_t *imp, pixman_op_t op,
-                     uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
+sse2_combine_saturate_u (pixman_implementation_t *imp,
+                         pixman_op_t              op,
+                         uint32_t *               dst,
+                         const uint32_t *         src,
+                         const uint32_t *         mask,
+                         int                      width)
 {
     core_combine_saturate_u_sse2 (dst, src, mask, width);
-    _mm_empty();
+    _mm_empty ();
 }
 
 static void
-sse2_combine_src_ca (pixman_implementation_t *imp, pixman_op_t op,
-                uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
+sse2_combine_src_ca (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               dst,
+                     const uint32_t *         src,
+                     const uint32_t *         mask,
+                     int                      width)
 {
     core_combine_src_ca_sse2 (dst, src, mask, width);
-    _mm_empty();
+    _mm_empty ();
 }
 
 static void
-sse2_combine_over_ca (pixman_implementation_t *imp, pixman_op_t op,
-                 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
+sse2_combine_over_ca (pixman_implementation_t *imp,
+                      pixman_op_t              op,
+                      uint32_t *               dst,
+                      const uint32_t *         src,
+                      const uint32_t *         mask,
+                      int                      width)
 {
     core_combine_over_ca_sse2 (dst, src, mask, width);
-    _mm_empty();
+    _mm_empty ();
 }
 
 static void
-sse2_combine_over_reverse_ca (pixman_implementation_t *imp, pixman_op_t op,
-                        uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
+sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
+                              pixman_op_t              op,
+                              uint32_t *               dst,
+                              const uint32_t *         src,
+                              const uint32_t *         mask,
+                              int                      width)
 {
     core_combine_over_reverse_ca_sse2 (dst, src, mask, width);
-    _mm_empty();
+    _mm_empty ();
 }
 
 static void
-sse2_combine_in_ca (pixman_implementation_t *imp, pixman_op_t op,
-               uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
+sse2_combine_in_ca (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dst,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
 {
     core_combine_in_ca_sse2 (dst, src, mask, width);
-    _mm_empty();
+    _mm_empty ();
 }
 
 static void
-sse2_combine_in_reverse_ca (pixman_implementation_t *imp, pixman_op_t op,
-                      uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
+sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
+                            pixman_op_t              op,
+                            uint32_t *               dst,
+                            const uint32_t *         src,
+                            const uint32_t *         mask,
+                            int                      width)
 {
     core_combine_in_reverse_ca_sse2 (dst, src, mask, width);
-    _mm_empty();
+    _mm_empty ();
 }
 
 static void
-sse2_combine_out_ca (pixman_implementation_t *imp, pixman_op_t op,
-                uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
+sse2_combine_out_ca (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               dst,
+                     const uint32_t *         src,
+                     const uint32_t *         mask,
+                     int                      width)
 {
     core_combine_out_ca_sse2 (dst, src, mask, width);
-    _mm_empty();
+    _mm_empty ();
 }
 
 static void
-sse2_combine_out_reverse_ca (pixman_implementation_t *imp, pixman_op_t op,
-                       uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
+sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
+                             pixman_op_t              op,
+                             uint32_t *               dst,
+                             const uint32_t *         src,
+                             const uint32_t *         mask,
+                             int                      width)
 {
     core_combine_out_reverse_ca_sse2 (dst, src, mask, width);
-    _mm_empty();
+    _mm_empty ();
 }
 
 static void
-sse2_combine_atop_ca (pixman_implementation_t *imp, pixman_op_t op,
-                 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
+sse2_combine_atop_ca (pixman_implementation_t *imp,
+                      pixman_op_t              op,
+                      uint32_t *               dst,
+                      const uint32_t *         src,
+                      const uint32_t *         mask,
+                      int                      width)
 {
     core_combine_atop_ca_sse2 (dst, src, mask, width);
-    _mm_empty();
+    _mm_empty ();
 }
 
 static void
-sse2_combine_atop_reverse_ca (pixman_implementation_t *imp, pixman_op_t op,
-                        uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
+sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
+                              pixman_op_t              op,
+                              uint32_t *               dst,
+                              const uint32_t *         src,
+                              const uint32_t *         mask,
+                              int                      width)
 {
     core_combine_reverse_atop_ca_sse2 (dst, src, mask, width);
-    _mm_empty();
+    _mm_empty ();
 }
 
 static void
-sse2_combine_xor_ca (pixman_implementation_t *imp, pixman_op_t op,
-                uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
+sse2_combine_xor_ca (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               dst,
+                     const uint32_t *         src,
+                     const uint32_t *         mask,
+                     int                      width)
 {
     core_combine_xor_ca_sse2 (dst, src, mask, width);
-    _mm_empty();
+    _mm_empty ();
 }
 
 static void
-sse2_combine_add_ca (pixman_implementation_t *imp, pixman_op_t op,
-                uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
+sse2_combine_add_ca (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               dst,
+                     const uint32_t *         src,
+                     const uint32_t *         mask,
+                     int                      width)
 {
     core_combine_add_ca_sse2 (dst, src, mask, width);
-    _mm_empty();
+    _mm_empty ();
 }
 
-/* -------------------------------------------------------------------------------------------------
- * fast_composite_over_n_8888
+/* -------------------------------------------------------------------
+ * composite_over_n_8888
  */
 
 static void
 sse2_composite_over_n_8888 (pixman_implementation_t *imp,
-                            pixman_op_t op,
-                           pixman_image_t * src_image,
-                           pixman_image_t * mask_image,
-                           pixman_image_t * dst_image,
-                           int32_t     src_x,
-                           int32_t     src_y,
-                           int32_t     mask_x,
-                           int32_t     mask_y,
-                           int32_t     dest_x,
-                           int32_t     dest_y,
-                           int32_t     width,
-                           int32_t     height)
-{
-    uint32_t   src;
-    uint32_t   *dst_line, *dst, d;
-    uint16_t   w;
-    int        dst_stride;
+                            pixman_op_t              op,
+                            pixman_image_t *         src_image,
+                            pixman_image_t *         mask_image,
+                            pixman_image_t *         dst_image,
+                            int32_t                  src_x,
+                            int32_t                  src_y,
+                            int32_t                  mask_x,
+                            int32_t                  mask_y,
+                            int32_t                  dest_x,
+                            int32_t                  dest_y,
+                            int32_t                  width,
+                            int32_t                  height)
+{
+    uint32_t src;
+    uint32_t    *dst_line, *dst, d;
+    uint16_t w;
+    int dst_stride;
     __m128i xmm_src, xmm_alpha;
     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
 
-    src = _pixman_image_get_solid(src_image, dst_image->bits.format);
+    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
 
     if (src == 0)
        return;
 
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
 
     xmm_src = expand_pixel_32_1x128 (src);
     xmm_alpha = expand_alpha_1x128 (xmm_src);
 
     while (height--)
     {
-        dst = dst_line;
+       dst = dst_line;
 
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)dst);
+       /* call prefetch hint to optimize cache load*/
+       cache_prefetch ((__m128i*)dst);
 
-        dst_line += dst_stride;
-        w = width;
+       dst_line += dst_stride;
+       w = width;
 
-        while (w && (unsigned long)dst & 15)
-        {
-            d = *dst;
-            *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
-                                              _mm_movepi64_pi64 (xmm_alpha),
-                                              unpack_32_1x64 (d)));
-            w--;
-        }
+       while (w && (unsigned long)dst & 15)
+       {
+           d = *dst;
+           *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
+                                             _mm_movepi64_pi64 (xmm_alpha),
+                                             unpack_32_1x64 (d)));
+           w--;
+       }
 
-        cache_prefetch ((__m128i*)dst);
+       cache_prefetch ((__m128i*)dst);
 
-        while (w >= 4)
-        {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)dst);
+       while (w >= 4)
+       {
+           /* fill cache line with next memory */
+           cache_prefetch_next ((__m128i*)dst);
 
-            xmm_dst = load_128_aligned ((__m128i*)dst);
+           xmm_dst = load_128_aligned ((__m128i*)dst);
 
-            unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+           unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
 
-            over_2x128 (&xmm_src, &xmm_src, &xmm_alpha, &xmm_alpha, &xmm_dst_lo, &xmm_dst_hi);
+           over_2x128 (&xmm_src, &xmm_src,
+                       &xmm_alpha, &xmm_alpha,
+                       &xmm_dst_lo, &xmm_dst_hi);
 
-            /* rebuid the 4 pixel data and save*/
-            save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+           /* rebuild the 4 pixel data and save */
+           save_128_aligned (
+               (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
 
-            w -= 4;
-            dst += 4;
-        }
+           w -= 4;
+           dst += 4;
+       }
 
-        while (w)
-        {
-            d = *dst;
-            *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
-                                              _mm_movepi64_pi64 (xmm_alpha),
-                                              unpack_32_1x64 (d)));
-            w--;
-        }
+       while (w)
+       {
+           d = *dst;
+           *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
+                                             _mm_movepi64_pi64 (xmm_alpha),
+                                             unpack_32_1x64 (d)));
+           w--;
+       }
 
     }
-    _mm_empty();
+    _mm_empty ();
 }
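
    The over_1x64 / over_2x128 helpers used in this loop implement the
    Porter-Duff OVER operator on premultiplied ARGB data, four pixels at a
    time in the 128-bit case.  A rough scalar sketch of the per-channel
    math follows; the names mul_un8 and over_pixel are illustrative, not
    pixman's actual helpers, and pixman's exact rounding is only
    approximated here.

    #include <stdint.h>

    /* Multiply two 8-bit values treated as fractions of 255, with
     * rounding; a stand-in for pixman's internal helper.
     */
    static uint8_t
    mul_un8 (uint8_t a, uint8_t b)
    {
        uint16_t t = a * b + 0x80;

        return (t + (t >> 8)) >> 8;
    }

    /* src OVER dst for one premultiplied ARGB32 pixel: each channel of
     * dst is scaled by (255 - source alpha) and the matching source
     * channel is added on top.  With valid premultiplied input the sum
     * cannot exceed 255; the SSE2 code uses saturating adds anyway.
     */
    static uint32_t
    over_pixel (uint32_t src, uint32_t dst)
    {
        uint8_t  ia = 255 - (uint8_t) (src >> 24);
        uint32_t result = 0;
        int      shift;

        for (shift = 0; shift < 32; shift += 8)
        {
            uint8_t s = (uint8_t) (src >> shift);
            uint8_t d = (uint8_t) (dst >> shift);

            result |= (uint32_t) (s + mul_un8 (d, ia)) << shift;
        }

        return result;
    }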
 
-/* -------------------------------------------------------------------------------------------------
- * fast_composite_over_n_0565
+/* ---------------------------------------------------------------------
+ * composite_over_n_0565
  */
 static void
 sse2_composite_over_n_0565 (pixman_implementation_t *imp,
-                            pixman_op_t op,
-                           pixman_image_t * src_image,
-                           pixman_image_t * mask_image,
-                           pixman_image_t * dst_image,
-                           int32_t     src_x,
-                           int32_t     src_y,
-                           int32_t     mask_x,
-                           int32_t     mask_y,
-                           int32_t     dest_x,
-                           int32_t     dest_y,
-                           int32_t     width,
-                           int32_t     height)
-{
-    uint32_t   src;
-    uint16_t   *dst_line, *dst, d;
-    uint16_t   w;
-    int                dst_stride;
+                            pixman_op_t              op,
+                            pixman_image_t *         src_image,
+                            pixman_image_t *         mask_image,
+                            pixman_image_t *         dst_image,
+                            int32_t                  src_x,
+                            int32_t                  src_y,
+                            int32_t                  mask_x,
+                            int32_t                  mask_y,
+                            int32_t                  dest_x,
+                            int32_t                  dest_y,
+                            int32_t                  width,
+                            int32_t                  height)
+{
+    uint32_t src;
+    uint16_t    *dst_line, *dst, d;
+    uint16_t w;
+    int dst_stride;
     __m128i xmm_src, xmm_alpha;
     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
 
-    src = _pixman_image_get_solid(src_image, dst_image->bits.format);
+    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
 
     if (src == 0)
-        return;
+       return;
 
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
 
     xmm_src = expand_pixel_32_1x128 (src);
     xmm_alpha = expand_alpha_1x128 (xmm_src);
 
     while (height--)
     {
-        dst = dst_line;
+       dst = dst_line;
 
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)dst);
+       /* call prefetch hint to optimize cache load*/
+       cache_prefetch ((__m128i*)dst);
 
-        dst_line += dst_stride;
-        w = width;
+       dst_line += dst_stride;
+       w = width;
 
-        while (w && (unsigned long)dst & 15)
-        {
-            d = *dst;
+       while (w && (unsigned long)dst & 15)
+       {
+           d = *dst;
 
-            *dst++ = pack_565_32_16 (pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
-                                                             _mm_movepi64_pi64 (xmm_alpha),
-                                                             expand565_16_1x64 (d))));
-            w--;
-        }
+           *dst++ = pack_565_32_16 (
+               pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
+                                        _mm_movepi64_pi64 (xmm_alpha),
+                                        expand565_16_1x64 (d))));
+           w--;
+       }
 
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)dst);
+       /* call prefetch hint to optimize cache load*/
+       cache_prefetch ((__m128i*)dst);
 
-        while (w >= 8)
-        {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)dst);
+       while (w >= 8)
+       {
+           /* fill cache line with next memory */
+           cache_prefetch_next ((__m128i*)dst);
 
            xmm_dst = load_128_aligned ((__m128i*)dst);
-           
-           unpack_565_128_4x128 (xmm_dst, &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
-           
-            over_2x128 (&xmm_src, &xmm_src, &xmm_alpha, &xmm_alpha, &xmm_dst0, &xmm_dst1);
-            over_2x128 (&xmm_src, &xmm_src, &xmm_alpha, &xmm_alpha, &xmm_dst2, &xmm_dst3);
-
-            xmm_dst = pack_565_4x128_128 (&xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
-            save_128_aligned ((__m128i*)dst, xmm_dst);
-
-            dst += 8;
-            w -= 8;
-        }
-
-        while (w--)
-        {
-            d = *dst;
-            *dst++ = pack_565_32_16 (pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
-                                                             _mm_movepi64_pi64 (xmm_alpha),
-                                                             expand565_16_1x64 (d))));
-        }
+
+           unpack_565_128_4x128 (xmm_dst,
+                                 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
+
+           over_2x128 (&xmm_src, &xmm_src,
+                       &xmm_alpha, &xmm_alpha,
+                       &xmm_dst0, &xmm_dst1);
+           over_2x128 (&xmm_src, &xmm_src,
+                       &xmm_alpha, &xmm_alpha,
+                       &xmm_dst2, &xmm_dst3);
+
+           xmm_dst = pack_565_4x128_128 (
+               &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
+
+           save_128_aligned ((__m128i*)dst, xmm_dst);
+
+           dst += 8;
+           w -= 8;
+       }
+
+       while (w--)
+       {
+           d = *dst;
+           *dst++ = pack_565_32_16 (
+               pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
+                                        _mm_movepi64_pi64 (xmm_alpha),
+                                        expand565_16_1x64 (d))));
+       }
     }
 
-    _mm_empty();
+    _mm_empty ();
 }
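
    Both solid-source loops above, and the ones that follow, share the
    same shape: a scalar head that advances the destination pointer to a
    16-byte boundary, a vector body that works on aligned __m128i blocks,
    and a scalar tail for the leftover pixels.  A stripped-down sketch of
    that structure, using a hypothetical fill helper rather than any
    function from this file:

    #include <emmintrin.h>
    #include <stdint.h>

    /* Hypothetical fill helper showing the head/body/tail structure. */
    static void
    fill32_sse2_sketch (uint32_t *dst, int w, uint32_t value)
    {
        __m128i xmm_value = _mm_set1_epi32 ((int) value);

        /* head: one pixel at a time until dst is 16-byte aligned */
        while (w && ((uintptr_t) dst & 15))
        {
            *dst++ = value;
            w--;
        }

        /* body: four pixels per iteration with aligned stores */
        while (w >= 4)
        {
            _mm_store_si128 ((__m128i *) dst, xmm_value);
            dst += 4;
            w -= 4;
        }

        /* tail: the remaining 0..3 pixels */
        while (w)
        {
            *dst++ = value;
            w--;
        }
    }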
 
-/* -------------------------------------------------------------------------------------------------
- * fast_composite_over_n_8888_8888_ca
+/* ---------------------------------------------------------------------------
+ * composite_over_n_8888_8888_ca
  */
 
 static void
 sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
-                                      pixman_op_t op,
-                                     pixman_image_t * src_image,
-                                     pixman_image_t * mask_image,
-                                     pixman_image_t * dst_image,
-                                     int32_t   src_x,
-                                     int32_t   src_y,
-                                     int32_t   mask_x,
-                                     int32_t   mask_y,
-                                     int32_t   dest_x,
-                                     int32_t   dest_y,
-                                     int32_t   width,
-                                     int32_t   height)
-{
-    uint32_t   src;
-    uint32_t   *dst_line, d;
-    uint32_t   *mask_line, m;
-    uint32_t    pack_cmp;
-    int        dst_stride, mask_stride;
+                                    pixman_op_t              op,
+                                    pixman_image_t *         src_image,
+                                    pixman_image_t *         mask_image,
+                                    pixman_image_t *         dst_image,
+                                    int32_t                  src_x,
+                                    int32_t                  src_y,
+                                    int32_t                  mask_x,
+                                    int32_t                  mask_y,
+                                    int32_t                  dest_x,
+                                    int32_t                  dest_y,
+                                    int32_t                  width,
+                                    int32_t                  height)
+{
+    uint32_t src;
+    uint32_t    *dst_line, d;
+    uint32_t    *mask_line, m;
+    uint32_t pack_cmp;
+    int dst_stride, mask_stride;
 
     __m128i xmm_src, xmm_alpha;
     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
@@ -2700,253 +3121,266 @@ sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
 
     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
 
-    src = _pixman_image_get_solid(src_image, dst_image->bits.format);
+    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
 
     if (src == 0)
        return;
 
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
 
-    xmm_src = _mm_unpacklo_epi8 (create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
+    xmm_src = _mm_unpacklo_epi8 (
+       create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
     xmm_alpha = expand_alpha_1x128 (xmm_src);
     mmx_src   = _mm_movepi64_pi64 (xmm_src);
     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
 
     while (height--)
     {
-        int w = width;
-        const uint32_t *pm = (uint32_t *)mask_line;
-        uint32_t *pd = (uint32_t *)dst_line;
-
-        dst_line += dst_stride;
-        mask_line += mask_stride;
-
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)pd);
-        cache_prefetch ((__m128i*)pm);
-
-        while (w && (unsigned long)pd & 15)
-        {
-            m = *pm++;
-
-            if (m)
-            {
-                d = *pd;
-                mmx_mask = unpack_32_1x64 (m);
-                mmx_dest = unpack_32_1x64 (d);
-
-                *pd = pack_1x64_32 (in_over_1x64 (&mmx_src,
-                                                 &mmx_alpha,
-                                                 &mmx_mask,
-                                                 &mmx_dest));
-            }
-
-            pd++;
-            w--;
-        }
-
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)pd);
-        cache_prefetch ((__m128i*)pm);
-
-        while (w >= 4)
-        {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)pd);
-            cache_prefetch_next ((__m128i*)pm);
-
-            xmm_mask = load_128_unaligned ((__m128i*)pm);
-
-            pack_cmp = _mm_movemask_epi8 (_mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128()));
-
-            /* if all bits in mask are zero, pack_cmp are equal to 0xffff */
-            if (pack_cmp != 0xffff)
-            {
-                xmm_dst = load_128_aligned ((__m128i*)pd);
-
-                unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
-                unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-
-                in_over_2x128 (&xmm_src, &xmm_src, &xmm_alpha, &xmm_alpha, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
-
-                save_128_aligned ((__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-            }
-
-            pd += 4;
-            pm += 4;
-            w -= 4;
-        }
-
-        while (w)
-        {
-            m = *pm++;
-
-            if (m)
-            {
-                d = *pd;
-                mmx_mask = unpack_32_1x64 (m);
-                mmx_dest = unpack_32_1x64 (d);
-
-                *pd = pack_1x64_32 (in_over_1x64 (&mmx_src,
-                                                 &mmx_alpha,
-                                                 &mmx_mask,
-                                                 &mmx_dest));
-            }
-
-            pd++;
-            w--;
-        }
+       int w = width;
+       const uint32_t *pm = (uint32_t *)mask_line;
+       uint32_t *pd = (uint32_t *)dst_line;
+
+       dst_line += dst_stride;
+       mask_line += mask_stride;
+
+       /* call prefetch hint to optimize cache load*/
+       cache_prefetch ((__m128i*)pd);
+       cache_prefetch ((__m128i*)pm);
+
+       while (w && (unsigned long)pd & 15)
+       {
+           m = *pm++;
+
+           if (m)
+           {
+               d = *pd;
+               mmx_mask = unpack_32_1x64 (m);
+               mmx_dest = unpack_32_1x64 (d);
+
+               *pd = pack_1x64_32 (in_over_1x64 (&mmx_src,
+                                                 &mmx_alpha,
+                                                 &mmx_mask,
+                                                 &mmx_dest));
+           }
+
+           pd++;
+           w--;
+       }
+
+       /* call prefetch hint to optimize cache load*/
+       cache_prefetch ((__m128i*)pd);
+       cache_prefetch ((__m128i*)pm);
+
+       while (w >= 4)
+       {
+           /* fill cache line with next memory */
+           cache_prefetch_next ((__m128i*)pd);
+           cache_prefetch_next ((__m128i*)pm);
+
+           xmm_mask = load_128_unaligned ((__m128i*)pm);
+
+           pack_cmp =
+               _mm_movemask_epi8 (
+                   _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
+
+           /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
+           if (pack_cmp != 0xffff)
+           {
+               xmm_dst = load_128_aligned ((__m128i*)pd);
+
+               unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+               unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+               in_over_2x128 (&xmm_src, &xmm_src,
+                              &xmm_alpha, &xmm_alpha,
+                              &xmm_mask_lo, &xmm_mask_hi,
+                              &xmm_dst_lo, &xmm_dst_hi);
+
+               save_128_aligned (
+                   (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+           }
+
+           pd += 4;
+           pm += 4;
+           w -= 4;
+       }
+
+       while (w)
+       {
+           m = *pm++;
+
+           if (m)
+           {
+               d = *pd;
+               mmx_mask = unpack_32_1x64 (m);
+               mmx_dest = unpack_32_1x64 (d);
+
+               *pd = pack_1x64_32 (
+                   in_over_1x64 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
+           }
+
+           pd++;
+           w--;
+       }
     }
 
-    _mm_empty();
+    _mm_empty ();
 }
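
    The pack_cmp test above is the usual SSE2 idiom for skipping a whole
    4-pixel block when its mask is entirely zero: _mm_cmpeq_epi32 turns
    every zero lane into all-ones, and _mm_movemask_epi8 gathers the top
    bit of each byte, so a value of 0xffff means all sixteen bytes
    matched.  Isolated into a hypothetical helper (not one that exists in
    pixman), the test might look like this:

    #include <emmintrin.h>
    #include <stdint.h>

    /* Hypothetical helper: nonzero iff all four 32-bit mask pixels
     * starting at pm are zero.
     */
    static int
    mask_block_is_zero (const uint32_t *pm)
    {
        __m128i m = _mm_loadu_si128 ((const __m128i *) pm);
        __m128i z = _mm_cmpeq_epi32 (m, _mm_setzero_si128 ());

        /* every byte compared equal to zero => movemask is 0xffff */
        return _mm_movemask_epi8 (z) == 0xffff;
    }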
 
-
-/* -------------------------------------------------------------------------------------------------
- * fast_composite_over_8888_n_8888
+/* --------------------------------------------------------------------
+ * composite_over_8888_n_8888
  */
 
 static void
 sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
-                               pixman_op_t op,
-                              pixman_image_t * src_image,
-                              pixman_image_t * mask_image,
-                              pixman_image_t * dst_image,
-                              int32_t  src_x,
-                              int32_t  src_y,
-                              int32_t      mask_x,
-                              int32_t      mask_y,
-                              int32_t      dest_x,
-                              int32_t      dest_y,
-                              int32_t     width,
-                              int32_t     height)
-{
-    uint32_t   *dst_line, *dst;
-    uint32_t   *src_line, *src;
-    uint32_t   mask;
-    uint16_t   w;
-    int        dst_stride, src_stride;
+                                 pixman_op_t              op,
+                                 pixman_image_t *         src_image,
+                                 pixman_image_t *         mask_image,
+                                 pixman_image_t *         dst_image,
+                                 int32_t                  src_x,
+                                 int32_t                  src_y,
+                                 int32_t                  mask_x,
+                                 int32_t                  mask_y,
+                                 int32_t                  dest_x,
+                                 int32_t                  dest_y,
+                                 int32_t                  width,
+                                 int32_t                  height)
+{
+    uint32_t    *dst_line, *dst;
+    uint32_t    *src_line, *src;
+    uint32_t mask;
+    uint16_t w;
+    int dst_stride, src_stride;
 
     __m128i xmm_mask;
     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_alpha_lo, xmm_alpha_hi;
 
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
     mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
 
     xmm_mask = create_mask_16_128 (mask >> 24);
 
     while (height--)
     {
-        dst = dst_line;
-        dst_line += dst_stride;
-        src = src_line;
-        src_line += src_stride;
-        w = width;
-
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)dst);
-        cache_prefetch ((__m128i*)src);
-
-        while (w && (unsigned long)dst & 15)
-        {
-            uint32_t s = *src++;
-            uint32_t d = *dst;
-
-            __m64 ms = unpack_32_1x64 (s);
-            __m64 alpha    = expand_alpha_1x64 (ms);
-            __m64 dest     = _mm_movepi64_pi64 (xmm_mask);
-            __m64 alpha_dst = unpack_32_1x64 (d);
-
-            *dst++ = pack_1x64_32 (in_over_1x64 (&ms,
-                                                &alpha,
-                                                &dest,
-                                                &alpha_dst));
-
-            w--;
-        }
-
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)dst);
-        cache_prefetch ((__m128i*)src);
-
-        while (w >= 4)
-        {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)dst);
-            cache_prefetch_next ((__m128i*)src);
-
-            xmm_src = load_128_unaligned ((__m128i*)src);
-            xmm_dst = load_128_aligned ((__m128i*)dst);
+       dst = dst_line;
+       dst_line += dst_stride;
+       src = src_line;
+       src_line += src_stride;
+       w = width;
+
+       /* call prefetch hint to optimize cache load*/
+       cache_prefetch ((__m128i*)dst);
+       cache_prefetch ((__m128i*)src);
+
+       while (w && (unsigned long)dst & 15)
+       {
+           uint32_t s = *src++;
+           uint32_t d = *dst;
+
+           __m64 ms = unpack_32_1x64 (s);
+           __m64 alpha    = expand_alpha_1x64 (ms);
+           __m64 dest     = _mm_movepi64_pi64 (xmm_mask);
+           __m64 alpha_dst = unpack_32_1x64 (d);
+
+           *dst++ = pack_1x64_32 (
+               in_over_1x64 (&ms, &alpha, &dest, &alpha_dst));
+
+           w--;
+       }
+
+       /* call prefetch hint to optimize cache load*/
+       cache_prefetch ((__m128i*)dst);
+       cache_prefetch ((__m128i*)src);
+
+       while (w >= 4)
+       {
+           /* fill cache line with next memory */
+           cache_prefetch_next ((__m128i*)dst);
+           cache_prefetch_next ((__m128i*)src);
+
+           xmm_src = load_128_unaligned ((__m128i*)src);
+           xmm_dst = load_128_aligned ((__m128i*)dst);
 
-            unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
-            unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-            expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
+           unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+           unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+           expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+                               &xmm_alpha_lo, &xmm_alpha_hi);
 
-            in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi, &xmm_mask, &xmm_mask, &xmm_dst_lo, &xmm_dst_hi);
+           in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
+                          &xmm_alpha_lo, &xmm_alpha_hi,
+                          &xmm_mask, &xmm_mask,
+                          &xmm_dst_lo, &xmm_dst_hi);
 
-            save_128_aligned( (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+           save_128_aligned (
+               (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
 
-            dst += 4;
-            src += 4;
-            w -= 4;
-        }
+           dst += 4;
+           src += 4;
+           w -= 4;
+       }
 
-        while (w)
-        {
-            uint32_t s = *src++;
-            uint32_t d = *dst;
+       while (w)
+       {
+           uint32_t s = *src++;
+           uint32_t d = *dst;
 
-            __m64 ms = unpack_32_1x64 (s);
-            __m64 alpha = expand_alpha_1x64 (ms);
-            __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
-            __m64 dest  = unpack_32_1x64 (d);
+           __m64 ms = unpack_32_1x64 (s);
+           __m64 alpha = expand_alpha_1x64 (ms);
+           __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
+           __m64 dest  = unpack_32_1x64 (d);
 
-            *dst++ = pack_1x64_32 (in_over_1x64 (&ms,
-                                                &alpha,
-                                                &mask,
-                                                &dest));
+           *dst++ = pack_1x64_32 (
+               in_over_1x64 (&ms, &alpha, &mask, &dest));
 
-            w--;
-        }
+           w--;
+       }
     }
 
-    _mm_empty();
+    _mm_empty ();
 }
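
    In this path the solid mask contributes only its alpha byte, which
    create_mask_16_128 (mask >> 24) broadcasts to every lane; in_over_1x64
    then scales both the source channels and the source alpha by that
    value before compositing over the destination.  A scalar sketch of the
    per-pixel computation, with illustrative names and only approximate
    rounding:

    #include <stdint.h>

    /* Rounded multiply of two 8-bit values as fractions of 255;
     * stand-in for pixman's internal helper.
     */
    static uint8_t
    mul_un8 (uint8_t a, uint8_t b)
    {
        uint16_t t = a * b + 0x80;

        return (t + (t >> 8)) >> 8;
    }

    /* (src IN m) OVER dst for one premultiplied ARGB32 pixel and a
     * uniform 8-bit mask value m: every source channel, including the
     * alpha used for the OVER step, is scaled by m first.
     */
    static uint32_t
    in_over_pixel (uint32_t src, uint8_t m, uint32_t dst)
    {
        uint8_t  a  = mul_un8 ((uint8_t) (src >> 24), m);
        uint8_t  ia = 255 - a;
        uint32_t result = 0;
        int      shift;

        for (shift = 0; shift < 32; shift += 8)
        {
            uint8_t s = mul_un8 ((uint8_t) (src >> shift), m);
            uint8_t d = (uint8_t) (dst >> shift);

            result |= (uint32_t) (s + mul_un8 (d, ia)) << shift;
        }

        return result;
    }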
 
-/* -------------------------------------------------------------------------------------------------
- * fast_composite_over_x888_n_8888
+/* ---------------------------------------------------------------------
+ * composite_over_x888_n_8888
  */
 static void
 sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
-                               pixman_op_t op,
-                              pixman_image_t * src_image,
-                              pixman_image_t * mask_image,
-                              pixman_image_t * dst_image,
-                              int32_t  src_x,
-                              int32_t  src_y,
-                              int32_t      mask_x,
-                              int32_t      mask_y,
-                              int32_t      dest_x,
-                              int32_t      dest_y,
-                              int32_t     width,
-                              int32_t     height)
-{
-    uint32_t   *dst_line, *dst;
-    uint32_t   *src_line, *src;
-    uint32_t   mask;
-    int        dst_stride, src_stride;
-    uint16_t   w;
+                                 pixman_op_t              op,
+                                 pixman_image_t *         src_image,
+                                 pixman_image_t *         mask_image,
+                                 pixman_image_t *         dst_image,
+                                 int32_t                  src_x,
+                                 int32_t                  src_y,
+                                 int32_t                  mask_x,
+                                 int32_t                  mask_y,
+                                 int32_t                  dest_x,
+                                 int32_t                  dest_y,
+                                 int32_t                  width,
+                                 int32_t                  height)
+{
+    uint32_t    *dst_line, *dst;
+    uint32_t    *src_line, *src;
+    uint32_t mask;
+    int dst_stride, src_stride;
+    uint16_t w;
 
     __m128i xmm_mask, xmm_alpha;
     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
 
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
     mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
 
     xmm_mask = create_mask_16_128 (mask >> 24);
@@ -2954,160 +3388,166 @@ sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
 
     while (height--)
     {
-        dst = dst_line;
-        dst_line += dst_stride;
-        src = src_line;
-        src_line += src_stride;
-        w = width;
-
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)dst);
-        cache_prefetch ((__m128i*)src);
-
-        while (w && (unsigned long)dst & 15)
-        {
-            uint32_t s = (*src++) | 0xff000000;
-            uint32_t d = *dst;
-
-            __m64 src   = unpack_32_1x64 (s);
-            __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
-            __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
-            __m64 dest  = unpack_32_1x64 (d);
-
-            *dst++ = pack_1x64_32 (in_over_1x64 (&src,
-                                                &alpha,
-                                                &mask,
-                                                &dest));
-
-            w--;
-        }
-
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)dst);
-        cache_prefetch ((__m128i*)src);
-
-        while (w >= 4)
-        {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)dst);
-            cache_prefetch_next ((__m128i*)src);
-
-            xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);
-            xmm_dst = load_128_aligned ((__m128i*)dst);
+       dst = dst_line;
+       dst_line += dst_stride;
+       src = src_line;
+       src_line += src_stride;
+       w = width;
+
+       /* call prefetch hint to optimize cache load*/
+       cache_prefetch ((__m128i*)dst);
+       cache_prefetch ((__m128i*)src);
+
+       while (w && (unsigned long)dst & 15)
+       {
+           uint32_t s = (*src++) | 0xff000000;
+           uint32_t d = *dst;
+
+           __m64 src   = unpack_32_1x64 (s);
+           __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
+           __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
+           __m64 dest  = unpack_32_1x64 (d);
+
+           *dst++ = pack_1x64_32 (
+               in_over_1x64 (&src, &alpha, &mask, &dest));
+
+           w--;
+       }
+
+       /* call prefetch hint to optimize cache load*/
+       cache_prefetch ((__m128i*)dst);
+       cache_prefetch ((__m128i*)src);
+
+       while (w >= 4)
+       {
+           /* fill cache line with next memory */
+           cache_prefetch_next ((__m128i*)dst);
+           cache_prefetch_next ((__m128i*)src);
+
+           xmm_src = _mm_or_si128 (
+               load_128_unaligned ((__m128i*)src), mask_ff000000);
+           xmm_dst = load_128_aligned ((__m128i*)dst);
 
-            unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
-            unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+           unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+           unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
 
-            in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha, &xmm_alpha, &xmm_mask, &xmm_mask, &xmm_dst_lo, &xmm_dst_hi);
+           in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
+                          &xmm_alpha, &xmm_alpha,
+                          &xmm_mask, &xmm_mask,
+                          &xmm_dst_lo, &xmm_dst_hi);
 
-            save_128_aligned( (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+           save_128_aligned (
+               (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
 
-            dst += 4;
-            src += 4;
-            w -= 4;
+           dst += 4;
+           src += 4;
+           w -= 4;
 
-        }
+       }
 
-        while (w)
-        {
-            uint32_t s = (*src++) | 0xff000000;
-            uint32_t d = *dst;
+       while (w)
+       {
+           uint32_t s = (*src++) | 0xff000000;
+           uint32_t d = *dst;
 
-            __m64 src  = unpack_32_1x64 (s);
-            __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
-            __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
-            __m64 dest  = unpack_32_1x64 (d);
+           __m64 src  = unpack_32_1x64 (s);
+           __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
+           __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
+           __m64 dest  = unpack_32_1x64 (d);
 
-            *dst++ = pack_1x64_32 (in_over_1x64 (&src,
-                                                &alpha,
-                                                &mask,
-                                                &dest));
+           *dst++ = pack_1x64_32 (
+               in_over_1x64 (&src, &alpha, &mask, &dest));
 
-            w--;
-        }
+           w--;
+       }
     }
 
-    _mm_empty();
+    _mm_empty ();
 }
 
-/* -------------------------------------------------------------------------------------------------
- * fast_composite_over_8888_8888
+/* --------------------------------------------------------------------
+ * composite_over_8888_8888
  */
 static void
 sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
-                             pixman_op_t op,
-                            pixman_image_t * src_image,
-                            pixman_image_t * mask_image,
-                            pixman_image_t * dst_image,
-                            int32_t    src_x,
-                            int32_t    src_y,
-                            int32_t      mask_x,
-                            int32_t      mask_y,
-                            int32_t      dest_x,
-                            int32_t      dest_y,
-                            int32_t     width,
-                            int32_t     height)
-{
-    int                dst_stride, src_stride;
-    uint32_t   *dst_line, *dst;
-    uint32_t   *src_line, *src;
-
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+                               pixman_op_t              op,
+                               pixman_image_t *         src_image,
+                               pixman_image_t *         mask_image,
+                               pixman_image_t *         dst_image,
+                               int32_t                  src_x,
+                               int32_t                  src_y,
+                               int32_t                  mask_x,
+                               int32_t                  mask_y,
+                               int32_t                  dest_x,
+                               int32_t                  dest_y,
+                               int32_t                  width,
+                               int32_t                  height)
+{
+    int dst_stride, src_stride;
+    uint32_t    *dst_line, *dst;
+    uint32_t    *src_line, *src;
+
+    PIXMAN_IMAGE_GET_LINE (
+       dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
 
     dst = dst_line;
     src = src_line;
 
     while (height--)
     {
-        core_combine_over_u_sse2 (dst, src, NULL, width);
+       core_combine_over_u_sse2 (dst, src, NULL, width);
 
-        dst += dst_stride;
-        src += src_stride;
+       dst += dst_stride;
+       src += src_stride;
     }
-    _mm_empty();
+    _mm_empty ();
 }
 
-/* -------------------------------------------------------------------------------------------------
- * fast_composite_over_8888_0565
+/* ------------------------------------------------------------------
+ * composite_over_8888_0565
  */
 static force_inline uint16_t
-fast_composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
+composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
 {
-    __m64       ms;
+    __m64 ms;
 
     ms = unpack_32_1x64 (src);
-    return pack_565_32_16( pack_1x64_32 (over_1x64 (ms,
-                                                   expand_alpha_1x64 (ms),
-                                                   expand565_16_1x64 (dst))));
+    return pack_565_32_16 (
+       pack_1x64_32 (
+           over_1x64 (
+               ms, expand_alpha_1x64 (ms), expand565_16_1x64 (dst))));
 }
 
 static void
 sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
-                             pixman_op_t op,
-                            pixman_image_t * src_image,
-                            pixman_image_t * mask_image,
-                            pixman_image_t * dst_image,
-                            int32_t      src_x,
-                            int32_t      src_y,
-                            int32_t      mask_x,
-                            int32_t      mask_y,
-                            int32_t      dest_x,
-                            int32_t      dest_y,
-                            int32_t     width,
-                            int32_t     height)
-{
-    uint16_t   *dst_line, *dst, d;
-    uint32_t   *src_line, *src, s;
-    int        dst_stride, src_stride;
-    uint16_t   w;
+                               pixman_op_t              op,
+                               pixman_image_t *         src_image,
+                               pixman_image_t *         mask_image,
+                               pixman_image_t *         dst_image,
+                               int32_t                  src_x,
+                               int32_t                  src_y,
+                               int32_t                  mask_x,
+                               int32_t                  mask_y,
+                               int32_t                  dest_x,
+                               int32_t                  dest_y,
+                               int32_t                  width,
+                               int32_t                  height)
+{
+    uint16_t    *dst_line, *dst, d;
+    uint32_t    *src_line, *src, s;
+    int dst_stride, src_stride;
+    uint16_t w;
 
     __m128i xmm_alpha_lo, xmm_alpha_hi;
     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
 
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
 
 #if 0
     /* FIXME
@@ -3120,102 +3560,115 @@ sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
 
     while (height--)
     {
-        dst = dst_line;
-        src = src_line;
-
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)src);
-        cache_prefetch ((__m128i*)dst);
-
-        dst_line += dst_stride;
-        src_line += src_stride;
-        w = width;
-
-        /* Align dst on a 16-byte boundary */
-        while (w &&
-               ((unsigned long)dst & 15))
-        {
-            s = *src++;
-            d = *dst;
-
-            *dst++ = fast_composite_over_8888_0565pixel (s, d);
-            w--;
-        }
-
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)src);
-        cache_prefetch ((__m128i*)dst);
-
-        /* It's a 8 pixel loop */
-        while (w >= 8)
-        {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)src);
-            cache_prefetch_next ((__m128i*)dst);
-
-            /* I'm loading unaligned because I'm not sure about the address alignment. */
-            xmm_src = load_128_unaligned ((__m128i*) src);
-            xmm_dst = load_128_aligned ((__m128i*) dst);
-
-            /* Unpacking */
-            unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
-            unpack_565_128_4x128 (xmm_dst, &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
-            expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
-
-            /* I'm loading next 4 pixels from memory before to optimze the memory read. */
-            xmm_src = load_128_unaligned ((__m128i*) (src+4));
-
-            over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi, &xmm_dst0, &xmm_dst1);
-
-            /* Unpacking */
-            unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
-            expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
-
-            over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi, &xmm_dst2, &xmm_dst3);
-
-            save_128_aligned ((__m128i*)dst, pack_565_4x128_128 (&xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
-
-            w -= 8;
-            dst += 8;
-            src += 8;
-        }
-
-        while (w--)
-        {
-            s = *src++;
-            d = *dst;
-
-            *dst++ = fast_composite_over_8888_0565pixel (s, d);
-        }
+       dst = dst_line;
+       src = src_line;
+
+       /* call prefetch hint to optimize cache load*/
+       cache_prefetch ((__m128i*)src);
+       cache_prefetch ((__m128i*)dst);
+
+       dst_line += dst_stride;
+       src_line += src_stride;
+       w = width;
+
+       /* Align dst on a 16-byte boundary */
+       while (w &&
+              ((unsigned long)dst & 15))
+       {
+           s = *src++;
+           d = *dst;
+
+           *dst++ = composite_over_8888_0565pixel (s, d);
+           w--;
+       }
+
+       /* call prefetch hint to optimize cache load*/
+       cache_prefetch ((__m128i*)src);
+       cache_prefetch ((__m128i*)dst);
+
+       /* It's an 8 pixel loop */
+       while (w >= 8)
+       {
+           /* fill cache line with next memory */
+           cache_prefetch_next ((__m128i*)src);
+           cache_prefetch_next ((__m128i*)dst);
+
+           /* I'm loading unaligned because I'm not sure
+            * about the address alignment.
+            */
+           xmm_src = load_128_unaligned ((__m128i*) src);
+           xmm_dst = load_128_aligned ((__m128i*) dst);
+
+           /* Unpacking */
+           unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+           unpack_565_128_4x128 (xmm_dst,
+                                 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
+           expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+                               &xmm_alpha_lo, &xmm_alpha_hi);
+
+           /* I'm loading the next 4 pixels from memory
+            * early, to optimize the memory read.
+            */
+           xmm_src = load_128_unaligned ((__m128i*) (src + 4));
+
+           over_2x128 (&xmm_src_lo, &xmm_src_hi,
+                       &xmm_alpha_lo, &xmm_alpha_hi,
+                       &xmm_dst0, &xmm_dst1);
+
+           /* Unpacking */
+           unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+           expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+                               &xmm_alpha_lo, &xmm_alpha_hi);
+
+           over_2x128 (&xmm_src_lo, &xmm_src_hi,
+                       &xmm_alpha_lo, &xmm_alpha_hi,
+                       &xmm_dst2, &xmm_dst3);
+
+           save_128_aligned (
+               (__m128i*)dst, pack_565_4x128_128 (
+                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
+
+           w -= 8;
+           dst += 8;
+           src += 8;
+       }
+
+       while (w--)
+       {
+           s = *src++;
+           d = *dst;
+
+           *dst++ = composite_over_8888_0565pixel (s, d);
+       }
     }
 
-    _mm_empty();
+    _mm_empty ();
 }
 
-/* -------------------------------------------------------------------------------------------------
- * fast_composite_over_n_8_8888
+/* -----------------------------------------------------------------
+ * composite_over_n_8_8888
  */
 
 static void
 sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
-                                  pixman_op_t op,
-                                 pixman_image_t * src_image,
-                                 pixman_image_t * mask_image,
-                                 pixman_image_t * dst_image,
-                                 int32_t      src_x,
-                                 int32_t      src_y,
-                                 int32_t      mask_x,
-                                 int32_t      mask_y,
-                                 int32_t      dest_x,
-                                 int32_t      dest_y,
-                                 int32_t     width,
-                                 int32_t     height)
-{
-    uint32_t   src, srca;
-    uint32_t   *dst_line, *dst;
-    uint8_t    *mask_line, *mask;
-    int        dst_stride, mask_stride;
-    uint16_t   w;
+                              pixman_op_t              op,
+                              pixman_image_t *         src_image,
+                              pixman_image_t *         mask_image,
+                              pixman_image_t *         dst_image,
+                              int32_t                  src_x,
+                              int32_t                  src_y,
+                              int32_t                  mask_x,
+                              int32_t                  mask_y,
+                              int32_t                  dest_x,
+                              int32_t                  dest_y,
+                              int32_t                  width,
+                              int32_t                  height)
+{
+    uint32_t src, srca;
+    uint32_t *dst_line, *dst;
+    uint8_t *mask_line, *mask;
+    int dst_stride, mask_stride;
+    uint16_t w;
     uint32_t m, d;
 
     __m128i xmm_src, xmm_alpha, xmm_def;
@@ -3224,14 +3677,16 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
 
     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
 
-    src = _pixman_image_get_solid(src_image, dst_image->bits.format);
+    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
 
     srca = src >> 24;
     if (src == 0)
        return;
 
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
 
     xmm_def = create_mask_2x32_128 (src, src);
     xmm_src = expand_pixel_32_1x128 (src);
@@ -3241,114 +3696,119 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
 
     while (height--)
     {
-        dst = dst_line;
-        dst_line += dst_stride;
-        mask = mask_line;
-        mask_line += mask_stride;
-        w = width;
-
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)mask);
-        cache_prefetch ((__m128i*)dst);
-
-        while (w && (unsigned long)dst & 15)
-        {
-            uint8_t m = *mask++;
-
-            if (m)
-            {
-                d = *dst;
-                mmx_mask = expand_pixel_8_1x64 (m);
-                mmx_dest = unpack_32_1x64 (d);
-
-                *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
-                                                  &mmx_alpha,
-                                                  &mmx_mask,
-                                                  &mmx_dest));
-            }
-
-            w--;
-            dst++;
-        }
-
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)mask);
-        cache_prefetch ((__m128i*)dst);
-
-        while (w >= 4)
-        {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)mask);
-            cache_prefetch_next ((__m128i*)dst);
-
-            m = *((uint32_t*)mask);
-
-            if (srca == 0xff && m == 0xffffffff)
-            {
-                save_128_aligned ((__m128i*)dst, xmm_def);
-            }
-            else if (m)
-            {
-                xmm_dst = load_128_aligned ((__m128i*) dst);
-                xmm_mask = unpack_32_1x128 (m);
-                xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128());
-
-                /* Unpacking */
-                unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-                unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
-
-                expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
-                in_over_2x128 (&xmm_src, &xmm_src, &xmm_alpha, &xmm_alpha, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
-
-                save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-            }
-
-            w -= 4;
-            dst += 4;
-            mask += 4;
-        }
-
-        while (w)
-        {
-            uint8_t m = *mask++;
-
-            if (m)
-            {
-                d = *dst;
-                mmx_mask = expand_pixel_8_1x64 (m);
-                mmx_dest = unpack_32_1x64 (d);
-
-                *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
-                                                  &mmx_alpha,
-                                                  &mmx_mask,
-                                                  &mmx_dest));
-            }
-
-            w--;
-            dst++;
-        }
+       dst = dst_line;
+       dst_line += dst_stride;
+       mask = mask_line;
+       mask_line += mask_stride;
+       w = width;
+
+       /* call prefetch hint to optimize cache load*/
+       cache_prefetch ((__m128i*)mask);
+       cache_prefetch ((__m128i*)dst);
+
+       while (w && (unsigned long)dst & 15)
+       {
+           uint8_t m = *mask++;
+
+           if (m)
+           {
+               d = *dst;
+               mmx_mask = expand_pixel_8_1x64 (m);
+               mmx_dest = unpack_32_1x64 (d);
+
+               *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
+                                                  &mmx_alpha,
+                                                  &mmx_mask,
+                                                  &mmx_dest));
+           }
+
+           w--;
+           dst++;
+       }
+
+       /* call prefetch hint to optimize cache load*/
+       cache_prefetch ((__m128i*)mask);
+       cache_prefetch ((__m128i*)dst);
+
+       while (w >= 4)
+       {
+           /* fill cache line with next memory */
+           cache_prefetch_next ((__m128i*)mask);
+           cache_prefetch_next ((__m128i*)dst);
+
+           m = *((uint32_t*)mask);
+
+           if (srca == 0xff && m == 0xffffffff)
+           {
+               save_128_aligned ((__m128i*)dst, xmm_def);
+           }
+           else if (m)
+           {
+               xmm_dst = load_128_aligned ((__m128i*) dst);
+               xmm_mask = unpack_32_1x128 (m);
+               xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
+
+               /* Unpacking */
+               unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+               unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+               expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
+                                       &xmm_mask_lo, &xmm_mask_hi);
+
+               in_over_2x128 (&xmm_src, &xmm_src,
+                              &xmm_alpha, &xmm_alpha,
+                              &xmm_mask_lo, &xmm_mask_hi,
+                              &xmm_dst_lo, &xmm_dst_hi);
+
+               save_128_aligned (
+                   (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+           }
+
+           w -= 4;
+           dst += 4;
+           mask += 4;
+       }
+
+       while (w)
+       {
+           uint8_t m = *mask++;
+
+           if (m)
+           {
+               d = *dst;
+               mmx_mask = expand_pixel_8_1x64 (m);
+               mmx_dest = unpack_32_1x64 (d);
+
+               *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
+                                                  &mmx_alpha,
+                                                  &mmx_mask,
+                                                  &mmx_dest));
+           }
+
+           w--;
+           dst++;
+       }
     }
 
-    _mm_empty();
+    _mm_empty ();
 }
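
The in_over_1x64 / in_over_2x128 helpers used above compute "(solid source IN a8 mask) OVER destination" on premultiplied a8r8g8b8 pixels. As a rough scalar reference (an illustrative sketch only; mul_un8 and in_over_pixel are made-up names, not pixman helpers), the per-pixel arithmetic looks like this:

    #include <stdint.h>

    /* x * y / 255 with the usual rounding for 8-bit fixed-point values */
    static uint8_t
    mul_un8 (uint8_t x, uint8_t y)
    {
        uint16_t t = x * y + 0x80;
        return (t + (t >> 8)) >> 8;
    }

    /* One pixel of (src IN mask) OVER dest, everything premultiplied */
    static uint32_t
    in_over_pixel (uint32_t src, uint8_t mask, uint32_t dest)
    {
        uint8_t  masked_alpha = mul_un8 (src >> 24, mask);
        uint32_t result = 0;
        int      shift;

        for (shift = 0; shift < 32; shift += 8)
        {
            uint8_t s = (src  >> shift) & 0xff;
            uint8_t d = (dest >> shift) & 0xff;

            /* source scaled by the mask, plus whatever of the destination
             * the masked source alpha leaves visible; cannot overflow for
             * valid premultiplied input */
            result |= (uint32_t) (mul_un8 (s, mask) +
                                  mul_un8 (d, 255 - masked_alpha)) << shift;
        }

        return result;
    }

The SSE2 loops perform the same arithmetic on two or four pixels at once in 16-bit lanes, which is why they unpack with _mm_unpacklo_epi8 before blending and pack the result back down afterwards.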
 
-/* -------------------------------------------------------------------------------------------------
- * fast_composite_over_n_8_8888
+/* ----------------------------------------------------------------
+ * composite_over_n_8_8888
  */
 
 pixman_bool_t
 pixman_fill_sse2 (uint32_t *bits,
-                int stride,
-                int bpp,
-                int x,
-                int y,
-                int width,
-                int height,
-                uint32_t data)
+                  int       stride,
+                  int       bpp,
+                  int       x,
+                  int       y,
+                  int       width,
+                  int       height,
+                  uint32_t  data)
 {
-    uint32_t   byte_width;
-    uint8_t        *byte_line;
+    uint32_t byte_width;
+    uint8_t         *byte_line;
 
     __m128i xmm_def;
 
@@ -3360,17 +3820,17 @@ pixman_fill_sse2 (uint32_t *bits,
 
     if (bpp == 16)
     {
-        stride = stride * (int) sizeof (uint32_t) / 2;
-        byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
-        byte_width = 2 * width;
-        stride *= 2;
+       stride = stride * (int) sizeof (uint32_t) / 2;
+       byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
+       byte_width = 2 * width;
+       stride *= 2;
     }
     else
     {
-        stride = stride * (int) sizeof (uint32_t) / 4;
-        byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
-        byte_width = 4 * width;
-        stride *= 4;
+       stride = stride * (int) sizeof (uint32_t) / 4;
+       byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
+       byte_width = 4 * width;
+       stride *= 4;
     }
 
     cache_prefetch ((__m128i*)byte_line);
@@ -3378,258 +3838,268 @@ pixman_fill_sse2 (uint32_t *bits,
 
     while (height--)
     {
-        int w;
-        uint8_t *d = byte_line;
-        byte_line += stride;
-        w = byte_width;
-
-
-        cache_prefetch_next ((__m128i*)d);
-
-        while (w >= 2 && ((unsigned long)d & 3))
-        {
-            *(uint16_t *)d = data;
-            w -= 2;
-            d += 2;
-        }
-
-        while (w >= 4 && ((unsigned long)d & 15))
-        {
-            *(uint32_t *)d = data;
-
-            w -= 4;
-            d += 4;
-        }
-
-        cache_prefetch_next ((__m128i*)d);
-
-        while (w >= 128)
-        {
-            cache_prefetch (((__m128i*)d) + 12);
-
-            save_128_aligned ((__m128i*)(d),     xmm_def);
-            save_128_aligned ((__m128i*)(d+16),  xmm_def);
-            save_128_aligned ((__m128i*)(d+32),  xmm_def);
-            save_128_aligned ((__m128i*)(d+48),  xmm_def);
-            save_128_aligned ((__m128i*)(d+64),  xmm_def);
-            save_128_aligned ((__m128i*)(d+80),  xmm_def);
-            save_128_aligned ((__m128i*)(d+96),  xmm_def);
-            save_128_aligned ((__m128i*)(d+112), xmm_def);
-
-            d += 128;
-            w -= 128;
-        }
-
-        if (w >= 64)
-        {
-            cache_prefetch (((__m128i*)d) + 8);
-
-            save_128_aligned ((__m128i*)(d),     xmm_def);
-            save_128_aligned ((__m128i*)(d+16),  xmm_def);
-            save_128_aligned ((__m128i*)(d+32),  xmm_def);
-            save_128_aligned ((__m128i*)(d+48),  xmm_def);
-
-            d += 64;
-            w -= 64;
-        }
-
-        cache_prefetch_next ((__m128i*)d);
-
-        if (w >= 32)
-        {
-            save_128_aligned ((__m128i*)(d),     xmm_def);
-            save_128_aligned ((__m128i*)(d+16),  xmm_def);
-
-            d += 32;
-            w -= 32;
-        }
-
-        if (w >= 16)
-        {
-            save_128_aligned ((__m128i*)(d),     xmm_def);
-
-            d += 16;
-            w -= 16;
-        }
-
-        cache_prefetch_next ((__m128i*)d);
-
-        while (w >= 4)
-        {
-            *(uint32_t *)d = data;
-
-            w -= 4;
-            d += 4;
-        }
-
-        if (w >= 2)
-        {
-            *(uint16_t *)d = data;
-            w -= 2;
-            d += 2;
-        }
+       int w;
+       uint8_t *d = byte_line;
+       byte_line += stride;
+       w = byte_width;
+
+
+       cache_prefetch_next ((__m128i*)d);
+
+       while (w >= 2 && ((unsigned long)d & 3))
+       {
+           *(uint16_t *)d = data;
+           w -= 2;
+           d += 2;
+       }
+
+       while (w >= 4 && ((unsigned long)d & 15))
+       {
+           *(uint32_t *)d = data;
+
+           w -= 4;
+           d += 4;
+       }
+
+       cache_prefetch_next ((__m128i*)d);
+
+       while (w >= 128)
+       {
+           cache_prefetch (((__m128i*)d) + 12);
+
+           save_128_aligned ((__m128i*)(d),     xmm_def);
+           save_128_aligned ((__m128i*)(d + 16),  xmm_def);
+           save_128_aligned ((__m128i*)(d + 32),  xmm_def);
+           save_128_aligned ((__m128i*)(d + 48),  xmm_def);
+           save_128_aligned ((__m128i*)(d + 64),  xmm_def);
+           save_128_aligned ((__m128i*)(d + 80),  xmm_def);
+           save_128_aligned ((__m128i*)(d + 96),  xmm_def);
+           save_128_aligned ((__m128i*)(d + 112), xmm_def);
+
+           d += 128;
+           w -= 128;
+       }
+
+       if (w >= 64)
+       {
+           cache_prefetch (((__m128i*)d) + 8);
+
+           save_128_aligned ((__m128i*)(d),     xmm_def);
+           save_128_aligned ((__m128i*)(d + 16),  xmm_def);
+           save_128_aligned ((__m128i*)(d + 32),  xmm_def);
+           save_128_aligned ((__m128i*)(d + 48),  xmm_def);
+
+           d += 64;
+           w -= 64;
+       }
+
+       cache_prefetch_next ((__m128i*)d);
+
+       if (w >= 32)
+       {
+           save_128_aligned ((__m128i*)(d),     xmm_def);
+           save_128_aligned ((__m128i*)(d + 16),  xmm_def);
+
+           d += 32;
+           w -= 32;
+       }
+
+       if (w >= 16)
+       {
+           save_128_aligned ((__m128i*)(d),     xmm_def);
+
+           d += 16;
+           w -= 16;
+       }
+
+       cache_prefetch_next ((__m128i*)d);
+
+       while (w >= 4)
+       {
+           *(uint32_t *)d = data;
+
+           w -= 4;
+           d += 4;
+       }
+
+       if (w >= 2)
+       {
+           *(uint16_t *)d = data;
+           w -= 2;
+           d += 2;
+       }
     }
 
-    _mm_empty();
+    _mm_empty ();
     return TRUE;
 }
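
pixman_fill_sse2 follows a common SIMD fill pattern: scalar stores until the destination pointer is 16-byte aligned, long runs of aligned 128-bit stores (unrolled up to 128 bytes per iteration, with prefetch hints), then a scalar tail for whatever is left. A minimal sketch of that shape for the 32bpp case, leaving out the unrolling, the prefetching and the 16bpp handling (fill_row_sse2 is an illustrative name, not a pixman function):

    #include <stdint.h>
    #include <stddef.h>
    #include <emmintrin.h>

    static void
    fill_row_sse2 (uint32_t *d, size_t count, uint32_t data)
    {
        __m128i xmm = _mm_set1_epi32 ((int) data);

        /* scalar head: advance until the pointer is 16-byte aligned */
        while (count && ((uintptr_t) d & 15))
        {
            *d++ = data;
            count--;
        }

        /* aligned 128-bit stores, four pixels at a time */
        while (count >= 4)
        {
            _mm_store_si128 ((__m128i *) d, xmm);
            d += 4;
            count -= 4;
        }

        /* scalar tail */
        while (count--)
            *d++ = data;
    }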
 
 static void
 sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
-                                     pixman_op_t op,
-                                    pixman_image_t * src_image,
-                                    pixman_image_t * mask_image,
-                                    pixman_image_t * dst_image,
-                                    int32_t      src_x,
-                                    int32_t      src_y,
-                                    int32_t      mask_x,
-                                    int32_t      mask_y,
-                                    int32_t      dest_x,
-                                    int32_t      dest_y,
-                                    int32_t     width,
-                                    int32_t     height)
-{
-    uint32_t   src, srca;
-    uint32_t   *dst_line, *dst;
-    uint8_t    *mask_line, *mask;
-    int        dst_stride, mask_stride;
-    uint16_t   w;
-    uint32_t    m;
+                             pixman_op_t              op,
+                             pixman_image_t *         src_image,
+                             pixman_image_t *         mask_image,
+                             pixman_image_t *         dst_image,
+                             int32_t                  src_x,
+                             int32_t                  src_y,
+                             int32_t                  mask_x,
+                             int32_t                  mask_y,
+                             int32_t                  dest_x,
+                             int32_t                  dest_y,
+                             int32_t                  width,
+                             int32_t                  height)
+{
+    uint32_t src, srca;
+    uint32_t    *dst_line, *dst;
+    uint8_t     *mask_line, *mask;
+    int dst_stride, mask_stride;
+    uint16_t w;
+    uint32_t m;
 
     __m128i xmm_src, xmm_def;
     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
 
-    src = _pixman_image_get_solid(src_image, dst_image->bits.format);
+    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
 
     srca = src >> 24;
     if (src == 0)
     {
-        pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride,
-                        PIXMAN_FORMAT_BPP (dst_image->bits.format),
-                        dest_x, dest_y, width, height, 0);
-        return;
+       pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride,
+                         PIXMAN_FORMAT_BPP (dst_image->bits.format),
+                         dest_x, dest_y, width, height, 0);
+       return;
     }
 
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
 
     xmm_def = create_mask_2x32_128 (src, src);
     xmm_src = expand_pixel_32_1x128 (src);
 
     while (height--)
     {
-        dst = dst_line;
-        dst_line += dst_stride;
-        mask = mask_line;
-        mask_line += mask_stride;
-        w = width;
-
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)mask);
-        cache_prefetch ((__m128i*)dst);
-
-        while (w && (unsigned long)dst & 15)
-        {
-            uint8_t m = *mask++;
-
-            if (m)
-            {
-                *dst = pack_1x64_32 (pix_multiply_1x64 (_mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
-            }
-            else
-            {
-                *dst = 0;
-            }
-
-            w--;
-            dst++;
-        }
-
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)mask);
-        cache_prefetch ((__m128i*)dst);
-
-        while (w >= 4)
-        {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)mask);
-            cache_prefetch_next ((__m128i*)dst);
-
-            m = *((uint32_t*)mask);
-
-            if (srca == 0xff && m == 0xffffffff)
-            {
-                save_128_aligned ((__m128i*)dst, xmm_def);
-            }
-            else if (m)
-            {
-                xmm_mask = unpack_32_1x128 (m);
-                xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128());
-
-                /* Unpacking */
-                unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
-
-                expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
-                pix_multiply_2x128 (&xmm_src, &xmm_src, &xmm_mask_lo, &xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
-                save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
-            }
-            else
-            {
-                save_128_aligned ((__m128i*)dst, _mm_setzero_si128());
-            }
-
-            w -= 4;
-            dst += 4;
-            mask += 4;
-        }
-
-        while (w)
-        {
-            uint8_t m = *mask++;
-
-            if (m)
-            {
-                *dst = pack_1x64_32 (pix_multiply_1x64 (_mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
-            }
-            else
-            {
-                *dst = 0;
-            }
-
-            w--;
-            dst++;
-        }
+       dst = dst_line;
+       dst_line += dst_stride;
+       mask = mask_line;
+       mask_line += mask_stride;
+       w = width;
+
+       /* call prefetch hint to optimize cache load*/
+       cache_prefetch ((__m128i*)mask);
+       cache_prefetch ((__m128i*)dst);
+
+       while (w && (unsigned long)dst & 15)
+       {
+           uint8_t m = *mask++;
+
+           if (m)
+           {
+               *dst = pack_1x64_32 (
+                   pix_multiply_1x64 (
+                       _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
+           }
+           else
+           {
+               *dst = 0;
+           }
+
+           w--;
+           dst++;
+       }
+
+       /* call prefetch hint to optimize cache load*/
+       cache_prefetch ((__m128i*)mask);
+       cache_prefetch ((__m128i*)dst);
+
+       while (w >= 4)
+       {
+           /* fill cache line with next memory */
+           cache_prefetch_next ((__m128i*)mask);
+           cache_prefetch_next ((__m128i*)dst);
+
+           m = *((uint32_t*)mask);
+
+           if (srca == 0xff && m == 0xffffffff)
+           {
+               save_128_aligned ((__m128i*)dst, xmm_def);
+           }
+           else if (m)
+           {
+               xmm_mask = unpack_32_1x128 (m);
+               xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
+
+               /* Unpacking */
+               unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+               expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
+                                       &xmm_mask_lo, &xmm_mask_hi);
+
+               pix_multiply_2x128 (&xmm_src, &xmm_src,
+                                   &xmm_mask_lo, &xmm_mask_hi,
+                                   &xmm_mask_lo, &xmm_mask_hi);
+
+               save_128_aligned (
+                   (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
+           }
+           else
+           {
+               save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
+           }
+
+           w -= 4;
+           dst += 4;
+           mask += 4;
+       }
+
+       while (w)
+       {
+           uint8_t m = *mask++;
+
+           if (m)
+           {
+               *dst = pack_1x64_32 (
+                   pix_multiply_1x64 (
+                       _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
+           }
+           else
+           {
+               *dst = 0;
+           }
+
+           w--;
+           dst++;
+       }
     }
 
-    _mm_empty();
+    _mm_empty ();
 }
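
The per-pixel work in sse2_composite_src_n_8_8888 is simply the SRC operator with an a8 mask: every channel of the solid source is scaled by the mask byte and the result replaces the destination outright, so a zero mask writes zero and the srca == 0xff && m == 0xffffffff test can store the precomputed xmm_def as-is. A scalar sketch of one pixel (illustrative names, not pixman helpers):

    #include <stdint.h>

    /* x * y / 255, rounded */
    static uint8_t
    mul_un8 (uint8_t x, uint8_t y)
    {
        uint16_t t = x * y + 0x80;
        return (t + (t >> 8)) >> 8;
    }

    /* SRC with a solid source and an a8 mask: scale every channel by m */
    static uint32_t
    src_n_8_pixel (uint32_t src, uint8_t m)
    {
        uint32_t result = 0;
        int      shift;

        for (shift = 0; shift < 32; shift += 8)
            result |= (uint32_t) mul_un8 ((src >> shift) & 0xff, m) << shift;

        return result;
    }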
 
-/* -------------------------------------------------------------------------------------------------
- * fast_composite_over_n_8_0565
+/* ----------------------------------------------------------------------
+ * composite_over_n_8_0565
  */
 
 static void
 sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
-                                  pixman_op_t op,
-                                 pixman_image_t * src_image,
-                                 pixman_image_t * mask_image,
-                                 pixman_image_t * dst_image,
-                                 int32_t      src_x,
-                                 int32_t      src_y,
-                                 int32_t      mask_x,
-                                 int32_t      mask_y,
-                                 int32_t      dest_x,
-                                 int32_t      dest_y,
-                                 int32_t     width,
-                                 int32_t     height)
-{
-    uint32_t   src, srca;
-    uint16_t   *dst_line, *dst, d;
-    uint8_t    *mask_line, *mask;
-    int        dst_stride, mask_stride;
-    uint16_t   w;
+                              pixman_op_t              op,
+                              pixman_image_t *         src_image,
+                              pixman_image_t *         mask_image,
+                              pixman_image_t *         dst_image,
+                              int32_t                  src_x,
+                              int32_t                  src_y,
+                              int32_t                  mask_x,
+                              int32_t                  mask_y,
+                              int32_t                  dest_x,
+                              int32_t                  dest_y,
+                              int32_t                  width,
+                              int32_t                  height)
+{
+    uint32_t src, srca;
+    uint16_t    *dst_line, *dst, d;
+    uint8_t     *mask_line, *mask;
+    int dst_stride, mask_stride;
+    uint16_t w;
     uint32_t m;
     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
 
@@ -3637,14 +4107,16 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
 
-    src = _pixman_image_get_solid(src_image, dst_image->bits.format);
+    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
 
     srca = src >> 24;
     if (src == 0)
        return;
 
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
 
     xmm_src = expand_pixel_32_1x128 (src);
     xmm_alpha = expand_alpha_1x128 (xmm_src);
@@ -3653,140 +4125,154 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
 
     while (height--)
     {
-        dst = dst_line;
-        dst_line += dst_stride;
-        mask = mask_line;
-        mask_line += mask_stride;
-        w = width;
-
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)mask);
-        cache_prefetch ((__m128i*)dst);
-
-        while (w && (unsigned long)dst & 15)
-        {
-            m = *mask++;
-
-            if (m)
-            {
-                d = *dst;
-                mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
-                mmx_dest = expand565_16_1x64 (d);
-
-                *dst = pack_565_32_16 (pack_1x64_32 (in_over_1x64 (&mmx_src,
-                                                                 &mmx_alpha,
-                                                                 &mmx_mask,
-                                                                 &mmx_dest)));
-            }
-
-            w--;
-            dst++;
-        }
-
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)mask);
-        cache_prefetch ((__m128i*)dst);
-
-        while (w >= 8)
-        {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)mask);
-            cache_prefetch_next ((__m128i*)dst);
-
-            xmm_dst = load_128_aligned ((__m128i*) dst);
-            unpack_565_128_4x128 (xmm_dst, &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
-
-            m = *((uint32_t*)mask);
-            mask += 4;
-
-            if (m)
-            {
-                xmm_mask = unpack_32_1x128 (m);
-                xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128());
-
-                /* Unpacking */
-                unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
-
-                expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-                in_over_2x128 (&xmm_src, &xmm_src, &xmm_alpha, &xmm_alpha, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst0, &xmm_dst1);
-            }
-
-            m = *((uint32_t*)mask);
-            mask += 4;
-
-            if (m)
-            {
-                xmm_mask = unpack_32_1x128 (m);
-                xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128());
-
-                /* Unpacking */
-                unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
-
-                expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-                in_over_2x128 (&xmm_src, &xmm_src, &xmm_alpha, &xmm_alpha, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst2, &xmm_dst3);
-            }
-
-            save_128_aligned ((__m128i*)dst, pack_565_4x128_128 (&xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
-
-            w -= 8;
-            dst += 8;
-        }
-
-        while (w)
-        {
-            m = *mask++;
-
-            if (m)
-            {
-                d = *dst;
-                mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
-                mmx_dest = expand565_16_1x64 (d);
-
-                *dst = pack_565_32_16 (pack_1x64_32 (in_over_1x64 (&mmx_src,
-                                                                 &mmx_alpha,
-                                                                 &mmx_mask,
-                                                                 &mmx_dest)));
-            }
-
-            w--;
-            dst++;
-        }
+       dst = dst_line;
+       dst_line += dst_stride;
+       mask = mask_line;
+       mask_line += mask_stride;
+       w = width;
+
+       /* call prefetch hint to optimize cache load*/
+       cache_prefetch ((__m128i*)mask);
+       cache_prefetch ((__m128i*)dst);
+
+       while (w && (unsigned long)dst & 15)
+       {
+           m = *mask++;
+
+           if (m)
+           {
+               d = *dst;
+               mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
+               mmx_dest = expand565_16_1x64 (d);
+
+               *dst = pack_565_32_16 (
+                   pack_1x64_32 (
+                       in_over_1x64 (
+                           &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
+           }
+
+           w--;
+           dst++;
+       }
+
+       /* call prefetch hint to optimize cache load*/
+       cache_prefetch ((__m128i*)mask);
+       cache_prefetch ((__m128i*)dst);
+
+       while (w >= 8)
+       {
+           /* fill cache line with next memory */
+           cache_prefetch_next ((__m128i*)mask);
+           cache_prefetch_next ((__m128i*)dst);
+
+           xmm_dst = load_128_aligned ((__m128i*) dst);
+           unpack_565_128_4x128 (xmm_dst,
+                                 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
+
+           m = *((uint32_t*)mask);
+           mask += 4;
+
+           if (m)
+           {
+               xmm_mask = unpack_32_1x128 (m);
+               xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
+
+               /* Unpacking */
+               unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+               expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
+                                       &xmm_mask_lo, &xmm_mask_hi);
+
+               in_over_2x128 (&xmm_src, &xmm_src,
+                              &xmm_alpha, &xmm_alpha,
+                              &xmm_mask_lo, &xmm_mask_hi,
+                              &xmm_dst0, &xmm_dst1);
+           }
+
+           m = *((uint32_t*)mask);
+           mask += 4;
+
+           if (m)
+           {
+               xmm_mask = unpack_32_1x128 (m);
+               xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
+
+               /* Unpacking */
+               unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+               expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
+                                       &xmm_mask_lo, &xmm_mask_hi);
+               in_over_2x128 (&xmm_src, &xmm_src,
+                              &xmm_alpha, &xmm_alpha,
+                              &xmm_mask_lo, &xmm_mask_hi,
+                              &xmm_dst2, &xmm_dst3);
+           }
+
+           save_128_aligned (
+               (__m128i*)dst, pack_565_4x128_128 (
+                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
+
+           w -= 8;
+           dst += 8;
+       }
+
+       while (w)
+       {
+           m = *mask++;
+
+           if (m)
+           {
+               d = *dst;
+               mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
+               mmx_dest = expand565_16_1x64 (d);
+
+               *dst = pack_565_32_16 (
+                   pack_1x64_32 (
+                       in_over_1x64 (
+                           &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
+           }
+
+           w--;
+           dst++;
+       }
     }
 
-    _mm_empty();
+    _mm_empty ();
 }
 
-/* -------------------------------------------------------------------------------------------------
- * fast_composite_over_pixbuf_0565
+/* -----------------------------------------------------------------------
+ * composite_over_pixbuf_0565
  */
 
 static void
 sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
-                                  pixman_op_t op,
-                                 pixman_image_t * src_image,
-                                 pixman_image_t * mask_image,
-                                 pixman_image_t * dst_image,
-                                 int32_t      src_x,
-                                 int32_t      src_y,
-                                 int32_t      mask_x,
-                                 int32_t      mask_y,
-                                 int32_t      dest_x,
-                                 int32_t      dest_y,
-                                 int32_t     width,
-                                 int32_t     height)
-{
-    uint16_t   *dst_line, *dst, d;
-    uint32_t   *src_line, *src, s;
-    int                dst_stride, src_stride;
-    uint16_t   w;
-    uint32_t    opaque, zero;
+                                 pixman_op_t              op,
+                                 pixman_image_t *         src_image,
+                                 pixman_image_t *         mask_image,
+                                 pixman_image_t *         dst_image,
+                                 int32_t                  src_x,
+                                 int32_t                  src_y,
+                                 int32_t                  mask_x,
+                                 int32_t                  mask_y,
+                                 int32_t                  dest_x,
+                                 int32_t                  dest_y,
+                                 int32_t                  width,
+                                 int32_t                  height)
+{
+    uint16_t    *dst_line, *dst, d;
+    uint32_t    *src_line, *src, s;
+    int dst_stride, src_stride;
+    uint16_t w;
+    uint32_t opaque, zero;
 
     __m64 ms;
     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
 
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
 
 #if 0
     /* FIXME
@@ -3799,126 +4285,139 @@ sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
 
     while (height--)
     {
-        dst = dst_line;
-        dst_line += dst_stride;
-        src = src_line;
-        src_line += src_stride;
-        w = width;
-
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)src);
-        cache_prefetch ((__m128i*)dst);
-
-        while (w && (unsigned long)dst & 15)
-        {
-            s = *src++;
-            d = *dst;
-
-            ms = unpack_32_1x64 (s);
-
-            *dst++ = pack_565_32_16 (pack_1x64_32 (over_rev_non_pre_1x64(ms, expand565_16_1x64 (d))));
-            w--;
-        }
-
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)src);
-        cache_prefetch ((__m128i*)dst);
-
-        while (w >= 8)
-        {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)src);
-            cache_prefetch_next ((__m128i*)dst);
-
-            /* First round */
-            xmm_src = load_128_unaligned((__m128i*)src);
-            xmm_dst = load_128_aligned  ((__m128i*)dst);
-
-            opaque = is_opaque (xmm_src);
-           zero = is_zero (xmm_src);
+       dst = dst_line;
+       dst_line += dst_stride;
+       src = src_line;
+       src_line += src_stride;
+       w = width;
+
+       /* call prefetch hint to optimize cache load*/
+       cache_prefetch ((__m128i*)src);
+       cache_prefetch ((__m128i*)dst);
+
+       while (w && (unsigned long)dst & 15)
+       {
+           s = *src++;
+           d = *dst;
+
+           ms = unpack_32_1x64 (s);
+
+           *dst++ = pack_565_32_16 (
+               pack_1x64_32 (
+                   over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
+           w--;
+       }
+
+       /* call prefetch hint to optimize cache load*/
+       cache_prefetch ((__m128i*)src);
+       cache_prefetch ((__m128i*)dst);
+
+       while (w >= 8)
+       {
+           /* fill cache line with next memory */
+           cache_prefetch_next ((__m128i*)src);
+           cache_prefetch_next ((__m128i*)dst);
+
+           /* First round */
+           xmm_src = load_128_unaligned ((__m128i*)src);
+           xmm_dst = load_128_aligned  ((__m128i*)dst);
 
-           unpack_565_128_4x128 (xmm_dst, &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
-            unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
-
-            /* preload next round*/
-            xmm_src = load_128_unaligned((__m128i*)(src+4));
-           
-            if (opaque)
-            {
-                invert_colors_2x128 (xmm_src_lo, xmm_src_hi, &xmm_dst0, &xmm_dst1);
-            }
-            else if (!zero)
-            {
-                over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi, &xmm_dst0, &xmm_dst1);
-            }
-
-            /* Second round */
            opaque = is_opaque (xmm_src);
            zero = is_zero (xmm_src);
 
-            unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
-
-            if (opaque)
-            {
-                invert_colors_2x128 (xmm_src_lo, xmm_src_hi, &xmm_dst2, &xmm_dst3);
-            }
-            else if (zero)
-            {
-                over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi, &xmm_dst2, &xmm_dst3);
-            }
-
-            save_128_aligned ((__m128i*)dst, pack_565_4x128_128 (&xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
-
-            w -= 8;
-            src += 8;
-            dst += 8;
-        }
-
-        while (w)
-        {
-            s = *src++;
-            d = *dst;
-
-            ms = unpack_32_1x64 (s);
+           unpack_565_128_4x128 (xmm_dst,
+                                 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
+           unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+
+           /* preload next round*/
+           xmm_src = load_128_unaligned ((__m128i*)(src + 4));
+
+           if (opaque)
+           {
+               invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
+                                    &xmm_dst0, &xmm_dst1);
+           }
+           else if (!zero)
+           {
+               over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
+                                       &xmm_dst0, &xmm_dst1);
+           }
+
+           /* Second round */
+           opaque = is_opaque (xmm_src);
+           zero = is_zero (xmm_src);
 
-            *dst++ = pack_565_32_16 (pack_1x64_32 (over_rev_non_pre_1x64(ms, expand565_16_1x64 (d))));
-            w--;
-        }
+           unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+
+           if (opaque)
+           {
+               invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
+                                    &xmm_dst2, &xmm_dst3);
+           }
+           else if (!zero)
+           {
+               over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
+                                       &xmm_dst2, &xmm_dst3);
+           }
+
+           save_128_aligned (
+               (__m128i*)dst, pack_565_4x128_128 (
+                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
+
+           w -= 8;
+           src += 8;
+           dst += 8;
+       }
+
+       while (w)
+       {
+           s = *src++;
+           d = *dst;
+
+           ms = unpack_32_1x64 (s);
+
+           *dst++ = pack_565_32_16 (
+               pack_1x64_32 (
+                   over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
+           w--;
+       }
     }
 
-    _mm_empty();
+    _mm_empty ();
 }
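
is_opaque and is_zero are defined earlier in the file; they classify each group of four source pixels so that fully opaque groups only need the channel reordering done by invert_colors_2x128 and fully transparent groups leave the destination alone. One plausible way to write such tests with SSE2 (a sketch under that assumption; four_pixels_opaque and four_pixels_zero are illustrative names):

    #include <emmintrin.h>

    /* nonzero if the alpha byte of all four packed pixels is 0xff
     * (byte lanes 3, 7, 11 and 15 of a little-endian 32bpp load) */
    static int
    four_pixels_opaque (__m128i x)
    {
        __m128i ones = _mm_cmpeq_epi8 (x, x);    /* every byte = 0xff */
        int     bits = _mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ones));

        return (bits & 0x8888) == 0x8888;
    }

    /* nonzero if all sixteen bytes are zero */
    static int
    four_pixels_zero (__m128i x)
    {
        return _mm_movemask_epi8 (
            _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
    }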
 
-/* -------------------------------------------------------------------------------------------------
- * fast_composite_over_pixbuf_8888
+/* -------------------------------------------------------------------------
+ * composite_over_pixbuf_8888
  */
 
 static void
 sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
-                                  pixman_op_t op,
-                                 pixman_image_t * src_image,
-                                 pixman_image_t * mask_image,
-                                 pixman_image_t * dst_image,
-                                 int32_t      src_x,
-                                 int32_t      src_y,
-                                 int32_t      mask_x,
-                                 int32_t      mask_y,
-                                 int32_t      dest_x,
-                                 int32_t      dest_y,
-                                 int32_t     width,
-                                 int32_t     height)
-{
-    uint32_t   *dst_line, *dst, d;
-    uint32_t   *src_line, *src, s;
-    int        dst_stride, src_stride;
-    uint16_t   w;
-    uint32_t    opaque, zero;
+                                 pixman_op_t              op,
+                                 pixman_image_t *         src_image,
+                                 pixman_image_t *         mask_image,
+                                 pixman_image_t *         dst_image,
+                                 int32_t                  src_x,
+                                 int32_t                  src_y,
+                                 int32_t                  mask_x,
+                                 int32_t                  mask_y,
+                                 int32_t                  dest_x,
+                                 int32_t                  dest_y,
+                                 int32_t                  width,
+                                 int32_t                  height)
+{
+    uint32_t    *dst_line, *dst, d;
+    uint32_t    *src_line, *src, s;
+    int dst_stride, src_stride;
+    uint16_t w;
+    uint32_t opaque, zero;
 
     __m128i xmm_src_lo, xmm_src_hi;
     __m128i xmm_dst_lo, xmm_dst_hi;
 
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
 
 #if 0
     /* FIXME
@@ -3931,102 +4430,110 @@ sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
 
     while (height--)
     {
-        dst = dst_line;
-        dst_line += dst_stride;
-        src = src_line;
-        src_line += src_stride;
-        w = width;
+       dst = dst_line;
+       dst_line += dst_stride;
+       src = src_line;
+       src_line += src_stride;
+       w = width;
 
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)src);
-        cache_prefetch ((__m128i*)dst);
+       /* call prefetch hint to optimize cache load*/
+       cache_prefetch ((__m128i*)src);
+       cache_prefetch ((__m128i*)dst);
 
-        while (w && (unsigned long)dst & 15)
-        {
-            s = *src++;
-            d = *dst;
+       while (w && (unsigned long)dst & 15)
+       {
+           s = *src++;
+           d = *dst;
 
-            *dst++ = pack_1x64_32 (over_rev_non_pre_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
+           *dst++ = pack_1x64_32 (
+               over_rev_non_pre_1x64 (
+                   unpack_32_1x64 (s), unpack_32_1x64 (d)));
 
-            w--;
-        }
+           w--;
+       }
 
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)src);
-        cache_prefetch ((__m128i*)dst);
+       /* call prefetch hint to optimize cache load*/
+       cache_prefetch ((__m128i*)src);
+       cache_prefetch ((__m128i*)dst);
 
-        while (w >= 4)
-        {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)src);
-            cache_prefetch_next ((__m128i*)dst);
+       while (w >= 4)
+       {
+           /* fill cache line with next memory */
+           cache_prefetch_next ((__m128i*)src);
+           cache_prefetch_next ((__m128i*)dst);
 
-            xmm_src_hi = load_128_unaligned((__m128i*)src);
+           xmm_src_hi = load_128_unaligned ((__m128i*)src);
 
-            opaque = is_opaque (xmm_src_hi);
+           opaque = is_opaque (xmm_src_hi);
            zero = is_zero (xmm_src_hi);
 
-            unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+           unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
 
-            if (opaque)
-            {
-                invert_colors_2x128( xmm_src_lo, xmm_src_hi, &xmm_dst_lo, &xmm_dst_hi);
+           if (opaque)
+           {
+               invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
+                                    &xmm_dst_lo, &xmm_dst_hi);
 
-                save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-            }
-            else if (!zero)
-            {
-                xmm_dst_hi = load_128_aligned  ((__m128i*)dst);
+               save_128_aligned (
+                   (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+           }
+           else if (!zero)
+           {
+               xmm_dst_hi = load_128_aligned  ((__m128i*)dst);
 
-                unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+               unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
 
-                over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi, &xmm_dst_lo, &xmm_dst_hi);
+               over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
+                                       &xmm_dst_lo, &xmm_dst_hi);
 
-                save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-            }
+               save_128_aligned (
+                   (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+           }
 
-            w -= 4;
-            dst += 4;
-            src += 4;
-        }
+           w -= 4;
+           dst += 4;
+           src += 4;
+       }
 
-        while (w)
-        {
-            s = *src++;
-            d = *dst;
+       while (w)
+       {
+           s = *src++;
+           d = *dst;
 
-            *dst++ = pack_1x64_32 (over_rev_non_pre_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
+           *dst++ = pack_1x64_32 (
+               over_rev_non_pre_1x64 (
+                   unpack_32_1x64 (s), unpack_32_1x64 (d)));
 
-            w--;
-        }
+           w--;
+       }
     }
 
-    _mm_empty();
+    _mm_empty ();
 }
 
 /* -------------------------------------------------------------------------------------------------
- * fast_composite_over_n_8888_0565_ca
+ * composite_over_n_8888_0565_ca
  */
 
 static void
 sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
-                                      pixman_op_t op,
-                                     pixman_image_t * src_image,
-                                     pixman_image_t * mask_image,
-                                     pixman_image_t * dst_image,
-                                     int32_t      src_x,
-                                     int32_t      src_y,
-                                     int32_t      mask_x,
-                                     int32_t      mask_y,
-                                     int32_t      dest_x,
-                                     int32_t      dest_y,
-                                     int32_t     width,
-                                     int32_t     height)
-{
-    uint32_t   src;
-    uint16_t   *dst_line, *dst, d;
-    uint32_t   *mask_line, *mask, m;
-    int        dst_stride, mask_stride;
+                                    pixman_op_t              op,
+                                    pixman_image_t *         src_image,
+                                    pixman_image_t *         mask_image,
+                                    pixman_image_t *         dst_image,
+                                    int32_t                  src_x,
+                                    int32_t                  src_y,
+                                    int32_t                  mask_x,
+                                    int32_t                  mask_y,
+                                    int32_t                  dest_x,
+                                    int32_t                  dest_y,
+                                    int32_t                  width,
+                                    int32_t                  height)
+{
+    uint32_t src;
+    uint16_t    *dst_line, *dst, d;
+    uint32_t    *mask_line, *mask, m;
+    int dst_stride, mask_stride;
     int w;
     uint32_t pack_cmp;
 
@@ -4036,13 +4543,15 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
 
     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
 
-    src = _pixman_image_get_solid(src_image, dst_image->bits.format);
+    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
 
     if (src == 0)
-        return;
+       return;
 
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
 
     xmm_src = expand_pixel_32_1x128 (src);
     xmm_alpha = expand_alpha_1x128 (xmm_src);
@@ -4051,510 +4560,559 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
 
     while (height--)
     {
-        w = width;
-        mask = mask_line;
-        dst = dst_line;
-        mask_line += mask_stride;
-        dst_line += dst_stride;
-
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)mask);
-        cache_prefetch ((__m128i*)dst);
-
-        while (w && ((unsigned long)dst & 15))
-        {
-            m = *(uint32_t *) mask;
-
-            if (m)
-            {
-                d = *dst;
-                mmx_mask = unpack_32_1x64 (m);
-                mmx_dest = expand565_16_1x64 (d);
-
-                *dst = pack_565_32_16 (pack_1x64_32 (in_over_1x64 (&mmx_src,
-                                                                 &mmx_alpha,
-                                                                 &mmx_mask,
-                                                                 &mmx_dest)));
-            }
-
-            w--;
-            dst++;
-            mask++;
-        }
-
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)mask);
-        cache_prefetch ((__m128i*)dst);
-
-        while (w >= 8)
-        {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)mask);
-            cache_prefetch_next ((__m128i*)dst);
-
-            /* First round */
-            xmm_mask = load_128_unaligned((__m128i*)mask);
-            xmm_dst = load_128_aligned((__m128i*)dst);
-
-            pack_cmp = _mm_movemask_epi8 (_mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128()));
-
-            unpack_565_128_4x128 (xmm_dst, &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
-            unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
-
-            /* preload next round*/
-            xmm_mask = load_128_unaligned((__m128i*)(mask+4));
-            /* preload next round*/
-
-            if (pack_cmp != 0xffff)
-            {
-                in_over_2x128(&xmm_src, &xmm_src, &xmm_alpha, &xmm_alpha, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst0, &xmm_dst1);
-            }
-
-            /* Second round */
-            pack_cmp = _mm_movemask_epi8 (_mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128()));
-
-            unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
-
-            if (pack_cmp != 0xffff)
-            {
-                in_over_2x128(&xmm_src, &xmm_src, &xmm_alpha, &xmm_alpha, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst2, &xmm_dst3);
-            }
-
-            save_128_aligned ((__m128i*)dst, pack_565_4x128_128 (&xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
-
-            w -= 8;
-            dst += 8;
-            mask += 8;
-        }
-
-        while (w)
-        {
-            m = *(uint32_t *) mask;
-
-            if (m)
-            {
-                d = *dst;
-                mmx_mask = unpack_32_1x64 (m);
-                mmx_dest = expand565_16_1x64 (d);
-
-                *dst = pack_565_32_16 (pack_1x64_32 (in_over_1x64 (&mmx_src,
-                                                                 &mmx_alpha,
-                                                                 &mmx_mask,
-                                                                 &mmx_dest)));
-            }
-
-            w--;
-            dst++;
-            mask++;
-        }
+       w = width;
+       mask = mask_line;
+       dst = dst_line;
+       mask_line += mask_stride;
+       dst_line += dst_stride;
+
+       /* call prefetch hint to optimize cache load*/
+       cache_prefetch ((__m128i*)mask);
+       cache_prefetch ((__m128i*)dst);
+
+       while (w && ((unsigned long)dst & 15))
+       {
+           m = *(uint32_t *) mask;
+
+           if (m)
+           {
+               d = *dst;
+               mmx_mask = unpack_32_1x64 (m);
+               mmx_dest = expand565_16_1x64 (d);
+
+               *dst = pack_565_32_16 (
+                   pack_1x64_32 (
+                       in_over_1x64 (
+                           &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
+           }
+
+           w--;
+           dst++;
+           mask++;
+       }
+
+       /* call prefetch hint to optimize cache load*/
+       cache_prefetch ((__m128i*)mask);
+       cache_prefetch ((__m128i*)dst);
+
+       while (w >= 8)
+       {
+           /* fill cache line with next memory */
+           cache_prefetch_next ((__m128i*)mask);
+           cache_prefetch_next ((__m128i*)dst);
+
+           /* First round */
+           xmm_mask = load_128_unaligned ((__m128i*)mask);
+           xmm_dst = load_128_aligned ((__m128i*)dst);
+
+           pack_cmp = _mm_movemask_epi8 (
+               _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
+
+           unpack_565_128_4x128 (xmm_dst,
+                                 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
+           unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+           /* preload next round */
+           xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
+
+           if (pack_cmp != 0xffff)
+           {
+               in_over_2x128 (&xmm_src, &xmm_src,
+                              &xmm_alpha, &xmm_alpha,
+                              &xmm_mask_lo, &xmm_mask_hi,
+                              &xmm_dst0, &xmm_dst1);
+           }
+
+           /* Second round */
+           pack_cmp = _mm_movemask_epi8 (
+               _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
+
+           unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+           if (pack_cmp != 0xffff)
+           {
+               in_over_2x128 (&xmm_src, &xmm_src,
+                              &xmm_alpha, &xmm_alpha,
+                              &xmm_mask_lo, &xmm_mask_hi,
+                              &xmm_dst2, &xmm_dst3);
+           }
+
+           save_128_aligned (
+               (__m128i*)dst, pack_565_4x128_128 (
+                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
+
+           w -= 8;
+           dst += 8;
+           mask += 8;
+       }
+
+       while (w)
+       {
+           m = *(uint32_t *) mask;
+
+           if (m)
+           {
+               d = *dst;
+               mmx_mask = unpack_32_1x64 (m);
+               mmx_dest = expand565_16_1x64 (d);
+
+               *dst = pack_565_32_16 (
+                   pack_1x64_32 (
+                       in_over_1x64 (
+                           &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
+           }
+
+           w--;
+           dst++;
+           mask++;
+       }
     }
 
     _mm_empty ();
 }
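
In the component-alpha case the mask carries a separate 8-bit weight for each channel, so the IN-OVER step is applied channel by channel; the pack_cmp test only skips four-pixel groups whose mask words are all zero. A scalar sketch of one channel, ignoring the r5g6b5 unpack and repack that surrounds it in the loop above (illustrative names, not pixman helpers):

    #include <stdint.h>

    /* x * y / 255, rounded */
    static uint8_t
    mul_un8 (uint8_t x, uint8_t y)
    {
        uint16_t t = x * y + 0x80;
        return (t + (t >> 8)) >> 8;
    }

    /* One channel of component-alpha IN-OVER: the mask channel scales the
     * source channel, and the source alpha scaled by that same mask channel
     * decides how much of the destination survives. */
    static uint8_t
    in_over_ca_channel (uint8_t s, uint8_t sa, uint8_t m, uint8_t d)
    {
        return mul_un8 (s, m) + mul_un8 (d, 255 - mul_un8 (sa, m));
    }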
 
-/* -------------------------------------------------------------------------------------------------
- * fast_composite_in_n_8_8
+/* -----------------------------------------------------------------------
+ * composite_in_n_8_8
  */
 
 static void
 sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
-                        pixman_op_t op,
-                       pixman_image_t * src_image,
-                       pixman_image_t * mask_image,
-                       pixman_image_t * dst_image,
-                       int32_t      src_x,
-                       int32_t      src_y,
-                       int32_t      mask_x,
-                       int32_t      mask_y,
-                       int32_t      dest_x,
-                       int32_t      dest_y,
-                       int32_t     width,
-                       int32_t     height)
-{
-    uint8_t    *dst_line, *dst;
-    uint8_t    *mask_line, *mask;
-    int        dst_stride, mask_stride;
-    uint16_t   w, d, m;
-    uint32_t   src;
-    uint8_t    sa;
+                         pixman_op_t              op,
+                         pixman_image_t *         src_image,
+                         pixman_image_t *         mask_image,
+                         pixman_image_t *         dst_image,
+                         int32_t                  src_x,
+                         int32_t                  src_y,
+                         int32_t                  mask_x,
+                         int32_t                  mask_y,
+                         int32_t                  dest_x,
+                         int32_t                  dest_y,
+                         int32_t                  width,
+                         int32_t                  height)
+{
+    uint8_t     *dst_line, *dst;
+    uint8_t     *mask_line, *mask;
+    int dst_stride, mask_stride;
+    uint16_t w, d, m;
+    uint32_t src;
+    uint8_t sa;
 
     __m128i xmm_alpha;
     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
 
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
 
-    src = _pixman_image_get_solid(src_image, dst_image->bits.format);
+    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
 
     sa = src >> 24;
     if (sa == 0)
-        return;
+       return;
 
     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
 
     while (height--)
     {
-        dst = dst_line;
-        dst_line += dst_stride;
-        mask = mask_line;
-        mask_line += mask_stride;
-        w = width;
-
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)mask);
-        cache_prefetch ((__m128i*)dst);
-
-        while (w && ((unsigned long)dst & 15))
-        {
-            m = (uint32_t) *mask++;
-            d = (uint32_t) *dst;
-
-            *dst++ = (uint8_t) pack_1x64_32 (pix_multiply_1x64 (pix_multiply_1x64 (_mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
-                                                               unpack_32_1x64 (d)));
-            w--;
-        }
-
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)mask);
-        cache_prefetch ((__m128i*)dst);
-
-        while (w >= 16)
-        {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)mask);
-            cache_prefetch_next ((__m128i*)dst);
-
-            xmm_mask = load_128_unaligned((__m128i*)mask);
-            xmm_dst = load_128_aligned((__m128i*)dst);
-
-            unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
-            unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-
-            pix_multiply_2x128 (&xmm_alpha, &xmm_alpha, &xmm_mask_lo, &xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-            pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-
-            save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
-            mask += 16;
-            dst += 16;
-            w -= 16;
-        }
-
-        while (w)
-        {
-            m = (uint32_t) *mask++;
-            d = (uint32_t) *dst;
-
-            *dst++ = (uint8_t) pack_1x64_32 (pix_multiply_1x64 (pix_multiply_1x64 (_mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
-                                                               unpack_32_1x64 (d)));
-            w--;
-        }
+       dst = dst_line;
+       dst_line += dst_stride;
+       mask = mask_line;
+       mask_line += mask_stride;
+       w = width;
+
+       /* call prefetch hint to optimize cache load*/
+       cache_prefetch ((__m128i*)mask);
+       cache_prefetch ((__m128i*)dst);
+
+       while (w && ((unsigned long)dst & 15))
+       {
+           m = (uint32_t) *mask++;
+           d = (uint32_t) *dst;
+
+           *dst++ = (uint8_t) pack_1x64_32 (
+               pix_multiply_1x64 (
+                   pix_multiply_1x64 (_mm_movepi64_pi64 (xmm_alpha),
+                                      unpack_32_1x64 (m)),
+                   unpack_32_1x64 (d)));
+           w--;
+       }
+
+       /* call prefetch hint to optimize cache load*/
+       cache_prefetch ((__m128i*)mask);
+       cache_prefetch ((__m128i*)dst);
+
+       while (w >= 16)
+       {
+           /* fill cache line with next memory */
+           cache_prefetch_next ((__m128i*)mask);
+           cache_prefetch_next ((__m128i*)dst);
+
+           xmm_mask = load_128_unaligned ((__m128i*)mask);
+           xmm_dst = load_128_aligned ((__m128i*)dst);
+
+           unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+           unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+           pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
+                               &xmm_mask_lo, &xmm_mask_hi,
+                               &xmm_mask_lo, &xmm_mask_hi);
+
+           pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
+                               &xmm_dst_lo, &xmm_dst_hi,
+                               &xmm_dst_lo, &xmm_dst_hi);
+
+           save_128_aligned (
+               (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+           mask += 16;
+           dst += 16;
+           w -= 16;
+       }
+
+       while (w)
+       {
+           m = (uint32_t) *mask++;
+           d = (uint32_t) *dst;
+
+           *dst++ = (uint8_t) pack_1x64_32 (
+               pix_multiply_1x64 (
+                   pix_multiply_1x64 (
+                       _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
+                   unpack_32_1x64 (d)));
+           w--;
+       }
     }
 
-    _mm_empty();
+    _mm_empty ();
 }
 
-/* -------------------------------------------------------------------------------------------------
- * fast_composite_in_8_8
+/* ---------------------------------------------------------------------------
+ * composite_in_8_8
  */
 
 static void
 sse2_composite_in_8_8 (pixman_implementation_t *imp,
-                      pixman_op_t op,
-                     pixman_image_t * src_image,
-                     pixman_image_t * mask_image,
-                     pixman_image_t * dst_image,
-                     int32_t      src_x,
-                     int32_t      src_y,
-                     int32_t      mask_x,
-                     int32_t      mask_y,
-                     int32_t      dest_x,
-                     int32_t      dest_y,
-                     int32_t     width,
-                     int32_t     height)
-{
-    uint8_t    *dst_line, *dst;
-    uint8_t    *src_line, *src;
-    int        src_stride, dst_stride;
-    uint16_t   w;
-    uint32_t    s, d;
+                       pixman_op_t              op,
+                       pixman_image_t *         src_image,
+                       pixman_image_t *         mask_image,
+                       pixman_image_t *         dst_image,
+                       int32_t                  src_x,
+                       int32_t                  src_y,
+                       int32_t                  mask_x,
+                       int32_t                  mask_y,
+                       int32_t                  dest_x,
+                       int32_t                  dest_y,
+                       int32_t                  width,
+                       int32_t                  height)
+{
+    uint8_t     *dst_line, *dst;
+    uint8_t     *src_line, *src;
+    int src_stride, dst_stride;
+    uint16_t w;
+    uint32_t s, d;
 
     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
 
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
 
     while (height--)
     {
-        dst = dst_line;
-        dst_line += dst_stride;
-        src = src_line;
-        src_line += src_stride;
-        w = width;
-
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)src);
-        cache_prefetch ((__m128i*)dst);
-
-        while (w && ((unsigned long)dst & 15))
-        {
-            s = (uint32_t) *src++;
-            d = (uint32_t) *dst;
-
-            *dst++ = (uint8_t) pack_1x64_32 (pix_multiply_1x64 (unpack_32_1x64 (s),unpack_32_1x64 (d)));
-            w--;
-        }
-
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)src);
-        cache_prefetch ((__m128i*)dst);
-
-        while (w >= 16)
-        {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)src);
-            cache_prefetch_next ((__m128i*)dst);
-
-            xmm_src = load_128_unaligned((__m128i*)src);
-            xmm_dst = load_128_aligned((__m128i*)dst);
-
-            unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
-            unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-
-            pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_dst_lo, &xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
-
-            save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
-            src += 16;
-            dst += 16;
-            w -= 16;
-        }
-
-        while (w)
-        {
-            s = (uint32_t) *src++;
-            d = (uint32_t) *dst;
-
-            *dst++ = (uint8_t) pack_1x64_32 (pix_multiply_1x64 (unpack_32_1x64 (s),unpack_32_1x64 (d)));
-            w--;
-        }
+       dst = dst_line;
+       dst_line += dst_stride;
+       src = src_line;
+       src_line += src_stride;
+       w = width;
+
+       /* call prefetch hint to optimize cache load*/
+       cache_prefetch ((__m128i*)src);
+       cache_prefetch ((__m128i*)dst);
+
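+       /* IN on a8: each destination byte becomes src * dst / 255.
+        * Single pixels are handled here until dst is 16-byte aligned.  */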
+       while (w && ((unsigned long)dst & 15))
+       {
+           s = (uint32_t) *src++;
+           d = (uint32_t) *dst;
+
+           *dst++ = (uint8_t) pack_1x64_32 (
+               pix_multiply_1x64 (
+                   unpack_32_1x64 (s), unpack_32_1x64 (d)));
+           w--;
+       }
+
+       /* call prefetch hint to optimize cache load*/
+       cache_prefetch ((__m128i*)src);
+       cache_prefetch ((__m128i*)dst);
+
+       while (w >= 16)
+       {
+           /* fill cache line with next memory */
+           cache_prefetch_next ((__m128i*)src);
+           cache_prefetch_next ((__m128i*)dst);
+
+           xmm_src = load_128_unaligned ((__m128i*)src);
+           xmm_dst = load_128_aligned ((__m128i*)dst);
+
+           unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+           unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+           pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+                               &xmm_dst_lo, &xmm_dst_hi,
+                               &xmm_dst_lo, &xmm_dst_hi);
+
+           save_128_aligned (
+               (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+           src += 16;
+           dst += 16;
+           w -= 16;
+       }
+
+       while (w)
+       {
+           s = (uint32_t) *src++;
+           d = (uint32_t) *dst;
+
+           *dst++ = (uint8_t) pack_1x64_32 (
+               pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
+           w--;
+       }
     }
 
     _mm_empty ();
 }
 
-/* -------------------------------------------------------------------------------------------------
- * fast_composite_add_8888_8_8
+/* -------------------------------------------------------------------------
+ * composite_add_8888_8_8
  */
 
 static void
 sse2_composite_add_8888_8_8 (pixman_implementation_t *imp,
-                               pixman_op_t op,
-                              pixman_image_t * src_image,
-                              pixman_image_t * mask_image,
-                              pixman_image_t * dst_image,
-                              int32_t      src_x,
-                              int32_t      src_y,
-                              int32_t      mask_x,
-                              int32_t      mask_y,
-                              int32_t      dest_x,
-                              int32_t      dest_y,
-                              int32_t     width,
-                              int32_t     height)
-{
-    uint8_t    *dst_line, *dst;
-    uint8_t    *mask_line, *mask;
-    int        dst_stride, mask_stride;
-    uint16_t   w;
-    uint32_t   src;
-    uint8_t    sa;
+                             pixman_op_t              op,
+                             pixman_image_t *         src_image,
+                             pixman_image_t *         mask_image,
+                             pixman_image_t *         dst_image,
+                             int32_t                  src_x,
+                             int32_t                  src_y,
+                             int32_t                  mask_x,
+                             int32_t                  mask_y,
+                             int32_t                  dest_x,
+                             int32_t                  dest_y,
+                             int32_t                  width,
+                             int32_t                  height)
+{
+    uint8_t     *dst_line, *dst;
+    uint8_t     *mask_line, *mask;
+    int dst_stride, mask_stride;
+    uint16_t w;
+    uint32_t src;
+    uint8_t sa;
     uint32_t m, d;
 
     __m128i xmm_alpha;
     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
 
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
 
-    src = _pixman_image_get_solid(src_image, dst_image->bits.format);
+    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
 
     sa = src >> 24;
     if (sa == 0)
-        return;
+       return;
 
     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
 
     while (height--)
     {
-        dst = dst_line;
-        dst_line += dst_stride;
-        mask = mask_line;
-        mask_line += mask_stride;
-        w = width;
-
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)mask);
-        cache_prefetch ((__m128i*)dst);
-
-        while (w && ((unsigned long)dst & 15))
-        {
-            m = (uint32_t) *mask++;
-            d = (uint32_t) *dst;
-
-            *dst++ = (uint8_t) pack_1x64_32 (_mm_adds_pu16 (pix_multiply_1x64 (_mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
-                                                                              unpack_32_1x64 (d)));
-            w--;
-        }
-
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)mask);
-        cache_prefetch ((__m128i*)dst);
-
-        while (w >= 16)
-        {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)mask);
-            cache_prefetch_next ((__m128i*)dst);
-
-            xmm_mask = load_128_unaligned((__m128i*)mask);
-            xmm_dst = load_128_aligned((__m128i*)dst);
-
-            unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
-            unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-
-            pix_multiply_2x128 (&xmm_alpha, &xmm_alpha, &xmm_mask_lo, &xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
-            xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
-            xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
-
-            save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-
-            mask += 16;
-            dst += 16;
-            w -= 16;
-        }
-
-        while (w)
-        {
-            m = (uint32_t) *mask++;
-            d = (uint32_t) *dst;
-
-            *dst++ = (uint8_t) pack_1x64_32 (_mm_adds_pu16 (pix_multiply_1x64 (_mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
-                                                                              unpack_32_1x64 (d)));
-            w--;
-        }
+       dst = dst_line;
+       dst_line += dst_stride;
+       mask = mask_line;
+       mask_line += mask_stride;
+       w = width;
+
+       /* call prefetch hint to optimize cache load*/
+       cache_prefetch ((__m128i*)mask);
+       cache_prefetch ((__m128i*)dst);
+
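+       /* ADD with a solid source and an a8 mask:
+        * *dst = saturate (*dst + srca * m / 255).  */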
+       while (w && ((unsigned long)dst & 15))
+       {
+           m = (uint32_t) *mask++;
+           d = (uint32_t) *dst;
+
+           *dst++ = (uint8_t) pack_1x64_32 (
+               _mm_adds_pu16 (
+                   pix_multiply_1x64 (
+                       _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
+                   unpack_32_1x64 (d)));
+           w--;
+       }
+
+       /* call prefetch hint to optimize cache load*/
+       cache_prefetch ((__m128i*)mask);
+       cache_prefetch ((__m128i*)dst);
+
+       while (w >= 16)
+       {
+           /* fill cache line with next memory */
+           cache_prefetch_next ((__m128i*)mask);
+           cache_prefetch_next ((__m128i*)dst);
+
+           xmm_mask = load_128_unaligned ((__m128i*)mask);
+           xmm_dst = load_128_aligned ((__m128i*)dst);
+
+           unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+           unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+           pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
+                               &xmm_mask_lo, &xmm_mask_hi,
+                               &xmm_mask_lo, &xmm_mask_hi);
+
+           xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
+           xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
+
+           save_128_aligned (
+               (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+           mask += 16;
+           dst += 16;
+           w -= 16;
+       }
+
+       while (w)
+       {
+           m = (uint32_t) *mask++;
+           d = (uint32_t) *dst;
+
+           *dst++ = (uint8_t) pack_1x64_32 (
+               _mm_adds_pu16 (
+                   pix_multiply_1x64 (
+                       _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
+                   unpack_32_1x64 (d)));
+
+           w--;
+       }
     }
 
-    _mm_empty();
+    _mm_empty ();
 }
 
-/* -------------------------------------------------------------------------------------------------
- * fast_composite_add_8000_8000
+/* ----------------------------------------------------------------------
+ * composite_add_8000_8000
  */
 
 static void
 sse2_composite_add_8000_8000 (pixman_implementation_t *imp,
-                                pixman_op_t op,
-                               pixman_image_t * src_image,
-                               pixman_image_t * mask_image,
-                               pixman_image_t * dst_image,
-                               int32_t      src_x,
-                               int32_t      src_y,
-                               int32_t      mask_x,
-                               int32_t      mask_y,
-                               int32_t      dest_x,
-                               int32_t      dest_y,
-                               int32_t     width,
-                               int32_t     height)
-{
-    uint8_t    *dst_line, *dst;
-    uint8_t    *src_line, *src;
-    int        dst_stride, src_stride;
-    uint16_t   w;
-    uint16_t   t;
-
-    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+                              pixman_op_t              op,
+                              pixman_image_t *         src_image,
+                              pixman_image_t *         mask_image,
+                              pixman_image_t *         dst_image,
+                              int32_t                  src_x,
+                              int32_t                  src_y,
+                              int32_t                  mask_x,
+                              int32_t                  mask_y,
+                              int32_t                  dest_x,
+                              int32_t                  dest_y,
+                              int32_t                  width,
+                              int32_t                  height)
+{
+    uint8_t     *dst_line, *dst;
+    uint8_t     *src_line, *src;
+    int dst_stride, src_stride;
+    uint16_t w;
+    uint16_t t;
+
+    PIXMAN_IMAGE_GET_LINE (
+       src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
 
     while (height--)
     {
-        dst = dst_line;
-        src = src_line;
-
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)src);
-        cache_prefetch ((__m128i*)dst);
-
-        dst_line += dst_stride;
-        src_line += src_stride;
-        w = width;
-
-        /* Small head */
-        while (w && (unsigned long)dst & 3)
-        {
-            t = (*dst) + (*src++);
-            *dst++ = t | (0 - (t >> 8));
-            w--;
-        }
-
-        core_combine_add_u_sse2 ((uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
-
-        /* Small tail */
-        dst += w & 0xfffc;
-        src += w & 0xfffc;
-
-        w &= 3;
-
-        while (w)
-        {
-            t = (*dst) + (*src++);
-            *dst++ = t | (0 - (t >> 8));
-            w--;
-        }
+       dst = dst_line;
+       src = src_line;
+
+       /* call prefetch hint to optimize cache load*/
+       cache_prefetch ((__m128i*)src);
+       cache_prefetch ((__m128i*)dst);
+
+       dst_line += dst_stride;
+       src_line += src_stride;
+       w = width;
+
+       /* Small head */
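+       /* t | (0 - (t >> 8)) clamps the 16-bit sum to 255: on overflow
+        * t >> 8 is 1, so 0 - 1 sets every bit before the store.  */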
+       while (w && (unsigned long)dst & 3)
+       {
+           t = (*dst) + (*src++);
+           *dst++ = t | (0 - (t >> 8));
+           w--;
+       }
+
+       core_combine_add_u_sse2 ((uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
+
+       /* Small tail */
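+       /* w & 0xfffc rounds w down to a multiple of 4, skipping the
+        * bytes that core_combine_add_u_sse2 already processed.  */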
+       dst += w & 0xfffc;
+       src += w & 0xfffc;
+
+       w &= 3;
+
+       while (w)
+       {
+           t = (*dst) + (*src++);
+           *dst++ = t | (0 - (t >> 8));
+           w--;
+       }
     }
 
-    _mm_empty();
+    _mm_empty ();
 }
 
-/* -------------------------------------------------------------------------------------------------
- * fast_composite_add_8888_8888
+/* ---------------------------------------------------------------------
+ * composite_add_8888_8888
  */
 static void
 sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
-                                pixman_op_t    op,
-                               pixman_image_t *        src_image,
-                               pixman_image_t *        mask_image,
-                               pixman_image_t *         dst_image,
-                               int32_t          src_x,
-                               int32_t      src_y,
-                               int32_t      mask_x,
-                               int32_t      mask_y,
-                               int32_t      dest_x,
-                               int32_t      dest_y,
-                               int32_t     width,
-                               int32_t     height)
-{
-    uint32_t   *dst_line, *dst;
-    uint32_t   *src_line, *src;
-    int        dst_stride, src_stride;
-
-    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+                              pixman_op_t              op,
+                              pixman_image_t *         src_image,
+                              pixman_image_t *         mask_image,
+                              pixman_image_t *         dst_image,
+                              int32_t                  src_x,
+                              int32_t                  src_y,
+                              int32_t                  mask_x,
+                              int32_t                  mask_y,
+                              int32_t                  dest_x,
+                              int32_t                  dest_y,
+                              int32_t                  width,
+                              int32_t                  height)
+{
+    uint32_t    *dst_line, *dst;
+    uint32_t    *src_line, *src;
+    int dst_stride, src_stride;
+
+    PIXMAN_IMAGE_GET_LINE (
+       src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
 
     while (height--)
     {
-        dst = dst_line;
-        dst_line += dst_stride;
-        src = src_line;
-        src_line += src_stride;
+       dst = dst_line;
+       dst_line += dst_stride;
+       src = src_line;
+       src_line += src_stride;
 
-        core_combine_add_u_sse2 (dst, src, NULL, width);
+       core_combine_add_u_sse2 (dst, src, NULL, width);
     }
 
-    _mm_empty();
+    _mm_empty ();
 }
 
 /* -------------------------------------------------------------------------------------------------
@@ -4563,45 +5121,48 @@ sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
 
 static pixman_bool_t
 pixman_blt_sse2 (uint32_t *src_bits,
-              uint32_t *dst_bits,
-              int src_stride,
-              int dst_stride,
-              int src_bpp,
-              int dst_bpp,
-              int src_x, int src_y,
-              int dst_x, int dst_y,
-              int width, int height)
-{
-    uint8_t *  src_bytes;
-    uint8_t *  dst_bytes;
-    int                byte_width;
+                 uint32_t *dst_bits,
+                 int       src_stride,
+                 int       dst_stride,
+                 int       src_bpp,
+                 int       dst_bpp,
+                 int       src_x,
+                 int       src_y,
+                 int       dst_x,
+                 int       dst_y,
+                 int       width,
+                 int       height)
+{
+    uint8_t *   src_bytes;
+    uint8_t *   dst_bytes;
+    int byte_width;
 
     if (src_bpp != dst_bpp)
-        return FALSE;
+       return FALSE;
 
     if (src_bpp == 16)
     {
-        src_stride = src_stride * (int) sizeof (uint32_t) / 2;
-        dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
-        src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
-        dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
-        byte_width = 2 * width;
-        src_stride *= 2;
-        dst_stride *= 2;
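+       /* The strides are given in uint32_t units; convert them to
+        * uint16_t units to address the first pixel, then to bytes.  */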
+       src_stride = src_stride * (int) sizeof (uint32_t) / 2;
+       dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
+       src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
+       dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
+       byte_width = 2 * width;
+       src_stride *= 2;
+       dst_stride *= 2;
     }
     else if (src_bpp == 32)
     {
-        src_stride = src_stride * (int) sizeof (uint32_t) / 4;
-        dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
-        src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
-        dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
-        byte_width = 4 * width;
-        src_stride *= 4;
-        dst_stride *= 4;
+       src_stride = src_stride * (int) sizeof (uint32_t) / 4;
+       dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
+       src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
+       dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
+       byte_width = 4 * width;
+       src_stride *= 4;
+       dst_stride *= 4;
     }
     else
     {
-        return FALSE;
+       return FALSE;
     }
 
     cache_prefetch ((__m128i*)src_bytes);
@@ -4609,345 +5170,357 @@ pixman_blt_sse2 (uint32_t *src_bits,
 
     while (height--)
     {
-        int w;
-        uint8_t *s = src_bytes;
-        uint8_t *d = dst_bytes;
-        src_bytes += src_stride;
-        dst_bytes += dst_stride;
-        w = byte_width;
-
-        cache_prefetch_next ((__m128i*)s);
-        cache_prefetch_next ((__m128i*)d);
-
-        while (w >= 2 && ((unsigned long)d & 3))
-        {
-            *(uint16_t *)d = *(uint16_t *)s;
-            w -= 2;
-            s += 2;
-            d += 2;
-        }
-
-        while (w >= 4 && ((unsigned long)d & 15))
-        {
-            *(uint32_t *)d = *(uint32_t *)s;
-
-            w -= 4;
-            s += 4;
-            d += 4;
-        }
-
-        cache_prefetch_next ((__m128i*)s);
-        cache_prefetch_next ((__m128i*)d);
-
-        while (w >= 64)
-        {
-            __m128i xmm0, xmm1, xmm2, xmm3;
-
-            /* 128 bytes ahead */
-            cache_prefetch (((__m128i*)s) + 8);
-            cache_prefetch (((__m128i*)d) + 8);
-
-            xmm0 = load_128_unaligned ((__m128i*)(s));
-            xmm1 = load_128_unaligned ((__m128i*)(s+16));
-            xmm2 = load_128_unaligned ((__m128i*)(s+32));
-            xmm3 = load_128_unaligned ((__m128i*)(s+48));
-
-            save_128_aligned ((__m128i*)(d),    xmm0);
-            save_128_aligned ((__m128i*)(d+16), xmm1);
-            save_128_aligned ((__m128i*)(d+32), xmm2);
-            save_128_aligned ((__m128i*)(d+48), xmm3);
-
-            s += 64;
-            d += 64;
-            w -= 64;
-        }
-
-        cache_prefetch_next ((__m128i*)s);
-        cache_prefetch_next ((__m128i*)d);
-
-        while (w >= 16)
-        {
-            save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
-
-            w -= 16;
-            d += 16;
-            s += 16;
-        }
-
-        cache_prefetch_next ((__m128i*)s);
-        cache_prefetch_next ((__m128i*)d);
-
-        while (w >= 4)
-        {
-            *(uint32_t *)d = *(uint32_t *)s;
-
-            w -= 4;
-            s += 4;
-            d += 4;
-        }
-
-        if (w >= 2)
-        {
-            *(uint16_t *)d = *(uint16_t *)s;
-            w -= 2;
-            s += 2;
-            d += 2;
-        }
+       int w;
+       uint8_t *s = src_bytes;
+       uint8_t *d = dst_bytes;
+       src_bytes += src_stride;
+       dst_bytes += dst_stride;
+       w = byte_width;
+
+       cache_prefetch_next ((__m128i*)s);
+       cache_prefetch_next ((__m128i*)d);
+
+       while (w >= 2 && ((unsigned long)d & 3))
+       {
+           *(uint16_t *)d = *(uint16_t *)s;
+           w -= 2;
+           s += 2;
+           d += 2;
+       }
+
+       while (w >= 4 && ((unsigned long)d & 15))
+       {
+           *(uint32_t *)d = *(uint32_t *)s;
+
+           w -= 4;
+           s += 4;
+           d += 4;
+       }
+
+       cache_prefetch_next ((__m128i*)s);
+       cache_prefetch_next ((__m128i*)d);
+
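+       /* The loops above aligned d to 16 bytes, so the stores below can
+        * be aligned; s may still be unaligned, hence the unaligned
+        * loads.  */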
+       while (w >= 64)
+       {
+           __m128i xmm0, xmm1, xmm2, xmm3;
+
+           /* 128 bytes ahead */
+           cache_prefetch (((__m128i*)s) + 8);
+           cache_prefetch (((__m128i*)d) + 8);
+
+           xmm0 = load_128_unaligned ((__m128i*)(s));
+           xmm1 = load_128_unaligned ((__m128i*)(s + 16));
+           xmm2 = load_128_unaligned ((__m128i*)(s + 32));
+           xmm3 = load_128_unaligned ((__m128i*)(s + 48));
+
+           save_128_aligned ((__m128i*)(d),      xmm0);
+           save_128_aligned ((__m128i*)(d + 16), xmm1);
+           save_128_aligned ((__m128i*)(d + 32), xmm2);
+           save_128_aligned ((__m128i*)(d + 48), xmm3);
+
+           s += 64;
+           d += 64;
+           w -= 64;
+       }
+
+       cache_prefetch_next ((__m128i*)s);
+       cache_prefetch_next ((__m128i*)d);
+
+       while (w >= 16)
+       {
+           save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s));
+
+           w -= 16;
+           d += 16;
+           s += 16;
+       }
+
+       cache_prefetch_next ((__m128i*)s);
+       cache_prefetch_next ((__m128i*)d);
+
+       while (w >= 4)
+       {
+           *(uint32_t *)d = *(uint32_t *)s;
+
+           w -= 4;
+           s += 4;
+           d += 4;
+       }
+
+       if (w >= 2)
+       {
+           *(uint16_t *)d = *(uint16_t *)s;
+           w -= 2;
+           s += 2;
+           d += 2;
+       }
     }
 
-    _mm_empty();
+    _mm_empty ();
 
     return TRUE;
 }
 
 static void
 sse2_composite_copy_area (pixman_implementation_t *imp,
-                        pixman_op_t       op,
-                       pixman_image_t *        src_image,
-                       pixman_image_t *        mask_image,
-                       pixman_image_t *        dst_image,
-                       int32_t         src_x,
-                       int32_t         src_y,
-                       int32_t         mask_x,
-                       int32_t         mask_y,
-                       int32_t         dest_x,
-                       int32_t         dest_y,
-                       int32_t         width,
-                       int32_t         height)
+                          pixman_op_t              op,
+                          pixman_image_t *         src_image,
+                          pixman_image_t *         mask_image,
+                          pixman_image_t *         dst_image,
+                          int32_t                  src_x,
+                          int32_t                  src_y,
+                          int32_t                  mask_x,
+                          int32_t                  mask_y,
+                          int32_t                  dest_x,
+                          int32_t                  dest_y,
+                          int32_t                  width,
+                          int32_t                  height)
 {
     pixman_blt_sse2 (src_image->bits.bits,
-                   dst_image->bits.bits,
-                   src_image->bits.rowstride,
-                   dst_image->bits.rowstride,
-                   PIXMAN_FORMAT_BPP (src_image->bits.format),
-                   PIXMAN_FORMAT_BPP (dst_image->bits.format),
-                   src_x, src_y, dest_x, dest_y, width, height);
+                     dst_image->bits.bits,
+                     src_image->bits.rowstride,
+                     dst_image->bits.rowstride,
+                     PIXMAN_FORMAT_BPP (src_image->bits.format),
+                     PIXMAN_FORMAT_BPP (dst_image->bits.format),
+                     src_x, src_y, dest_x, dest_y, width, height);
 }
 
 #if 0
 /* This code are buggy in MMX version, now the bug was translated to SSE2 version */
 void
 sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
-                                pixman_op_t      op,
-                               pixman_image_t * src_image,
-                               pixman_image_t * mask_image,
-                               pixman_image_t * dst_image,
-                               int32_t      src_x,
-                               int32_t      src_y,
-                               int32_t      mask_x,
-                               int32_t      mask_y,
-                               int32_t      dest_x,
-                               int32_t      dest_y,
-                               int32_t     width,
-                               int32_t     height)
-{
-    uint32_t   *src, *src_line, s;
+                                 pixman_op_t              op,
+                                 pixman_image_t *         src_image,
+                                 pixman_image_t *         mask_image,
+                                 pixman_image_t *         dst_image,
+                                 int32_t                  src_x,
+                                 int32_t                  src_y,
+                                 int32_t                  mask_x,
+                                 int32_t                  mask_y,
+                                 int32_t                  dest_x,
+                                 int32_t                  dest_y,
+                                 int32_t                  width,
+                                 int32_t                  height)
+{
+    uint32_t    *src, *src_line, s;
     uint32_t    *dst, *dst_line, d;
-    uint8_t        *mask, *mask_line;
-    uint32_t    m;
-    int                 src_stride, mask_stride, dst_stride;
+    uint8_t         *mask, *mask_line;
+    uint32_t m;
+    int src_stride, mask_stride, dst_stride;
     uint16_t w;
 
     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
 
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
 
     while (height--)
     {
-        src = src_line;
-        src_line += src_stride;
-        dst = dst_line;
-        dst_line += dst_stride;
-        mask = mask_line;
-        mask_line += mask_stride;
-
-        w = width;
-
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)src);
-        cache_prefetch ((__m128i*)dst);
-        cache_prefetch ((__m128i*)mask);
-
-        while (w && (unsigned long)dst & 15)
-        {
-            s = 0xff000000 | *src++;
-            m = (uint32_t) *mask++;
-            d = *dst;
-
-            __m64 ms = unpack_32_1x64 (s);
-
-            if (m != 0xff)
-            {
-                ms = in_over_1x64 (ms,
-                                  mask_x00ff,
-                                  expand_alpha_rev_1x64 (unpack_32_1x64 (m)),
-                                  unpack_32_1x64 (d));
-            }
-
-            *dst++ = pack_1x64_32 (ms);
-            w--;
-        }
-
-        /* call prefetch hint to optimize cache load*/
-        cache_prefetch ((__m128i*)src);
-        cache_prefetch ((__m128i*)dst);
-        cache_prefetch ((__m128i*)mask);
-
-        while (w >= 4)
-        {
-            /* fill cache line with next memory */
-            cache_prefetch_next ((__m128i*)src);
-            cache_prefetch_next ((__m128i*)dst);
-            cache_prefetch_next ((__m128i*)mask);
-
-            m = *(uint32_t*) mask;
-            xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);
-
-            if (m == 0xffffffff)
-            {
-                save_128_aligned ((__m128i*)dst, xmm_src);
-            }
-            else
-            {
-                xmm_dst = load_128_aligned ((__m128i*)dst);
-
-                xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
-
-                unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
-                unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
-                unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-
-                expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
-                in_over_2x128 (xmm_src_lo, xmm_src_hi, mask_00ff, mask_00ff, xmm_mask_lo, xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
-
-                save_128_aligned( (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-            }
-
-            src += 4;
-            dst += 4;
-            mask += 4;
-            w -= 4;
-        }
-
-        while (w)
-        {
-            m = (uint32_t) *mask++;
-
-            if (m)
-            {
-                s = 0xff000000 | *src;
-
-                if (m == 0xff)
-                {
-                    *dst = s;
-                }
-                else
-                {
-                    d = *dst;
-
-                    *dst = pack_1x64_32 (in_over_1x64 (unpack_32_1x64 (s),
-                                                      mask_x00ff,
-                                                      expand_alpha_rev_1x64 (unpack_32_1x64 (m)),
-                                                      unpack_32_1x64 (d)));
-                }
-
-            }
-
-            src++;
-            dst++;
-            w--;
-        }
+       src = src_line;
+       src_line += src_stride;
+       dst = dst_line;
+       dst_line += dst_stride;
+       mask = mask_line;
+       mask_line += mask_stride;
+
+       w = width;
+
+       /* call prefetch hint to optimize cache load*/
+       cache_prefetch ((__m128i*)src);
+       cache_prefetch ((__m128i*)dst);
+       cache_prefetch ((__m128i*)mask);
+
+       while (w && (unsigned long)dst & 15)
+       {
+           s = 0xff000000 | *src++;
+           m = (uint32_t) *mask++;
+           d = *dst;
+
+           __m64 ms = unpack_32_1x64 (s);
+
+           if (m != 0xff)
+           {
+               ms = in_over_1x64 (ms,
+                                  mask_x00ff,
+                                  expand_alpha_rev_1x64 (unpack_32_1x64 (m)),
+                                  unpack_32_1x64 (d));
+           }
+
+           *dst++ = pack_1x64_32 (ms);
+           w--;
+       }
+
+       /* call prefetch hint to optimize cache load*/
+       cache_prefetch ((__m128i*)src);
+       cache_prefetch ((__m128i*)dst);
+       cache_prefetch ((__m128i*)mask);
+
+       while (w >= 4)
+       {
+           /* fill cache line with next memory */
+           cache_prefetch_next ((__m128i*)src);
+           cache_prefetch_next ((__m128i*)dst);
+           cache_prefetch_next ((__m128i*)mask);
+
+           m = *(uint32_t*) mask;
+           xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);
+
+           if (m == 0xffffffff)
+           {
+               save_128_aligned ((__m128i*)dst, xmm_src);
+           }
+           else
+           {
+               xmm_dst = load_128_aligned ((__m128i*)dst);
+
+               xmm_mask = _mm_unpacklo_epi16 (
+                   unpack_32_1x128 (m), _mm_setzero_si128 ());
+
+               unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+               unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+               unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+               expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
+                                       &xmm_mask_lo, &xmm_mask_hi);
+
+               in_over_2x128 (xmm_src_lo, xmm_src_hi,
+                              mask_00ff, mask_00ff,
+                              xmm_mask_lo, xmm_mask_hi,
+                              &xmm_dst_lo, &xmm_dst_hi);
+
+               save_128_aligned (
+                   (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+           }
+
+           src += 4;
+           dst += 4;
+           mask += 4;
+           w -= 4;
+       }
+
+       while (w)
+       {
+           m = (uint32_t) *mask++;
+
+           if (m)
+           {
+               s = 0xff000000 | *src;
+
+               if (m == 0xff)
+               {
+                   *dst = s;
+               }
+               else
+               {
+                   d = *dst;
+
+                   *dst = pack_1x64_32 (
+                       in_over_1x64 (
+                           unpack_32_1x64 (s),
+                           mask_x00ff,
+                           expand_alpha_rev_1x64 (unpack_32_1x64 (m)),
+                           unpack_32_1x64 (d)));
+               }
+
+           }
+
+           src++;
+           dst++;
+           w--;
+       }
     }
 
-    _mm_empty();
+    _mm_empty ();
 }
+
 #endif
 
 static const pixman_fast_path_t sse2_fast_paths[] =
 {
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_r5g6b5,   sse2_composite_over_n_8_0565,     0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_b5g6r5,   sse2_composite_over_n_8_0565,     0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_null,     PIXMAN_a8r8g8b8, sse2_composite_over_n_8888,           0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_null,     PIXMAN_x8r8g8b8, sse2_composite_over_n_8888,           0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_null,     PIXMAN_r5g6b5,   sse2_composite_over_n_0565,           0 },
-    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_a8r8g8b8, sse2_composite_over_8888_8888,          0 },
-    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_x8r8g8b8, sse2_composite_over_8888_8888,          0 },
-    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_a8b8g8r8, sse2_composite_over_8888_8888,          0 },
-    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_x8b8g8r8, sse2_composite_over_8888_8888,          0 },
-    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_r5g6b5,   sse2_composite_over_8888_0565,          0 },
-    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_b5g6r5,   sse2_composite_over_8888_0565,          0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8r8g8b8, sse2_composite_over_n_8_8888,     0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8r8g8b8, sse2_composite_over_n_8_8888,     0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8b8g8r8, sse2_composite_over_n_8_8888,     0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8b8g8r8, sse2_composite_over_n_8_8888,     0 },
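+    /* Each entry: { operator, src format, mask format, dest format,
+     *               fast path function, flags } */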
+    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_r5g6b5,   sse2_composite_over_n_8_0565,       0 },
+    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_b5g6r5,   sse2_composite_over_n_8_0565,       0 },
+    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_null,     PIXMAN_a8r8g8b8, sse2_composite_over_n_8888,         0 },
+    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_null,     PIXMAN_x8r8g8b8, sse2_composite_over_n_8888,         0 },
+    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_null,     PIXMAN_r5g6b5,   sse2_composite_over_n_0565,         0 },
+    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_a8r8g8b8, sse2_composite_over_8888_8888,      0 },
+    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_x8r8g8b8, sse2_composite_over_8888_8888,      0 },
+    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_a8b8g8r8, sse2_composite_over_8888_8888,      0 },
+    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_x8b8g8r8, sse2_composite_over_8888_8888,      0 },
+    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_r5g6b5,   sse2_composite_over_8888_0565,      0 },
+    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_b5g6r5,   sse2_composite_over_8888_0565,      0 },
+    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8r8g8b8, sse2_composite_over_n_8_8888,       0 },
+    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8r8g8b8, sse2_composite_over_n_8_8888,       0 },
+    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8b8g8r8, sse2_composite_over_n_8_8888,       0 },
+    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8b8g8r8, sse2_composite_over_n_8_8888,       0 },
 #if 0
     /* FIXME: This code are buggy in MMX version, now the bug was translated to SSE2 version */
-    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8,       PIXMAN_x8r8g8b8, sse2_composite_over_x888_8_8888,       0 },
-    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8,       PIXMAN_a8r8g8b8, sse2_composite_over_x888_8_8888,       0 },
-    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8,       PIXMAN_x8b8g8r8, sse2_composite_over_x888_8_8888,       0 },
-    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8,       PIXMAN_a8r8g8b8, sse2_composite_over_x888_8_8888,       0 },
+    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8,       PIXMAN_x8r8g8b8, sse2_composite_over_x888_8_8888,    0 },
+    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8,       PIXMAN_a8r8g8b8, sse2_composite_over_x888_8_8888,    0 },
+    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8,       PIXMAN_x8b8g8r8, sse2_composite_over_x888_8_8888,    0 },
+    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8,       PIXMAN_a8r8g8b8, sse2_composite_over_x888_8_8888,    0 },
 #endif
-    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8,       PIXMAN_a8r8g8b8, sse2_composite_over_x888_n_8888,        NEED_SOLID_MASK },
-    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8,       PIXMAN_x8r8g8b8, sse2_composite_over_x888_n_8888,        NEED_SOLID_MASK },
-    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8,       PIXMAN_a8b8g8r8, sse2_composite_over_x888_n_8888,        NEED_SOLID_MASK },
-    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8,       PIXMAN_x8b8g8r8, sse2_composite_over_x888_n_8888,        NEED_SOLID_MASK },
-    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,       PIXMAN_a8r8g8b8, sse2_composite_over_8888_n_8888,        NEED_SOLID_MASK },
-    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,       PIXMAN_x8r8g8b8, sse2_composite_over_8888_n_8888,        NEED_SOLID_MASK },
-    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8,       PIXMAN_a8b8g8r8, sse2_composite_over_8888_n_8888,        NEED_SOLID_MASK },
-    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8,       PIXMAN_x8b8g8r8, sse2_composite_over_8888_n_8888,        NEED_SOLID_MASK },
+    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8,       PIXMAN_a8r8g8b8, sse2_composite_over_x888_n_8888,    NEED_SOLID_MASK },
+    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8,       PIXMAN_x8r8g8b8, sse2_composite_over_x888_n_8888,    NEED_SOLID_MASK },
+    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8,       PIXMAN_a8b8g8r8, sse2_composite_over_x888_n_8888,    NEED_SOLID_MASK },
+    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8,       PIXMAN_x8b8g8r8, sse2_composite_over_x888_n_8888,    NEED_SOLID_MASK },
+    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,       PIXMAN_a8r8g8b8, sse2_composite_over_8888_n_8888,    NEED_SOLID_MASK },
+    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,       PIXMAN_x8r8g8b8, sse2_composite_over_8888_n_8888,    NEED_SOLID_MASK },
+    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8,       PIXMAN_a8b8g8r8, sse2_composite_over_8888_n_8888,    NEED_SOLID_MASK },
+    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8,       PIXMAN_x8b8g8r8, sse2_composite_over_8888_n_8888,    NEED_SOLID_MASK },
     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8r8g8b8, PIXMAN_r5g6b5,   sse2_composite_over_n_8888_0565_ca, NEED_COMPONENT_ALPHA },
     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8b8g8r8, PIXMAN_b5g6r5,   sse2_composite_over_n_8888_0565_ca, NEED_COMPONENT_ALPHA },
-    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_over_pixbuf_8888,     NEED_PIXBUF },
-    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_a8r8g8b8, sse2_composite_over_pixbuf_8888,     NEED_PIXBUF },
-    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, sse2_composite_over_pixbuf_8888,     NEED_PIXBUF },
-    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_x8r8g8b8, sse2_composite_over_pixbuf_8888,     NEED_PIXBUF },
-    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_a8b8g8r8, sse2_composite_over_pixbuf_8888,     NEED_PIXBUF },
-    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, sse2_composite_over_pixbuf_8888,     NEED_PIXBUF },
-    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_x8b8g8r8, sse2_composite_over_pixbuf_8888,     NEED_PIXBUF },
-    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, sse2_composite_over_pixbuf_8888,     NEED_PIXBUF },
-    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_r5g6b5,   sse2_composite_over_pixbuf_0565,     NEED_PIXBUF },
-    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_r5g6b5,   sse2_composite_over_pixbuf_0565,     NEED_PIXBUF },
-    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_b5g6r5,   sse2_composite_over_pixbuf_0565,     NEED_PIXBUF },
-    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_b5g6r5,   sse2_composite_over_pixbuf_0565,     NEED_PIXBUF },
-    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_null,     PIXMAN_x8r8g8b8, sse2_composite_copy_area,               0 },
-    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_null,     PIXMAN_x8b8g8r8, sse2_composite_copy_area,               0 },
+    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
+    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_a8r8g8b8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
+    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
+    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_x8r8g8b8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
+    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_a8b8g8r8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
+    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
+    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_x8b8g8r8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
+    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
+    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_r5g6b5,   sse2_composite_over_pixbuf_0565,    NEED_PIXBUF },
+    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_r5g6b5,   sse2_composite_over_pixbuf_0565,    NEED_PIXBUF },
+    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_b5g6r5,   sse2_composite_over_pixbuf_0565,    NEED_PIXBUF },
+    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_b5g6r5,   sse2_composite_over_pixbuf_0565,    NEED_PIXBUF },
+    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_null,     PIXMAN_x8r8g8b8, sse2_composite_copy_area,           0 },
+    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_null,     PIXMAN_x8b8g8r8, sse2_composite_copy_area,           0 },
 
     { PIXMAN_OP_ADD,  PIXMAN_a8,       PIXMAN_null,     PIXMAN_a8,       sse2_composite_add_8000_8000,       0 },
     { PIXMAN_OP_ADD,  PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_a8r8g8b8, sse2_composite_add_8888_8888,       0 },
     { PIXMAN_OP_ADD,  PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_a8b8g8r8, sse2_composite_add_8888_8888,       0 },
     { PIXMAN_OP_ADD,  PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8,       sse2_composite_add_8888_8_8,        0 },
 
-    { PIXMAN_OP_SRC, PIXMAN_solid,     PIXMAN_a8,       PIXMAN_a8r8g8b8, sse2_composite_src_n_8_8888,  0 },
-    { PIXMAN_OP_SRC, PIXMAN_solid,     PIXMAN_a8,       PIXMAN_x8r8g8b8, sse2_composite_src_n_8_8888,  0 },
-    { PIXMAN_OP_SRC, PIXMAN_solid,     PIXMAN_a8,       PIXMAN_a8b8g8r8, sse2_composite_src_n_8_8888,  0 },
-    { PIXMAN_OP_SRC, PIXMAN_solid,     PIXMAN_a8,       PIXMAN_x8b8g8r8, sse2_composite_src_n_8_8888,  0 },
-    { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8,  PIXMAN_null,     PIXMAN_a8r8g8b8, sse2_composite_copy_area,               0 },
-    { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8,  PIXMAN_null,     PIXMAN_a8b8g8r8, sse2_composite_copy_area,               0 },
-    { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8,  PIXMAN_null,     PIXMAN_x8r8g8b8, sse2_composite_copy_area,             0 },
-    { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8,  PIXMAN_null,    PIXMAN_x8b8g8r8, sse2_composite_copy_area,              0 },
-    { PIXMAN_OP_SRC, PIXMAN_x8r8g8b8,  PIXMAN_null,     PIXMAN_x8r8g8b8, sse2_composite_copy_area,               0 },
-    { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8,  PIXMAN_null,     PIXMAN_x8b8g8r8, sse2_composite_copy_area,               0 },
-    { PIXMAN_OP_SRC, PIXMAN_r5g6b5,    PIXMAN_null,     PIXMAN_r5g6b5,   sse2_composite_copy_area,               0 },
-    { PIXMAN_OP_SRC, PIXMAN_b5g6r5,    PIXMAN_null,     PIXMAN_b5g6r5,   sse2_composite_copy_area,               0 },
-
-    { PIXMAN_OP_IN,  PIXMAN_a8,        PIXMAN_null,     PIXMAN_a8,       sse2_composite_in_8_8,                 0 },
-    { PIXMAN_OP_IN,  PIXMAN_solid,     PIXMAN_a8,       PIXMAN_a8,       sse2_composite_in_n_8_8,               0 },
+    { PIXMAN_OP_SRC, PIXMAN_solid,     PIXMAN_a8,       PIXMAN_a8r8g8b8, sse2_composite_src_n_8_8888,        0 },
+    { PIXMAN_OP_SRC, PIXMAN_solid,     PIXMAN_a8,       PIXMAN_x8r8g8b8, sse2_composite_src_n_8_8888,        0 },
+    { PIXMAN_OP_SRC, PIXMAN_solid,     PIXMAN_a8,       PIXMAN_a8b8g8r8, sse2_composite_src_n_8_8888,        0 },
+    { PIXMAN_OP_SRC, PIXMAN_solid,     PIXMAN_a8,       PIXMAN_x8b8g8r8, sse2_composite_src_n_8_8888,        0 },
+    { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8,  PIXMAN_null,     PIXMAN_a8r8g8b8, sse2_composite_copy_area,           0 },
+    { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8,  PIXMAN_null,     PIXMAN_a8b8g8r8, sse2_composite_copy_area,           0 },
+    { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8,  PIXMAN_null,     PIXMAN_x8r8g8b8, sse2_composite_copy_area,           0 },
+    { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8,  PIXMAN_null,     PIXMAN_x8b8g8r8, sse2_composite_copy_area,           0 },
+    { PIXMAN_OP_SRC, PIXMAN_x8r8g8b8,  PIXMAN_null,     PIXMAN_x8r8g8b8, sse2_composite_copy_area,           0 },
+    { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8,  PIXMAN_null,     PIXMAN_x8b8g8r8, sse2_composite_copy_area,           0 },
+    { PIXMAN_OP_SRC, PIXMAN_r5g6b5,    PIXMAN_null,     PIXMAN_r5g6b5,   sse2_composite_copy_area,           0 },
+    { PIXMAN_OP_SRC, PIXMAN_b5g6r5,    PIXMAN_null,     PIXMAN_b5g6r5,   sse2_composite_copy_area,           0 },
+
+    { PIXMAN_OP_IN,  PIXMAN_a8,        PIXMAN_null,     PIXMAN_a8,       sse2_composite_in_8_8,              0 },
+    { PIXMAN_OP_IN,  PIXMAN_solid,     PIXMAN_a8,       PIXMAN_a8,       sse2_composite_in_n_8_8,            0 },
 
     { PIXMAN_OP_NONE },
 };
 
 /*
  * Work around GCC bug causing crashes in Mozilla with SSE2
- * 
+ *
  * When using -msse, gcc generates movdqa instructions assuming that
  * the stack is 16 byte aligned. Unfortunately some applications, such
  * as Mozilla and Mono, end up aligning the stack to 4 bytes, which
@@ -4966,35 +5539,35 @@ __attribute__((__force_align_arg_pointer__))
 #endif
 static void
 sse2_composite (pixman_implementation_t *imp,
-               pixman_op_t     op,
-               pixman_image_t *src,
-               pixman_image_t *mask,
-               pixman_image_t *dest,
-               int32_t         src_x,
-               int32_t         src_y,
-               int32_t         mask_x,
-               int32_t         mask_y,
-               int32_t         dest_x,
-               int32_t         dest_y,
-               int32_t        width,
-               int32_t        height)
+                pixman_op_t              op,
+                pixman_image_t *         src,
+                pixman_image_t *         mask,
+                pixman_image_t *         dest,
+                int32_t                  src_x,
+                int32_t                  src_y,
+                int32_t                  mask_x,
+                int32_t                  mask_y,
+                int32_t                  dest_x,
+                int32_t                  dest_y,
+                int32_t                  width,
+                int32_t                  height)
 {
     if (_pixman_run_fast_path (sse2_fast_paths, imp,
-                              op, src, mask, dest,
-                              src_x, src_y,
-                              mask_x, mask_y,
-                              dest_x, dest_y,
-                              width, height))
+                               op, src, mask, dest,
+                               src_x, src_y,
+                               mask_x, mask_y,
+                               dest_x, dest_y,
+                               width, height))
     {
        return;
     }
 
     _pixman_implementation_composite (imp->delegate, op,
-                                     src, mask, dest,
-                                     src_x, src_y,
-                                     mask_x, mask_y,
-                                     dest_x, dest_y,
-                                     width, height);
+                                      src, mask, dest,
+                                      src_x, src_y,
+                                      mask_x, mask_y,
+                                      dest_x, dest_y,
+                                      width, height);
 }
 
 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
@@ -5002,19 +5575,22 @@ __attribute__((__force_align_arg_pointer__))
 #endif
 static pixman_bool_t
 sse2_blt (pixman_implementation_t *imp,
-         uint32_t *src_bits,
-         uint32_t *dst_bits,
-         int src_stride,
-         int dst_stride,
-         int src_bpp,
-         int dst_bpp,
-         int src_x, int src_y,
-         int dst_x, int dst_y,
-         int width, int height)
+          uint32_t *               src_bits,
+          uint32_t *               dst_bits,
+          int                      src_stride,
+          int                      dst_stride,
+          int                      src_bpp,
+          int                      dst_bpp,
+          int                      src_x,
+          int                      src_y,
+          int                      dst_x,
+          int                      dst_y,
+          int                      width,
+          int                      height)
 {
     if (!pixman_blt_sse2 (
-           src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
-           src_x, src_y, dst_x, dst_y, width, height))
+            src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
+            src_x, src_y, dst_x, dst_y, width, height))
 
     {
        return _pixman_implementation_blt (
@@ -5031,14 +5607,14 @@ __attribute__((__force_align_arg_pointer__))
 #endif
 static pixman_bool_t
 sse2_fill (pixman_implementation_t *imp,
-          uint32_t *bits,
-          int stride,
-          int bpp,
-          int x,
-          int y,
-          int width,
-          int height,
-          uint32_t xor)
+           uint32_t *               bits,
+           int                      stride,
+           int                      bpp,
+           int                      x,
+           int                      y,
+           int                      width,
+           int                      height,
+           uint32_t                 xor)
 {
     if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
     {
@@ -5071,20 +5647,20 @@ _pixman_implementation_create_sse2 (void)
     mask_ffff = create_mask_16_128 (0xffff);
     mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
     mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
-    
+
     /* MMX constants */
     mask_x565_rgb = create_mask_2x32_64 (0x000001f0, 0x003f001f);
     mask_x565_unpack = create_mask_2x32_64 (0x00000084, 0x04100840);
-    
+
     mask_x0080 = create_mask_16_64 (0x0080);
     mask_x00ff = create_mask_16_64 (0x00ff);
     mask_x0101 = create_mask_16_64 (0x0101);
     mask_x_alpha = create_mask_2x32_64 (0x00ff0000, 0x00000000);
 
-    _mm_empty();
+    _mm_empty ();
 
     /* Set up function pointers */
-    
+
     /* SSE code patch for fbcompose.c */
     imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
     imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
@@ -5096,9 +5672,9 @@ _pixman_implementation_create_sse2 (void)
     imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
     imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
     imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
-    
+
     imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
-    
+
     imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
     imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
     imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
@@ -5110,11 +5686,11 @@ _pixman_implementation_create_sse2 (void)
     imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
     imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
     imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
-    
+
     imp->composite = sse2_composite;
     imp->blt = sse2_blt;
     imp->fill = sse2_fill;
-    
+
     return imp;
 }