[sse2] Bit-reversing typo: src != dst
[profile/ivi/pixman.git] / pixman / pixman-sse2.c
1 /*
2  * Copyright © 2008 Rodrigo Kumpera
3  * Copyright © 2008 André Tupinambá
4  *
5  * Permission to use, copy, modify, distribute, and sell this software and its
6  * documentation for any purpose is hereby granted without fee, provided that
7  * the above copyright notice appear in all copies and that both that
8  * copyright notice and this permission notice appear in supporting
9  * documentation, and that the name of Red Hat not be used in advertising or
10  * publicity pertaining to distribution of the software without specific,
11  * written prior permission.  Red Hat makes no representations about the
12  * suitability of this software for any purpose.  It is provided "as is"
13  * without express or implied warranty.
14  *
15  * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
16  * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
18  * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
20  * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
21  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
22  * SOFTWARE.
23  *
24  * Author:  Rodrigo Kumpera (kumpera@gmail.com)
25  *          André Tupinambá (andrelrt@gmail.com)
26  *
27  * Based on work by Owen Taylor and Søren Sandmann
28  */
29 #ifdef HAVE_CONFIG_H
30 #include <config.h>
31 #endif
32
33 #include <mmintrin.h>
34 #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
35 #include <emmintrin.h> /* for SSE2 intrinsics */
36 #include "pixman-private.h"
37 #include "pixman-combine32.h"
38
39 #if defined(_MSC_VER) && defined(_M_AMD64)
40 /* Windows 64 doesn't allow MMX to be used, so
41  * the pixman-x64-mmx-emulation.h file contains
42  * implementations of those MMX intrinsics that
43  * are used in the SSE2 implementation.
44  */
45 #   include "pixman-x64-mmx-emulation.h"
46 #endif
47
48 #ifdef USE_SSE2
49
50 /* --------------------------------------------------------------------
51  * Locals
52  */
53
54 static __m64 mask_x0080;
55 static __m64 mask_x00ff;
56 static __m64 mask_x0101;
57 static __m64 mask_x_alpha;
58
59 static __m64 mask_x565_rgb;
60 static __m64 mask_x565_unpack;
61
62 static __m128i mask_0080;
63 static __m128i mask_00ff;
64 static __m128i mask_0101;
65 static __m128i mask_ffff;
66 static __m128i mask_ff000000;
67 static __m128i mask_alpha;
68
69 static __m128i mask_565_r;
70 static __m128i mask_565_g1, mask_565_g2;
71 static __m128i mask_565_b;
72 static __m128i mask_red;
73 static __m128i mask_green;
74 static __m128i mask_blue;
75
76 static __m128i mask_565_fix_rb;
77 static __m128i mask_565_fix_g;
78
79 /* ----------------------------------------------------------------------
80  * SSE2 Inlines
81  */
82 static force_inline __m128i
83 unpack_32_1x128 (uint32_t data)
84 {
85     return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
86 }
87
88 static force_inline void
89 unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
90 {
91     *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
92     *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
93 }
94
95 static force_inline __m128i
96 unpack_565_to_8888 (__m128i lo)
97 {
98     __m128i r, g, b, rb, t;
99
100     r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
101     g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
102     b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
103
104     rb = _mm_or_si128 (r, b);
105     t  = _mm_and_si128 (rb, mask_565_fix_rb);
106     t  = _mm_srli_epi32 (t, 5);
107     rb = _mm_or_si128 (rb, t);
108
109     t  = _mm_and_si128 (g, mask_565_fix_g);
110     t  = _mm_srli_epi32 (t, 6);
111     g  = _mm_or_si128 (g, t);
112
113     return _mm_or_si128 (rb, g);
114 }
115
116 static force_inline void
117 unpack_565_128_4x128 (__m128i  data,
118                       __m128i* data0,
119                       __m128i* data1,
120                       __m128i* data2,
121                       __m128i* data3)
122 {
123     __m128i lo, hi;
124
125     lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
126     hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
127
128     lo = unpack_565_to_8888 (lo);
129     hi = unpack_565_to_8888 (hi);
130
131     unpack_128_2x128 (lo, data0, data1);
132     unpack_128_2x128 (hi, data2, data3);
133 }
134
135 static force_inline uint16_t
136 pack_565_32_16 (uint32_t pixel)
137 {
138     return (uint16_t) (((pixel >> 8) & 0xf800) |
139                        ((pixel >> 5) & 0x07e0) |
140                        ((pixel >> 3) & 0x001f));
141 }
142
143 static force_inline __m128i
144 pack_2x128_128 (__m128i lo, __m128i hi)
145 {
146     return _mm_packus_epi16 (lo, hi);
147 }
148
149 static force_inline __m128i
150 pack_565_2x128_128 (__m128i lo, __m128i hi)
151 {
152     __m128i data;
153     __m128i r, g1, g2, b;
154
155     data = pack_2x128_128 (lo, hi);
156
157     r  = _mm_and_si128 (data, mask_565_r);
158     g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
159     g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
160     b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);
161
162     return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
163 }
164
165 static force_inline __m128i
166 pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
167 {
168     return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
169                              pack_565_2x128_128 (*xmm2, *xmm3));
170 }
171
172 static force_inline int
173 is_opaque (__m128i x)
174 {
175     __m128i ffs = _mm_cmpeq_epi8 (x, x);
176
177     return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
178 }
179
180 static force_inline int
181 is_zero (__m128i x)
182 {
183     return _mm_movemask_epi8 (
184         _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
185 }
186
187 static force_inline int
188 is_transparent (__m128i x)
189 {
190     return (_mm_movemask_epi8 (
191                 _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
192 }
193
194 static force_inline __m128i
195 expand_pixel_32_1x128 (uint32_t data)
196 {
197     return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
198 }
199
200 static force_inline __m128i
201 expand_alpha_1x128 (__m128i data)
202 {
203     return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
204                                                      _MM_SHUFFLE (3, 3, 3, 3)),
205                                 _MM_SHUFFLE (3, 3, 3, 3));
206 }
207
208 static force_inline void
209 expand_alpha_2x128 (__m128i  data_lo,
210                     __m128i  data_hi,
211                     __m128i* alpha_lo,
212                     __m128i* alpha_hi)
213 {
214     __m128i lo, hi;
215
216     lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
217     hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
218
219     *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
220     *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
221 }
222
223 static force_inline void
224 expand_alpha_rev_2x128 (__m128i  data_lo,
225                         __m128i  data_hi,
226                         __m128i* alpha_lo,
227                         __m128i* alpha_hi)
228 {
229     __m128i lo, hi;
230
231     lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
232     hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
233     *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
234     *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
235 }
236
237 static force_inline void
238 pix_multiply_2x128 (__m128i* data_lo,
239                     __m128i* data_hi,
240                     __m128i* alpha_lo,
241                     __m128i* alpha_hi,
242                     __m128i* ret_lo,
243                     __m128i* ret_hi)
244 {
245     __m128i lo, hi;
246
247     lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
248     hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
249     lo = _mm_adds_epu16 (lo, mask_0080);
250     hi = _mm_adds_epu16 (hi, mask_0080);
251     *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
252     *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
253 }
254
255 static force_inline void
256 pix_add_multiply_2x128 (__m128i* src_lo,
257                         __m128i* src_hi,
258                         __m128i* alpha_dst_lo,
259                         __m128i* alpha_dst_hi,
260                         __m128i* dst_lo,
261                         __m128i* dst_hi,
262                         __m128i* alpha_src_lo,
263                         __m128i* alpha_src_hi,
264                         __m128i* ret_lo,
265                         __m128i* ret_hi)
266 {
267     __m128i t1_lo, t1_hi;
268     __m128i t2_lo, t2_hi;
269
270     pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
271     pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);
272
273     *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
274     *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
275 }
276
277 static force_inline void
278 negate_2x128 (__m128i  data_lo,
279               __m128i  data_hi,
280               __m128i* neg_lo,
281               __m128i* neg_hi)
282 {
283     *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
284     *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
285 }
286
287 static force_inline void
288 invert_colors_2x128 (__m128i  data_lo,
289                      __m128i  data_hi,
290                      __m128i* inv_lo,
291                      __m128i* inv_hi)
292 {
293     __m128i lo, hi;
294
295     lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
296     hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
297     *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
298     *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
299 }
300
301 static force_inline void
302 over_2x128 (__m128i* src_lo,
303             __m128i* src_hi,
304             __m128i* alpha_lo,
305             __m128i* alpha_hi,
306             __m128i* dst_lo,
307             __m128i* dst_hi)
308 {
309     __m128i t1, t2;
310
311     negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);
312
313     pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
314
315     *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
316     *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
317 }
318
319 static force_inline void
320 over_rev_non_pre_2x128 (__m128i  src_lo,
321                         __m128i  src_hi,
322                         __m128i* dst_lo,
323                         __m128i* dst_hi)
324 {
325     __m128i lo, hi;
326     __m128i alpha_lo, alpha_hi;
327
328     expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
329
330     lo = _mm_or_si128 (alpha_lo, mask_alpha);
331     hi = _mm_or_si128 (alpha_hi, mask_alpha);
332
333     invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);
334
335     pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);
336
337     over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
338 }
339
340 static force_inline void
341 in_over_2x128 (__m128i* src_lo,
342                __m128i* src_hi,
343                __m128i* alpha_lo,
344                __m128i* alpha_hi,
345                __m128i* mask_lo,
346                __m128i* mask_hi,
347                __m128i* dst_lo,
348                __m128i* dst_hi)
349 {
350     __m128i s_lo, s_hi;
351     __m128i a_lo, a_hi;
352
353     pix_multiply_2x128 (src_lo,   src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
354     pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
355
356     over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
357 }
358
359 static force_inline void
360 cache_prefetch (__m128i* addr)
361 {
362     _mm_prefetch ((void const*)addr, _MM_HINT_T0);
363 }
364
365 static force_inline void
366 cache_prefetch_next (__m128i* addr)
367 {
368     _mm_prefetch ((void const *)(addr + 4), _MM_HINT_T0); /* 64 bytes ahead */
369 }
370
371 /* load 4 pixels from a 16-byte boundary aligned address */
372 static force_inline __m128i
373 load_128_aligned (__m128i* src)
374 {
375     return _mm_load_si128 (src);
376 }
377
378 /* load 4 pixels from a unaligned address */
379 static force_inline __m128i
380 load_128_unaligned (const __m128i* src)
381 {
382     return _mm_loadu_si128 (src);
383 }
384
385 /* save 4 pixels using Write Combining memory on a 16-byte
386  * boundary aligned address
387  */
388 static force_inline void
389 save_128_write_combining (__m128i* dst,
390                           __m128i  data)
391 {
392     _mm_stream_si128 (dst, data);
393 }
394
395 /* save 4 pixels on a 16-byte boundary aligned address */
396 static force_inline void
397 save_128_aligned (__m128i* dst,
398                   __m128i  data)
399 {
400     _mm_store_si128 (dst, data);
401 }
402
403 /* save 4 pixels on a unaligned address */
404 static force_inline void
405 save_128_unaligned (__m128i* dst,
406                     __m128i  data)
407 {
408     _mm_storeu_si128 (dst, data);
409 }
410
411 /* ------------------------------------------------------------------
412  * MMX inlines
413  */
414
415 static force_inline __m64
416 unpack_32_1x64 (uint32_t data)
417 {
418     return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (data), _mm_setzero_si64 ());
419 }
420
421 static force_inline __m64
422 expand_alpha_1x64 (__m64 data)
423 {
424     return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 3, 3, 3));
425 }
426
427 static force_inline __m64
428 expand_alpha_rev_1x64 (__m64 data)
429 {
430     return _mm_shuffle_pi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
431 }
432
433 static force_inline __m64
434 expand_pixel_8_1x64 (uint8_t data)
435 {
436     return _mm_shuffle_pi16 (
437         unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
438 }
439
440 static force_inline __m64
441 pix_multiply_1x64 (__m64 data,
442                    __m64 alpha)
443 {
444     return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (data, alpha),
445                                           mask_x0080),
446                            mask_x0101);
447 }
448
449 static force_inline __m64
450 pix_add_multiply_1x64 (__m64* src,
451                        __m64* alpha_dst,
452                        __m64* dst,
453                        __m64* alpha_src)
454 {
455     __m64 t1 = pix_multiply_1x64 (*src, *alpha_dst);
456     __m64 t2 = pix_multiply_1x64 (*dst, *alpha_src);
457
458     return _mm_adds_pu8 (t1, t2);
459 }
460
461 static force_inline __m64
462 negate_1x64 (__m64 data)
463 {
464     return _mm_xor_si64 (data, mask_x00ff);
465 }
466
467 static force_inline __m64
468 invert_colors_1x64 (__m64 data)
469 {
470     return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
471 }
472
473 static force_inline __m64
474 over_1x64 (__m64 src, __m64 alpha, __m64 dst)
475 {
476     return _mm_adds_pu8 (src, pix_multiply_1x64 (dst, negate_1x64 (alpha)));
477 }
478
479 static force_inline __m64
480 in_over_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst)
481 {
482     return over_1x64 (pix_multiply_1x64 (*src, *mask),
483                       pix_multiply_1x64 (*alpha, *mask),
484                       *dst);
485 }
486
487 static force_inline __m64
488 over_rev_non_pre_1x64 (__m64 src, __m64 dst)
489 {
490     __m64 alpha = expand_alpha_1x64 (src);
491
492     return over_1x64 (pix_multiply_1x64 (invert_colors_1x64 (src),
493                                          _mm_or_si64 (alpha, mask_x_alpha)),
494                       alpha,
495                       dst);
496 }
497
498 static force_inline uint32_t
499 pack_1x64_32 (__m64 data)
500 {
501     return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64 ()));
502 }
503
504 /* Expand 16 bits positioned at @pos (0-3) of a mmx register into
505  *
506  *    00RR00GG00BB
507  *
508  * --- Expanding 565 in the low word ---
509  *
510  * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
511  * m = m & (01f0003f001f);
512  * m = m * (008404100840);
513  * m = m >> 8;
514  *
515  * Note the trick here - the top word is shifted by another nibble to
516  * avoid it bumping into the middle word
517  */
518 static force_inline __m64
519 expand565_16_1x64 (uint16_t pixel)
520 {
521     __m64 p;
522     __m64 t1, t2;
523
524     p = _mm_cvtsi32_si64 ((uint32_t) pixel);
525
526     t1 = _mm_slli_si64 (p, 36 - 11);
527     t2 = _mm_slli_si64 (p, 16 - 5);
528
529     p = _mm_or_si64 (t1, p);
530     p = _mm_or_si64 (t2, p);
531     p = _mm_and_si64 (p, mask_x565_rgb);
532     p = _mm_mullo_pi16 (p, mask_x565_unpack);
533
534     return _mm_srli_pi16 (p, 8);
535 }
536
537 /* ----------------------------------------------------------------------------
538  * Compose Core transformations
539  */
540 static force_inline uint32_t
541 core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
542 {
543     uint8_t a;
544     __m64 ms;
545
546     a = src >> 24;
547
548     if (a == 0xff)
549     {
550         return src;
551     }
552     else if (src)
553     {
554         ms = unpack_32_1x64 (src);
555         return pack_1x64_32 (
556             over_1x64 (ms, expand_alpha_1x64 (ms), unpack_32_1x64 (dst)));
557     }
558
559     return dst;
560 }
561
562 static force_inline uint32_t
563 combine1 (const uint32_t *ps, const uint32_t *pm)
564 {
565     uint32_t s = *ps;
566
567     if (pm)
568     {
569         __m64 ms, mm;
570
571         mm = unpack_32_1x64 (*pm);
572         mm = expand_alpha_1x64 (mm);
573
574         ms = unpack_32_1x64 (s);
575         ms = pix_multiply_1x64 (ms, mm);
576
577         s = pack_1x64_32 (ms);
578     }
579
580     return s;
581 }
582
583 static force_inline __m128i
584 combine4 (const __m128i *ps, const __m128i *pm)
585 {
586     __m128i xmm_src_lo, xmm_src_hi;
587     __m128i xmm_msk_lo, xmm_msk_hi;
588     __m128i s;
589
590     if (pm)
591     {
592         xmm_msk_lo = load_128_unaligned (pm);
593
594         if (is_transparent (xmm_msk_lo))
595             return _mm_setzero_si128 ();
596     }
597
598     s = load_128_unaligned (ps);
599
600     if (pm)
601     {
602         unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
603         unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);
604
605         expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);
606
607         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
608                             &xmm_msk_lo, &xmm_msk_hi,
609                             &xmm_src_lo, &xmm_src_hi);
610
611         s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
612     }
613
614     return s;
615 }
616
617 static force_inline void
618 core_combine_over_u_sse2 (uint32_t*       pd,
619                           const uint32_t* ps,
620                           const uint32_t* pm,
621                           int             w)
622 {
623     uint32_t s, d;
624
625     __m128i xmm_dst_lo, xmm_dst_hi;
626     __m128i xmm_src_lo, xmm_src_hi;
627     __m128i xmm_alpha_lo, xmm_alpha_hi;
628
629     /* call prefetch hint to optimize cache load*/
630     cache_prefetch ((__m128i*)ps);
631     cache_prefetch ((__m128i*)pd);
632     cache_prefetch ((__m128i*)pm);
633
634     /* Align dst on a 16-byte boundary */
635     while (w && ((unsigned long)pd & 15))
636     {
637         d = *pd;
638         s = combine1 (ps, pm);
639
640         *pd++ = core_combine_over_u_pixel_sse2 (s, d);
641         ps++;
642         if (pm)
643             pm++;
644         w--;
645     }
646
647     /* call prefetch hint to optimize cache load*/
648     cache_prefetch ((__m128i*)ps);
649     cache_prefetch ((__m128i*)pd);
650     cache_prefetch ((__m128i*)pm);
651
652     while (w >= 4)
653     {
654         /* fill cache line with next memory */
655         cache_prefetch_next ((__m128i*)ps);
656         cache_prefetch_next ((__m128i*)pd);
657         cache_prefetch_next ((__m128i*)pm);
658
659         /* I'm loading unaligned because I'm not sure about
660          * the address alignment.
661          */
662         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
663
664         if (is_opaque (xmm_src_hi))
665         {
666             save_128_aligned ((__m128i*)pd, xmm_src_hi);
667         }
668         else if (!is_zero (xmm_src_hi))
669         {
670             xmm_dst_hi = load_128_aligned ((__m128i*) pd);
671
672             unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
673             unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
674
675             expand_alpha_2x128 (
676                 xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
677
678             over_2x128 (&xmm_src_lo, &xmm_src_hi,
679                         &xmm_alpha_lo, &xmm_alpha_hi,
680                         &xmm_dst_lo, &xmm_dst_hi);
681
682             /* rebuid the 4 pixel data and save*/
683             save_128_aligned ((__m128i*)pd,
684                               pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
685         }
686
687         w -= 4;
688         ps += 4;
689         pd += 4;
690         if (pm)
691             pm += 4;
692     }
693
694     while (w)
695     {
696         d = *pd;
697         s = combine1 (ps, pm);
698
699         *pd++ = core_combine_over_u_pixel_sse2 (s, d);
700         ps++;
701         if (pm)
702             pm++;
703
704         w--;
705     }
706 }
707
708 static force_inline void
709 core_combine_over_reverse_u_sse2 (uint32_t*       pd,
710                                   const uint32_t* ps,
711                                   const uint32_t* pm,
712                                   int             w)
713 {
714     uint32_t s, d;
715
716     __m128i xmm_dst_lo, xmm_dst_hi;
717     __m128i xmm_src_lo, xmm_src_hi;
718     __m128i xmm_alpha_lo, xmm_alpha_hi;
719
720     /* call prefetch hint to optimize cache load*/
721     cache_prefetch ((__m128i*)ps);
722     cache_prefetch ((__m128i*)pd);
723     cache_prefetch ((__m128i*)pm);
724
725     /* Align dst on a 16-byte boundary */
726     while (w &&
727            ((unsigned long)pd & 15))
728     {
729         d = *pd;
730         s = combine1 (ps, pm);
731
732         *pd++ = core_combine_over_u_pixel_sse2 (d, s);
733         w--;
734         ps++;
735         if (pm)
736             pm++;
737     }
738
739     /* call prefetch hint to optimize cache load*/
740     cache_prefetch ((__m128i*)ps);
741     cache_prefetch ((__m128i*)pd);
742     cache_prefetch ((__m128i*)pm);
743
744     while (w >= 4)
745     {
746         /* fill cache line with next memory */
747         cache_prefetch_next ((__m128i*)ps);
748         cache_prefetch_next ((__m128i*)pd);
749         cache_prefetch_next ((__m128i*)pm);
750
751         /* I'm loading unaligned because I'm not sure
752          * about the address alignment.
753          */
754         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
755         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
756
757         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
758         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
759
760         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
761                             &xmm_alpha_lo, &xmm_alpha_hi);
762
763         over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
764                     &xmm_alpha_lo, &xmm_alpha_hi,
765                     &xmm_src_lo, &xmm_src_hi);
766
767         /* rebuid the 4 pixel data and save*/
768         save_128_aligned ((__m128i*)pd,
769                           pack_2x128_128 (xmm_src_lo, xmm_src_hi));
770
771         w -= 4;
772         ps += 4;
773         pd += 4;
774
775         if (pm)
776             pm += 4;
777     }
778
779     while (w)
780     {
781         d = *pd;
782         s = combine1 (ps, pm);
783
784         *pd++ = core_combine_over_u_pixel_sse2 (d, s);
785         ps++;
786         w--;
787         if (pm)
788             pm++;
789     }
790 }
791
792 static force_inline uint32_t
793 core_combine_in_u_pixelsse2 (uint32_t src, uint32_t dst)
794 {
795     uint32_t maska = src >> 24;
796
797     if (maska == 0)
798     {
799         return 0;
800     }
801     else if (maska != 0xff)
802     {
803         return pack_1x64_32 (
804             pix_multiply_1x64 (unpack_32_1x64 (dst),
805                                expand_alpha_1x64 (unpack_32_1x64 (src))));
806     }
807
808     return dst;
809 }
810
811 static force_inline void
812 core_combine_in_u_sse2 (uint32_t*       pd,
813                         const uint32_t* ps,
814                         const uint32_t* pm,
815                         int             w)
816 {
817     uint32_t s, d;
818
819     __m128i xmm_src_lo, xmm_src_hi;
820     __m128i xmm_dst_lo, xmm_dst_hi;
821
822     /* call prefetch hint to optimize cache load*/
823     cache_prefetch ((__m128i*)ps);
824     cache_prefetch ((__m128i*)pd);
825     cache_prefetch ((__m128i*)pm);
826
827     while (w && ((unsigned long) pd & 15))
828     {
829         s = combine1 (ps, pm);
830         d = *pd;
831
832         *pd++ = core_combine_in_u_pixelsse2 (d, s);
833         w--;
834         ps++;
835         if (pm)
836             pm++;
837     }
838
839     /* call prefetch hint to optimize cache load*/
840     cache_prefetch ((__m128i*)ps);
841     cache_prefetch ((__m128i*)pd);
842     cache_prefetch ((__m128i*)pm);
843
844     while (w >= 4)
845     {
846         /* fill cache line with next memory */
847         cache_prefetch_next ((__m128i*)ps);
848         cache_prefetch_next ((__m128i*)pd);
849         cache_prefetch_next ((__m128i*)pm);
850
851         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
852         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
853
854         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
855         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
856
857         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
858         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
859                             &xmm_dst_lo, &xmm_dst_hi,
860                             &xmm_dst_lo, &xmm_dst_hi);
861
862         save_128_aligned ((__m128i*)pd,
863                           pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
864
865         ps += 4;
866         pd += 4;
867         w -= 4;
868         if (pm)
869             pm += 4;
870     }
871
872     while (w)
873     {
874         s = combine1 (ps, pm);
875         d = *pd;
876
877         *pd++ = core_combine_in_u_pixelsse2 (d, s);
878         w--;
879         ps++;
880         if (pm)
881             pm++;
882     }
883 }
884
885 static force_inline void
886 core_combine_reverse_in_u_sse2 (uint32_t*       pd,
887                                 const uint32_t* ps,
888                                 const uint32_t *pm,
889                                 int             w)
890 {
891     uint32_t s, d;
892
893     __m128i xmm_src_lo, xmm_src_hi;
894     __m128i xmm_dst_lo, xmm_dst_hi;
895
896     /* call prefetch hint to optimize cache load*/
897     cache_prefetch ((__m128i*)ps);
898     cache_prefetch ((__m128i*)pd);
899     cache_prefetch ((__m128i*)pm);
900
901     while (w && ((unsigned long) pd & 15))
902     {
903         s = combine1 (ps, pm);
904         d = *pd;
905
906         *pd++ = core_combine_in_u_pixelsse2 (s, d);
907         ps++;
908         w--;
909         if (pm)
910             pm++;
911     }
912
913     /* call prefetch hint to optimize cache load*/
914     cache_prefetch ((__m128i*)ps);
915     cache_prefetch ((__m128i*)pd);
916     cache_prefetch ((__m128i*)pm);
917
918     while (w >= 4)
919     {
920         /* fill cache line with next memory */
921         cache_prefetch_next ((__m128i*)ps);
922         cache_prefetch_next ((__m128i*)pd);
923         cache_prefetch_next ((__m128i*)pm);
924
925         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
926         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
927
928         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
929         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
930
931         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
932         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
933                             &xmm_src_lo, &xmm_src_hi,
934                             &xmm_dst_lo, &xmm_dst_hi);
935
936         save_128_aligned (
937             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
938
939         ps += 4;
940         pd += 4;
941         w -= 4;
942         if (pm)
943             pm += 4;
944     }
945
946     while (w)
947     {
948         s = combine1 (ps, pm);
949         d = *pd;
950
951         *pd++ = core_combine_in_u_pixelsse2 (s, d);
952         w--;
953         ps++;
954         if (pm)
955             pm++;
956     }
957 }
958
959 static force_inline void
960 core_combine_reverse_out_u_sse2 (uint32_t*       pd,
961                                  const uint32_t* ps,
962                                  const uint32_t* pm,
963                                  int             w)
964 {
965     /* call prefetch hint to optimize cache load*/
966     cache_prefetch ((__m128i*)ps);
967     cache_prefetch ((__m128i*)pd);
968     cache_prefetch ((__m128i*)pm);
969
970     while (w && ((unsigned long) pd & 15))
971     {
972         uint32_t s = combine1 (ps, pm);
973         uint32_t d = *pd;
974
975         *pd++ = pack_1x64_32 (
976             pix_multiply_1x64 (
977                 unpack_32_1x64 (d), negate_1x64 (
978                     expand_alpha_1x64 (unpack_32_1x64 (s)))));
979         
980         if (pm)
981             pm++;
982         ps++;
983         w--;
984     }
985
986     /* call prefetch hint to optimize cache load*/
987     cache_prefetch ((__m128i*)ps);
988     cache_prefetch ((__m128i*)pd);
989     cache_prefetch ((__m128i*)pm);
990
991     while (w >= 4)
992     {
993         __m128i xmm_src_lo, xmm_src_hi;
994         __m128i xmm_dst_lo, xmm_dst_hi;
995
996         /* fill cache line with next memory */
997         cache_prefetch_next ((__m128i*)ps);
998         cache_prefetch_next ((__m128i*)pd);
999         cache_prefetch_next ((__m128i*)pm);
1000
1001         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1002         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1003
1004         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1005         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1006
1007         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1008         negate_2x128       (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1009
1010         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1011                             &xmm_src_lo, &xmm_src_hi,
1012                             &xmm_dst_lo, &xmm_dst_hi);
1013
1014         save_128_aligned (
1015             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1016
1017         ps += 4;
1018         pd += 4;
1019         if (pm)
1020             pm += 4;
1021
1022         w -= 4;
1023     }
1024
1025     while (w)
1026     {
1027         uint32_t s = combine1 (ps, pm);
1028         uint32_t d = *pd;
1029
1030         *pd++ = pack_1x64_32 (
1031             pix_multiply_1x64 (
1032                 unpack_32_1x64 (d), negate_1x64 (
1033                     expand_alpha_1x64 (unpack_32_1x64 (s)))));
1034         ps++;
1035         if (pm)
1036             pm++;
1037         w--;
1038     }
1039 }
1040
1041 static force_inline void
1042 core_combine_out_u_sse2 (uint32_t*       pd,
1043                          const uint32_t* ps,
1044                          const uint32_t* pm,
1045                          int             w)
1046 {
1047     /* call prefetch hint to optimize cache load*/
1048     cache_prefetch ((__m128i*)ps);
1049     cache_prefetch ((__m128i*)pd);
1050     cache_prefetch ((__m128i*)pm);
1051
1052     while (w && ((unsigned long) pd & 15))
1053     {
1054         uint32_t s = combine1 (ps, pm);
1055         uint32_t d = *pd;
1056
1057         *pd++ = pack_1x64_32 (
1058             pix_multiply_1x64 (
1059                 unpack_32_1x64 (s), negate_1x64 (
1060                     expand_alpha_1x64 (unpack_32_1x64 (d)))));
1061         w--;
1062         ps++;
1063         if (pm)
1064             pm++;
1065     }
1066
1067     /* call prefetch hint to optimize cache load*/
1068     cache_prefetch ((__m128i*)ps);
1069     cache_prefetch ((__m128i*)pd);
1070     cache_prefetch ((__m128i*)pm);
1071
1072     while (w >= 4)
1073     {
1074         __m128i xmm_src_lo, xmm_src_hi;
1075         __m128i xmm_dst_lo, xmm_dst_hi;
1076
1077         /* fill cache line with next memory */
1078         cache_prefetch_next ((__m128i*)ps);
1079         cache_prefetch_next ((__m128i*)pd);
1080         cache_prefetch_next ((__m128i*)pm);
1081
1082         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
1083         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1084
1085         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1086         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1087
1088         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1089         negate_2x128       (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1090
1091         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1092                             &xmm_dst_lo, &xmm_dst_hi,
1093                             &xmm_dst_lo, &xmm_dst_hi);
1094
1095         save_128_aligned (
1096             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1097
1098         ps += 4;
1099         pd += 4;
1100         w -= 4;
1101         if (pm)
1102             pm += 4;
1103     }
1104
1105     while (w)
1106     {
1107         uint32_t s = combine1 (ps, pm);
1108         uint32_t d = *pd;
1109
1110         *pd++ = pack_1x64_32 (
1111             pix_multiply_1x64 (
1112                 unpack_32_1x64 (s), negate_1x64 (
1113                     expand_alpha_1x64 (unpack_32_1x64 (d)))));
1114         w--;
1115         ps++;
1116         if (pm)
1117             pm++;
1118     }
1119 }
1120
1121 static force_inline uint32_t
1122 core_combine_atop_u_pixel_sse2 (uint32_t src,
1123                                 uint32_t dst)
1124 {
1125     __m64 s = unpack_32_1x64 (src);
1126     __m64 d = unpack_32_1x64 (dst);
1127
1128     __m64 sa = negate_1x64 (expand_alpha_1x64 (s));
1129     __m64 da = expand_alpha_1x64 (d);
1130
1131     return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
1132 }
1133
1134 static force_inline void
1135 core_combine_atop_u_sse2 (uint32_t*       pd,
1136                           const uint32_t* ps,
1137                           const uint32_t* pm,
1138                           int             w)
1139 {
1140     uint32_t s, d;
1141
1142     __m128i xmm_src_lo, xmm_src_hi;
1143     __m128i xmm_dst_lo, xmm_dst_hi;
1144     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1145     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1146
1147     /* call prefetch hint to optimize cache load*/
1148     cache_prefetch ((__m128i*)ps);
1149     cache_prefetch ((__m128i*)pd);
1150     cache_prefetch ((__m128i*)pm);
1151
1152     while (w && ((unsigned long) pd & 15))
1153     {
1154         s = combine1 (ps, pm);
1155         d = *pd;
1156
1157         *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1158         w--;
1159         ps++;
1160         if (pm)
1161             pm++;
1162     }
1163
1164     /* call prefetch hint to optimize cache load*/
1165     cache_prefetch ((__m128i*)ps);
1166     cache_prefetch ((__m128i*)pd);
1167     cache_prefetch ((__m128i*)pm);
1168
1169     while (w >= 4)
1170     {
1171         /* fill cache line with next memory */
1172         cache_prefetch_next ((__m128i*)ps);
1173         cache_prefetch_next ((__m128i*)pd);
1174         cache_prefetch_next ((__m128i*)pm);
1175
1176         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1177         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1178
1179         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1180         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1181
1182         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1183                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1184         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1185                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1186
1187         negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1188                       &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1189
1190         pix_add_multiply_2x128 (
1191             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1192             &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1193             &xmm_dst_lo, &xmm_dst_hi);
1194
1195         save_128_aligned (
1196             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1197
1198         ps += 4;
1199         pd += 4;
1200         w -= 4;
1201         if (pm)
1202             pm += 4;
1203     }
1204
1205     while (w)
1206     {
1207         s = combine1 (ps, pm);
1208         d = *pd;
1209
1210         *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1211         w--;
1212         ps++;
1213         if (pm)
1214             pm++;
1215     }
1216 }
1217
1218 static force_inline uint32_t
1219 core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
1220                                         uint32_t dst)
1221 {
1222     __m64 s = unpack_32_1x64 (src);
1223     __m64 d = unpack_32_1x64 (dst);
1224
1225     __m64 sa = expand_alpha_1x64 (s);
1226     __m64 da = negate_1x64 (expand_alpha_1x64 (d));
1227
1228     return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
1229 }
1230
1231 static force_inline void
1232 core_combine_reverse_atop_u_sse2 (uint32_t*       pd,
1233                                   const uint32_t* ps,
1234                                   const uint32_t* pm,
1235                                   int             w)
1236 {
1237     uint32_t s, d;
1238
1239     __m128i xmm_src_lo, xmm_src_hi;
1240     __m128i xmm_dst_lo, xmm_dst_hi;
1241     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1242     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1243
1244     /* call prefetch hint to optimize cache load*/
1245     cache_prefetch ((__m128i*)ps);
1246     cache_prefetch ((__m128i*)pd);
1247     cache_prefetch ((__m128i*)pm);
1248
1249     while (w && ((unsigned long) pd & 15))
1250     {
1251         s = combine1 (ps, pm);
1252         d = *pd;
1253
1254         *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1255         ps++;
1256         w--;
1257         if (pm)
1258             pm++;
1259     }
1260
1261     /* call prefetch hint to optimize cache load*/
1262     cache_prefetch ((__m128i*)ps);
1263     cache_prefetch ((__m128i*)pd);
1264     cache_prefetch ((__m128i*)pm);
1265
1266     while (w >= 4)
1267     {
1268         /* fill cache line with next memory */
1269         cache_prefetch_next ((__m128i*)ps);
1270         cache_prefetch_next ((__m128i*)pd);
1271         cache_prefetch_next ((__m128i*)pm);
1272
1273         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1274         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1275
1276         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1277         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1278
1279         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1280                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1281         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1282                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1283
1284         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1285                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1286
1287         pix_add_multiply_2x128 (
1288             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1289             &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1290             &xmm_dst_lo, &xmm_dst_hi);
1291
1292         save_128_aligned (
1293             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1294
1295         ps += 4;
1296         pd += 4;
1297         w -= 4;
1298         if (pm)
1299             pm += 4;
1300     }
1301
1302     while (w)
1303     {
1304         s = combine1 (ps, pm);
1305         d = *pd;
1306
1307         *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1308         ps++;
1309         w--;
1310         if (pm)
1311             pm++;
1312     }
1313 }
1314
1315 static force_inline uint32_t
1316 core_combine_xor_u_pixel_sse2 (uint32_t src,
1317                                uint32_t dst)
1318 {
1319     __m64 s = unpack_32_1x64 (src);
1320     __m64 d = unpack_32_1x64 (dst);
1321
1322     __m64 neg_d = negate_1x64 (expand_alpha_1x64 (d));
1323     __m64 neg_s = negate_1x64 (expand_alpha_1x64 (s));
1324
1325     return pack_1x64_32 (pix_add_multiply_1x64 (&s, &neg_d, &d, &neg_s));
1326 }
1327
1328 static force_inline void
1329 core_combine_xor_u_sse2 (uint32_t*       dst,
1330                          const uint32_t* src,
1331                          const uint32_t *mask,
1332                          int             width)
1333 {
1334     int w = width;
1335     uint32_t s, d;
1336     uint32_t* pd = dst;
1337     const uint32_t* ps = src;
1338     const uint32_t* pm = mask;
1339
1340     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
1341     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
1342     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1343     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1344
1345     /* call prefetch hint to optimize cache load*/
1346     cache_prefetch ((__m128i*)ps);
1347     cache_prefetch ((__m128i*)pd);
1348     cache_prefetch ((__m128i*)pm);
1349
1350     while (w && ((unsigned long) pd & 15))
1351     {
1352         s = combine1 (ps, pm);
1353         d = *pd;
1354
1355         *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1356         w--;
1357         ps++;
1358         if (pm)
1359             pm++;
1360     }
1361
1362     /* call prefetch hint to optimize cache load*/
1363     cache_prefetch ((__m128i*)ps);
1364     cache_prefetch ((__m128i*)pd);
1365     cache_prefetch ((__m128i*)pm);
1366
1367     while (w >= 4)
1368     {
1369         /* fill cache line with next memory */
1370         cache_prefetch_next ((__m128i*)ps);
1371         cache_prefetch_next ((__m128i*)pd);
1372         cache_prefetch_next ((__m128i*)pm);
1373
1374         xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
1375         xmm_dst = load_128_aligned ((__m128i*) pd);
1376
1377         unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
1378         unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
1379
1380         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1381                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1382         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1383                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1384
1385         negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1386                       &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1387         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1388                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1389
1390         pix_add_multiply_2x128 (
1391             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1392             &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1393             &xmm_dst_lo, &xmm_dst_hi);
1394
1395         save_128_aligned (
1396             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1397
1398         ps += 4;
1399         pd += 4;
1400         w -= 4;
1401         if (pm)
1402             pm += 4;
1403     }
1404
1405     while (w)
1406     {
1407         s = combine1 (ps, pm);
1408         d = *pd;
1409
1410         *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1411         w--;
1412         ps++;
1413         if (pm)
1414             pm++;
1415     }
1416 }
1417
1418 static force_inline void
1419 core_combine_add_u_sse2 (uint32_t*       dst,
1420                          const uint32_t* src,
1421                          const uint32_t* mask,
1422                          int             width)
1423 {
1424     int w = width;
1425     uint32_t s, d;
1426     uint32_t* pd = dst;
1427     const uint32_t* ps = src;
1428     const uint32_t* pm = mask;
1429
1430     /* call prefetch hint to optimize cache load*/
1431     cache_prefetch ((__m128i*)ps);
1432     cache_prefetch ((__m128i*)pd);
1433     cache_prefetch ((__m128i*)pm);
1434
1435     while (w && (unsigned long)pd & 15)
1436     {
1437         s = combine1 (ps, pm);
1438         d = *pd;
1439
1440         ps++;
1441         if (pm)
1442             pm++;
1443         *pd++ = _mm_cvtsi64_si32 (
1444             _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
1445         w--;
1446     }
1447
1448     /* call prefetch hint to optimize cache load*/
1449     cache_prefetch ((__m128i*)ps);
1450     cache_prefetch ((__m128i*)pd);
1451     cache_prefetch ((__m128i*)pm);
1452
1453     while (w >= 4)
1454     {
1455         __m128i s;
1456
1457         /* fill cache line with next memory */
1458         cache_prefetch_next ((__m128i*)ps);
1459         cache_prefetch_next ((__m128i*)pd);
1460         cache_prefetch_next ((__m128i*)pm);
1461
1462         s = combine4 ((__m128i*)ps, (__m128i*)pm);
1463
1464         save_128_aligned (
1465             (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned  ((__m128i*)pd)));
1466
1467         pd += 4;
1468         ps += 4;
1469         if (pm)
1470             pm += 4;
1471         w -= 4;
1472     }
1473
1474     while (w--)
1475     {
1476         s = combine1 (ps, pm);
1477         d = *pd;
1478
1479         ps++;
1480         *pd++ = _mm_cvtsi64_si32 (
1481             _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
1482         if (pm)
1483             pm++;
1484     }
1485 }
1486
1487 static force_inline uint32_t
1488 core_combine_saturate_u_pixel_sse2 (uint32_t src,
1489                                     uint32_t dst)
1490 {
1491     __m64 ms = unpack_32_1x64 (src);
1492     __m64 md = unpack_32_1x64 (dst);
1493     uint32_t sa = src >> 24;
1494     uint32_t da = ~dst >> 24;
1495
1496     if (sa > da)
1497     {
1498         ms = pix_multiply_1x64 (
1499             ms, expand_alpha_1x64 (unpack_32_1x64 (DIV_UN8 (da, sa) << 24)));
1500     }
1501
1502     return pack_1x64_32 (_mm_adds_pu16 (md, ms));
1503 }
1504
1505 static force_inline void
1506 core_combine_saturate_u_sse2 (uint32_t *      pd,
1507                               const uint32_t *ps,
1508                               const uint32_t *pm,
1509                               int             w)
1510 {
1511     uint32_t s, d;
1512
1513     uint32_t pack_cmp;
1514     __m128i xmm_src, xmm_dst;
1515
1516     /* call prefetch hint to optimize cache load*/
1517     cache_prefetch ((__m128i*)ps);
1518     cache_prefetch ((__m128i*)pd);
1519     cache_prefetch ((__m128i*)pm);
1520
1521     while (w && (unsigned long)pd & 15)
1522     {
1523         s = combine1 (ps, pm);
1524         d = *pd;
1525
1526         *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1527         w--;
1528         ps++;
1529         if (pm)
1530             pm++;
1531     }
1532
1533     /* call prefetch hint to optimize cache load*/
1534     cache_prefetch ((__m128i*)ps);
1535     cache_prefetch ((__m128i*)pd);
1536     cache_prefetch ((__m128i*)pm);
1537
1538     while (w >= 4)
1539     {
1540         /* fill cache line with next memory */
1541         cache_prefetch_next ((__m128i*)ps);
1542         cache_prefetch_next ((__m128i*)pd);
1543         cache_prefetch_next ((__m128i*)pm);
1544
1545         xmm_dst = load_128_aligned  ((__m128i*)pd);
1546         xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
1547
1548         pack_cmp = _mm_movemask_epi8 (
1549             _mm_cmpgt_epi32 (
1550                 _mm_srli_epi32 (xmm_src, 24),
1551                 _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));
1552
1553         /* if some alpha src is grater than respective ~alpha dst */
1554         if (pack_cmp)
1555         {
1556             s = combine1 (ps++, pm);
1557             d = *pd;
1558             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1559             if (pm)
1560                 pm++;
1561
1562             s = combine1 (ps++, pm);
1563             d = *pd;
1564             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1565             if (pm)
1566                 pm++;
1567
1568             s = combine1 (ps++, pm);
1569             d = *pd;
1570             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1571             if (pm)
1572                 pm++;
1573
1574             s = combine1 (ps++, pm);
1575             d = *pd;
1576             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1577             if (pm)
1578                 pm++;
1579         }
1580         else
1581         {
1582             save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));
1583
1584             pd += 4;
1585             ps += 4;
1586             if (pm)
1587                 pm += 4;
1588         }
1589
1590         w -= 4;
1591     }
1592
1593     while (w--)
1594     {
1595         s = combine1 (ps, pm);
1596         d = *pd;
1597
1598         *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1599         ps++;
1600         if (pm)
1601             pm++;
1602     }
1603 }
1604
1605 static force_inline void
1606 core_combine_src_ca_sse2 (uint32_t*       pd,
1607                           const uint32_t* ps,
1608                           const uint32_t *pm,
1609                           int             w)
1610 {
1611     uint32_t s, m;
1612
1613     __m128i xmm_src_lo, xmm_src_hi;
1614     __m128i xmm_mask_lo, xmm_mask_hi;
1615     __m128i xmm_dst_lo, xmm_dst_hi;
1616
1617     /* call prefetch hint to optimize cache load*/
1618     cache_prefetch ((__m128i*)ps);
1619     cache_prefetch ((__m128i*)pd);
1620     cache_prefetch ((__m128i*)pm);
1621
1622     while (w && (unsigned long)pd & 15)
1623     {
1624         s = *ps++;
1625         m = *pm++;
1626         *pd++ = pack_1x64_32 (
1627             pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
1628         w--;
1629     }
1630
1631     /* call prefetch hint to optimize cache load*/
1632     cache_prefetch ((__m128i*)ps);
1633     cache_prefetch ((__m128i*)pd);
1634     cache_prefetch ((__m128i*)pm);
1635
1636     while (w >= 4)
1637     {
1638         /* fill cache line with next memory */
1639         cache_prefetch_next ((__m128i*)ps);
1640         cache_prefetch_next ((__m128i*)pd);
1641         cache_prefetch_next ((__m128i*)pm);
1642
1643         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1644         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1645
1646         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1647         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1648
1649         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1650                             &xmm_mask_lo, &xmm_mask_hi,
1651                             &xmm_dst_lo, &xmm_dst_hi);
1652
1653         save_128_aligned (
1654             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1655
1656         ps += 4;
1657         pd += 4;
1658         pm += 4;
1659         w -= 4;
1660     }
1661
1662     while (w)
1663     {
1664         s = *ps++;
1665         m = *pm++;
1666         *pd++ = pack_1x64_32 (
1667             pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
1668         w--;
1669     }
1670 }
1671
1672 static force_inline uint32_t
1673 core_combine_over_ca_pixel_sse2 (uint32_t src,
1674                                  uint32_t mask,
1675                                  uint32_t dst)
1676 {
1677     __m64 s = unpack_32_1x64 (src);
1678     __m64 expAlpha = expand_alpha_1x64 (s);
1679     __m64 unpk_mask = unpack_32_1x64 (mask);
1680     __m64 unpk_dst  = unpack_32_1x64 (dst);
1681
1682     return pack_1x64_32 (in_over_1x64 (&s, &expAlpha, &unpk_mask, &unpk_dst));
1683 }
1684
1685 static force_inline void
1686 core_combine_over_ca_sse2 (uint32_t*       pd,
1687                            const uint32_t* ps,
1688                            const uint32_t *pm,
1689                            int             w)
1690 {
1691     uint32_t s, m, d;
1692
1693     __m128i xmm_alpha_lo, xmm_alpha_hi;
1694     __m128i xmm_src_lo, xmm_src_hi;
1695     __m128i xmm_dst_lo, xmm_dst_hi;
1696     __m128i xmm_mask_lo, xmm_mask_hi;
1697
1698     /* call prefetch hint to optimize cache load*/
1699     cache_prefetch ((__m128i*)ps);
1700     cache_prefetch ((__m128i*)pd);
1701     cache_prefetch ((__m128i*)pm);
1702
1703     while (w && (unsigned long)pd & 15)
1704     {
1705         s = *ps++;
1706         m = *pm++;
1707         d = *pd;
1708
1709         *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1710         w--;
1711     }
1712
1713     /* call prefetch hint to optimize cache load*/
1714     cache_prefetch ((__m128i*)ps);
1715     cache_prefetch ((__m128i*)pd);
1716     cache_prefetch ((__m128i*)pm);
1717
1718     while (w >= 4)
1719     {
1720         /* fill cache line with next memory */
1721         cache_prefetch_next ((__m128i*)ps);
1722         cache_prefetch_next ((__m128i*)pd);
1723         cache_prefetch_next ((__m128i*)pm);
1724
1725         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1726         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1727         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1728
1729         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1730         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1731         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1732
1733         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1734                             &xmm_alpha_lo, &xmm_alpha_hi);
1735
1736         in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
1737                        &xmm_alpha_lo, &xmm_alpha_hi,
1738                        &xmm_mask_lo, &xmm_mask_hi,
1739                        &xmm_dst_lo, &xmm_dst_hi);
1740
1741         save_128_aligned (
1742             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1743
1744         ps += 4;
1745         pd += 4;
1746         pm += 4;
1747         w -= 4;
1748     }
1749
1750     while (w)
1751     {
1752         s = *ps++;
1753         m = *pm++;
1754         d = *pd;
1755
1756         *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1757         w--;
1758     }
1759 }
1760
1761 static force_inline uint32_t
1762 core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
1763                                          uint32_t mask,
1764                                          uint32_t dst)
1765 {
1766     __m64 d = unpack_32_1x64 (dst);
1767
1768     return pack_1x64_32 (
1769         over_1x64 (d, expand_alpha_1x64 (d),
1770                    pix_multiply_1x64 (unpack_32_1x64 (src),
1771                                       unpack_32_1x64 (mask))));
1772 }
1773
1774 static force_inline void
1775 core_combine_over_reverse_ca_sse2 (uint32_t*       pd,
1776                                    const uint32_t* ps,
1777                                    const uint32_t *pm,
1778                                    int             w)
1779 {
1780     uint32_t s, m, d;
1781
1782     __m128i xmm_alpha_lo, xmm_alpha_hi;
1783     __m128i xmm_src_lo, xmm_src_hi;
1784     __m128i xmm_dst_lo, xmm_dst_hi;
1785     __m128i xmm_mask_lo, xmm_mask_hi;
1786
1787     /* call prefetch hint to optimize cache load*/
1788     cache_prefetch ((__m128i*)ps);
1789     cache_prefetch ((__m128i*)pd);
1790     cache_prefetch ((__m128i*)pm);
1791
1792     while (w && (unsigned long)pd & 15)
1793     {
1794         s = *ps++;
1795         m = *pm++;
1796         d = *pd;
1797
1798         *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1799         w--;
1800     }
1801
1802     /* call prefetch hint to optimize cache load*/
1803     cache_prefetch ((__m128i*)ps);
1804     cache_prefetch ((__m128i*)pd);
1805     cache_prefetch ((__m128i*)pm);
1806
1807     while (w >= 4)
1808     {
1809         /* fill cache line with next memory */
1810         cache_prefetch_next ((__m128i*)ps);
1811         cache_prefetch_next ((__m128i*)pd);
1812         cache_prefetch_next ((__m128i*)pm);
1813
1814         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1815         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1816         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1817
1818         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1819         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1820         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1821
1822         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1823                             &xmm_alpha_lo, &xmm_alpha_hi);
1824         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1825                             &xmm_mask_lo, &xmm_mask_hi,
1826                             &xmm_mask_lo, &xmm_mask_hi);
1827
1828         over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1829                     &xmm_alpha_lo, &xmm_alpha_hi,
1830                     &xmm_mask_lo, &xmm_mask_hi);
1831
1832         save_128_aligned (
1833             (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
1834
1835         ps += 4;
1836         pd += 4;
1837         pm += 4;
1838         w -= 4;
1839     }
1840
1841     while (w)
1842     {
1843         s = *ps++;
1844         m = *pm++;
1845         d = *pd;
1846
1847         *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1848         w--;
1849     }
1850 }
1851
1852 static force_inline void
1853 core_combine_in_ca_sse2 (uint32_t *      pd,
1854                          const uint32_t *ps,
1855                          const uint32_t *pm,
1856                          int             w)
1857 {
1858     uint32_t s, m, d;
1859
1860     __m128i xmm_alpha_lo, xmm_alpha_hi;
1861     __m128i xmm_src_lo, xmm_src_hi;
1862     __m128i xmm_dst_lo, xmm_dst_hi;
1863     __m128i xmm_mask_lo, xmm_mask_hi;
1864
1865     /* call prefetch hint to optimize cache load*/
1866     cache_prefetch ((__m128i*)ps);
1867     cache_prefetch ((__m128i*)pd);
1868     cache_prefetch ((__m128i*)pm);
1869
1870     while (w && (unsigned long)pd & 15)
1871     {
1872         s = *ps++;
1873         m = *pm++;
1874         d = *pd;
1875
1876         *pd++ = pack_1x64_32 (
1877             pix_multiply_1x64 (
1878                 pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
1879                 expand_alpha_1x64 (unpack_32_1x64 (d))));
1880
1881         w--;
1882     }
1883
1884     /* call prefetch hint to optimize cache load*/
1885     cache_prefetch ((__m128i*)ps);
1886     cache_prefetch ((__m128i*)pd);
1887     cache_prefetch ((__m128i*)pm);
1888
1889     while (w >= 4)
1890     {
1891         /* fill cache line with next memory */
1892         cache_prefetch_next ((__m128i*)ps);
1893         cache_prefetch_next ((__m128i*)pd);
1894         cache_prefetch_next ((__m128i*)pm);
1895
1896         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1897         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1898         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1899
1900         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1901         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1902         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1903
1904         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1905                             &xmm_alpha_lo, &xmm_alpha_hi);
1906
1907         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1908                             &xmm_mask_lo, &xmm_mask_hi,
1909                             &xmm_dst_lo, &xmm_dst_hi);
1910
1911         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1912                             &xmm_alpha_lo, &xmm_alpha_hi,
1913                             &xmm_dst_lo, &xmm_dst_hi);
1914
1915         save_128_aligned (
1916             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1917
1918         ps += 4;
1919         pd += 4;
1920         pm += 4;
1921         w -= 4;
1922     }
1923
1924     while (w)
1925     {
1926         s = *ps++;
1927         m = *pm++;
1928         d = *pd;
1929
1930         *pd++ = pack_1x64_32 (
1931             pix_multiply_1x64 (
1932                 pix_multiply_1x64 (
1933                     unpack_32_1x64 (s), unpack_32_1x64 (m)),
1934                 expand_alpha_1x64 (unpack_32_1x64 (d))));
1935
1936         w--;
1937     }
1938 }
1939
1940 static force_inline void
1941 core_combine_in_reverse_ca_sse2 (uint32_t *      pd,
1942                                  const uint32_t *ps,
1943                                  const uint32_t *pm,
1944                                  int             w)
1945 {
1946     uint32_t s, m, d;
1947
1948     __m128i xmm_alpha_lo, xmm_alpha_hi;
1949     __m128i xmm_src_lo, xmm_src_hi;
1950     __m128i xmm_dst_lo, xmm_dst_hi;
1951     __m128i xmm_mask_lo, xmm_mask_hi;
1952
1953     /* call prefetch hint to optimize cache load*/
1954     cache_prefetch ((__m128i*)ps);
1955     cache_prefetch ((__m128i*)pd);
1956     cache_prefetch ((__m128i*)pm);
1957
1958     while (w && (unsigned long)pd & 15)
1959     {
1960         s = *ps++;
1961         m = *pm++;
1962         d = *pd;
1963
1964         *pd++ = pack_1x64_32 (
1965             pix_multiply_1x64 (
1966                 unpack_32_1x64 (d),
1967                 pix_multiply_1x64 (unpack_32_1x64 (m),
1968                                    expand_alpha_1x64 (unpack_32_1x64 (s)))));
1969         w--;
1970     }
1971
1972     /* call prefetch hint to optimize cache load*/
1973     cache_prefetch ((__m128i*)ps);
1974     cache_prefetch ((__m128i*)pd);
1975     cache_prefetch ((__m128i*)pm);
1976
1977     while (w >= 4)
1978     {
1979         /* fill cache line with next memory */
1980         cache_prefetch_next ((__m128i*)ps);
1981         cache_prefetch_next ((__m128i*)pd);
1982         cache_prefetch_next ((__m128i*)pm);
1983
1984         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1985         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1986         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1987
1988         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1989         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1990         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1991
1992         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1993                             &xmm_alpha_lo, &xmm_alpha_hi);
1994         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1995                             &xmm_alpha_lo, &xmm_alpha_hi,
1996                             &xmm_alpha_lo, &xmm_alpha_hi);
1997
1998         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1999                             &xmm_alpha_lo, &xmm_alpha_hi,
2000                             &xmm_dst_lo, &xmm_dst_hi);
2001
2002         save_128_aligned (
2003             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2004
2005         ps += 4;
2006         pd += 4;
2007         pm += 4;
2008         w -= 4;
2009     }
2010
2011     while (w)
2012     {
2013         s = *ps++;
2014         m = *pm++;
2015         d = *pd;
2016
2017         *pd++ = pack_1x64_32 (
2018             pix_multiply_1x64 (
2019                 unpack_32_1x64 (d),
2020                 pix_multiply_1x64 (unpack_32_1x64 (m),
2021                                    expand_alpha_1x64 (unpack_32_1x64 (s)))));
2022         w--;
2023     }
2024 }
2025
2026 static force_inline void
2027 core_combine_out_ca_sse2 (uint32_t *      pd,
2028                           const uint32_t *ps,
2029                           const uint32_t *pm,
2030                           int             w)
2031 {
2032     uint32_t s, m, d;
2033
2034     __m128i xmm_alpha_lo, xmm_alpha_hi;
2035     __m128i xmm_src_lo, xmm_src_hi;
2036     __m128i xmm_dst_lo, xmm_dst_hi;
2037     __m128i xmm_mask_lo, xmm_mask_hi;
2038
2039     /* call prefetch hint to optimize cache load*/
2040     cache_prefetch ((__m128i*)ps);
2041     cache_prefetch ((__m128i*)pd);
2042     cache_prefetch ((__m128i*)pm);
2043
2044     while (w && (unsigned long)pd & 15)
2045     {
2046         s = *ps++;
2047         m = *pm++;
2048         d = *pd;
2049
2050         *pd++ = pack_1x64_32 (
2051             pix_multiply_1x64 (
2052                 pix_multiply_1x64 (
2053                     unpack_32_1x64 (s), unpack_32_1x64 (m)),
2054                 negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
2055         w--;
2056     }
2057
2058     /* call prefetch hint to optimize cache load*/
2059     cache_prefetch ((__m128i*)ps);
2060     cache_prefetch ((__m128i*)pd);
2061     cache_prefetch ((__m128i*)pm);
2062
2063     while (w >= 4)
2064     {
2065         /* fill cache line with next memory */
2066         cache_prefetch_next ((__m128i*)ps);
2067         cache_prefetch_next ((__m128i*)pd);
2068         cache_prefetch_next ((__m128i*)pm);
2069
2070         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2071         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2072         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2073
2074         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2075         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2076         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2077
2078         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2079                             &xmm_alpha_lo, &xmm_alpha_hi);
2080         negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
2081                       &xmm_alpha_lo, &xmm_alpha_hi);
2082
2083         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2084                             &xmm_mask_lo, &xmm_mask_hi,
2085                             &xmm_dst_lo, &xmm_dst_hi);
2086         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
2087                             &xmm_alpha_lo, &xmm_alpha_hi,
2088                             &xmm_dst_lo, &xmm_dst_hi);
2089
2090         save_128_aligned (
2091             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2092
2093         ps += 4;
2094         pd += 4;
2095         pm += 4;
2096         w -= 4;
2097     }
2098
2099     while (w)
2100     {
2101         s = *ps++;
2102         m = *pm++;
2103         d = *pd;
2104
2105         *pd++ = pack_1x64_32 (
2106             pix_multiply_1x64 (
2107                 pix_multiply_1x64 (
2108                     unpack_32_1x64 (s), unpack_32_1x64 (m)),
2109                 negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
2110
2111         w--;
2112     }
2113 }
2114
2115 static force_inline void
2116 core_combine_out_reverse_ca_sse2 (uint32_t *      pd,
2117                                   const uint32_t *ps,
2118                                   const uint32_t *pm,
2119                                   int             w)
2120 {
2121     uint32_t s, m, d;
2122
2123     __m128i xmm_alpha_lo, xmm_alpha_hi;
2124     __m128i xmm_src_lo, xmm_src_hi;
2125     __m128i xmm_dst_lo, xmm_dst_hi;
2126     __m128i xmm_mask_lo, xmm_mask_hi;
2127
2128     /* call prefetch hint to optimize cache load*/
2129     cache_prefetch ((__m128i*)ps);
2130     cache_prefetch ((__m128i*)pd);
2131     cache_prefetch ((__m128i*)pm);
2132
2133     while (w && (unsigned long)pd & 15)
2134     {
2135         s = *ps++;
2136         m = *pm++;
2137         d = *pd;
2138
2139         *pd++ = pack_1x64_32 (
2140             pix_multiply_1x64 (
2141                 unpack_32_1x64 (d),
2142                 negate_1x64 (pix_multiply_1x64 (
2143                                  unpack_32_1x64 (m),
2144                                  expand_alpha_1x64 (unpack_32_1x64 (s))))));
2145         w--;
2146     }
2147
2148     /* call prefetch hint to optimize cache load*/
2149     cache_prefetch ((__m128i*)ps);
2150     cache_prefetch ((__m128i*)pd);
2151     cache_prefetch ((__m128i*)pm);
2152
2153     while (w >= 4)
2154     {
2155         /* fill cache line with next memory */
2156         cache_prefetch_next ((__m128i*)ps);
2157         cache_prefetch_next ((__m128i*)pd);
2158         cache_prefetch_next ((__m128i*)pm);
2159
2160         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2161         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2162         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2163
2164         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2165         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2166         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2167
2168         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2169                             &xmm_alpha_lo, &xmm_alpha_hi);
2170
2171         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2172                             &xmm_alpha_lo, &xmm_alpha_hi,
2173                             &xmm_mask_lo, &xmm_mask_hi);
2174
2175         negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2176                       &xmm_mask_lo, &xmm_mask_hi);
2177
2178         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
2179                             &xmm_mask_lo, &xmm_mask_hi,
2180                             &xmm_dst_lo, &xmm_dst_hi);
2181
2182         save_128_aligned (
2183             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2184
2185         ps += 4;
2186         pd += 4;
2187         pm += 4;
2188         w -= 4;
2189     }
2190
2191     while (w)
2192     {
2193         s = *ps++;
2194         m = *pm++;
2195         d = *pd;
2196
2197         *pd++ = pack_1x64_32 (
2198             pix_multiply_1x64 (
2199                 unpack_32_1x64 (d),
2200                 negate_1x64 (pix_multiply_1x64 (
2201                                  unpack_32_1x64 (m),
2202                                  expand_alpha_1x64 (unpack_32_1x64 (s))))));
2203         w--;
2204     }
2205 }
2206
2207 static force_inline uint32_t
2208 core_combine_atop_ca_pixel_sse2 (uint32_t src,
2209                                  uint32_t mask,
2210                                  uint32_t dst)
2211 {
2212     __m64 m = unpack_32_1x64 (mask);
2213     __m64 s = unpack_32_1x64 (src);
2214     __m64 d = unpack_32_1x64 (dst);
2215     __m64 sa = expand_alpha_1x64 (s);
2216     __m64 da = expand_alpha_1x64 (d);
2217
2218     s = pix_multiply_1x64 (s, m);
2219     m = negate_1x64 (pix_multiply_1x64 (m, sa));
2220
2221     return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
2222 }
2223
2224 static force_inline void
2225 core_combine_atop_ca_sse2 (uint32_t *      pd,
2226                            const uint32_t *ps,
2227                            const uint32_t *pm,
2228                            int             w)
2229 {
2230     uint32_t s, m, d;
2231
2232     __m128i xmm_src_lo, xmm_src_hi;
2233     __m128i xmm_dst_lo, xmm_dst_hi;
2234     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2235     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2236     __m128i xmm_mask_lo, xmm_mask_hi;
2237
2238     /* call prefetch hint to optimize cache load*/
2239     cache_prefetch ((__m128i*)ps);
2240     cache_prefetch ((__m128i*)pd);
2241     cache_prefetch ((__m128i*)pm);
2242
2243     while (w && (unsigned long)pd & 15)
2244     {
2245         s = *ps++;
2246         m = *pm++;
2247         d = *pd;
2248
2249         *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2250         w--;
2251     }
2252
2253     /* call prefetch hint to optimize cache load*/
2254     cache_prefetch ((__m128i*)ps);
2255     cache_prefetch ((__m128i*)pd);
2256     cache_prefetch ((__m128i*)pm);
2257
2258     while (w >= 4)
2259     {
2260         /* fill cache line with next memory */
2261         cache_prefetch_next ((__m128i*)ps);
2262         cache_prefetch_next ((__m128i*)pd);
2263         cache_prefetch_next ((__m128i*)pm);
2264
2265         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2266         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2267         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2268
2269         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2270         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2271         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2272
2273         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2274                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2275         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2276                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2277
2278         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2279                             &xmm_mask_lo, &xmm_mask_hi,
2280                             &xmm_src_lo, &xmm_src_hi);
2281         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2282                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2283                             &xmm_mask_lo, &xmm_mask_hi);
2284
2285         negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2286
2287         pix_add_multiply_2x128 (
2288             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2289             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2290             &xmm_dst_lo, &xmm_dst_hi);
2291
2292         save_128_aligned (
2293             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2294
2295         ps += 4;
2296         pd += 4;
2297         pm += 4;
2298         w -= 4;
2299     }
2300
2301     while (w)
2302     {
2303         s = *ps++;
2304         m = *pm++;
2305         d = *pd;
2306
2307         *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2308         w--;
2309     }
2310 }
2311
2312 static force_inline uint32_t
2313 core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
2314                                          uint32_t mask,
2315                                          uint32_t dst)
2316 {
2317     __m64 m = unpack_32_1x64 (mask);
2318     __m64 s = unpack_32_1x64 (src);
2319     __m64 d = unpack_32_1x64 (dst);
2320
2321     __m64 da = negate_1x64 (expand_alpha_1x64 (d));
2322     __m64 sa = expand_alpha_1x64 (s);
2323
2324     s = pix_multiply_1x64 (s, m);
2325     m = pix_multiply_1x64 (m, sa);
2326
2327     return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
2328 }
2329
2330 static force_inline void
2331 core_combine_reverse_atop_ca_sse2 (uint32_t *      pd,
2332                                    const uint32_t *ps,
2333                                    const uint32_t *pm,
2334                                    int             w)
2335 {
2336     uint32_t s, m, d;
2337
2338     __m128i xmm_src_lo, xmm_src_hi;
2339     __m128i xmm_dst_lo, xmm_dst_hi;
2340     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2341     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2342     __m128i xmm_mask_lo, xmm_mask_hi;
2343
2344     /* call prefetch hint to optimize cache load*/
2345     cache_prefetch ((__m128i*)ps);
2346     cache_prefetch ((__m128i*)pd);
2347     cache_prefetch ((__m128i*)pm);
2348
2349     while (w && (unsigned long)pd & 15)
2350     {
2351         s = *ps++;
2352         m = *pm++;
2353         d = *pd;
2354
2355         *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2356         w--;
2357     }
2358
2359     /* call prefetch hint to optimize cache load*/
2360     cache_prefetch ((__m128i*)ps);
2361     cache_prefetch ((__m128i*)pd);
2362     cache_prefetch ((__m128i*)pm);
2363
2364     while (w >= 4)
2365     {
2366         /* fill cache line with next memory */
2367         cache_prefetch_next ((__m128i*)ps);
2368         cache_prefetch_next ((__m128i*)pd);
2369         cache_prefetch_next ((__m128i*)pm);
2370
2371         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2372         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2373         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2374
2375         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2376         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2377         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2378
2379         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2380                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2381         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2382                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2383
2384         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2385                             &xmm_mask_lo, &xmm_mask_hi,
2386                             &xmm_src_lo, &xmm_src_hi);
2387         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2388                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2389                             &xmm_mask_lo, &xmm_mask_hi);
2390
2391         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2392                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2393
2394         pix_add_multiply_2x128 (
2395             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2396             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2397             &xmm_dst_lo, &xmm_dst_hi);
2398
2399         save_128_aligned (
2400             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2401
2402         ps += 4;
2403         pd += 4;
2404         pm += 4;
2405         w -= 4;
2406     }
2407
2408     while (w)
2409     {
2410         s = *ps++;
2411         m = *pm++;
2412         d = *pd;
2413
2414         *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2415         w--;
2416     }
2417 }
2418
2419 static force_inline uint32_t
2420 core_combine_xor_ca_pixel_sse2 (uint32_t src,
2421                                 uint32_t mask,
2422                                 uint32_t dst)
2423 {
2424     __m64 a = unpack_32_1x64 (mask);
2425     __m64 s = unpack_32_1x64 (src);
2426     __m64 d = unpack_32_1x64 (dst);
2427
2428     __m64 alpha_dst = negate_1x64 (pix_multiply_1x64 (
2429                                        a, expand_alpha_1x64 (s)));
2430     __m64 dest      = pix_multiply_1x64 (s, a);
2431     __m64 alpha_src = negate_1x64 (expand_alpha_1x64 (d));
2432
2433     return pack_1x64_32 (pix_add_multiply_1x64 (&d,
2434                                                 &alpha_dst,
2435                                                 &dest,
2436                                                 &alpha_src));
2437 }
2438
2439 static force_inline void
2440 core_combine_xor_ca_sse2 (uint32_t *      pd,
2441                           const uint32_t *ps,
2442                           const uint32_t *pm,
2443                           int             w)
2444 {
2445     uint32_t s, m, d;
2446
2447     __m128i xmm_src_lo, xmm_src_hi;
2448     __m128i xmm_dst_lo, xmm_dst_hi;
2449     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2450     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2451     __m128i xmm_mask_lo, xmm_mask_hi;
2452
2453     /* call prefetch hint to optimize cache load*/
2454     cache_prefetch ((__m128i*)ps);
2455     cache_prefetch ((__m128i*)pd);
2456     cache_prefetch ((__m128i*)pm);
2457
2458     while (w && (unsigned long)pd & 15)
2459     {
2460         s = *ps++;
2461         m = *pm++;
2462         d = *pd;
2463
2464         *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2465         w--;
2466     }
2467
2468     /* call prefetch hint to optimize cache load*/
2469     cache_prefetch ((__m128i*)ps);
2470     cache_prefetch ((__m128i*)pd);
2471     cache_prefetch ((__m128i*)pm);
2472
2473     while (w >= 4)
2474     {
2475         /* fill cache line with next memory */
2476         cache_prefetch_next ((__m128i*)ps);
2477         cache_prefetch_next ((__m128i*)pd);
2478         cache_prefetch_next ((__m128i*)pm);
2479
2480         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2481         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2482         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2483
2484         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2485         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2486         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2487
2488         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2489                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2490         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2491                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2492
2493         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2494                             &xmm_mask_lo, &xmm_mask_hi,
2495                             &xmm_src_lo, &xmm_src_hi);
2496         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2497                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2498                             &xmm_mask_lo, &xmm_mask_hi);
2499
2500         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2501                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2502         negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2503                       &xmm_mask_lo, &xmm_mask_hi);
2504
2505         pix_add_multiply_2x128 (
2506             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2507             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2508             &xmm_dst_lo, &xmm_dst_hi);
2509
2510         save_128_aligned (
2511             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2512
2513         ps += 4;
2514         pd += 4;
2515         pm += 4;
2516         w -= 4;
2517     }
2518
2519     while (w)
2520     {
2521         s = *ps++;
2522         m = *pm++;
2523         d = *pd;
2524
2525         *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2526         w--;
2527     }
2528 }
2529
2530 static force_inline void
2531 core_combine_add_ca_sse2 (uint32_t *      pd,
2532                           const uint32_t *ps,
2533                           const uint32_t *pm,
2534                           int             w)
2535 {
2536     uint32_t s, m, d;
2537
2538     __m128i xmm_src_lo, xmm_src_hi;
2539     __m128i xmm_dst_lo, xmm_dst_hi;
2540     __m128i xmm_mask_lo, xmm_mask_hi;
2541
2542     /* call prefetch hint to optimize cache load*/
2543     cache_prefetch ((__m128i*)ps);
2544     cache_prefetch ((__m128i*)pd);
2545     cache_prefetch ((__m128i*)pm);
2546
2547     while (w && (unsigned long)pd & 15)
2548     {
2549         s = *ps++;
2550         m = *pm++;
2551         d = *pd;
2552
2553         *pd++ = pack_1x64_32 (
2554             _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
2555                                              unpack_32_1x64 (m)),
2556                           unpack_32_1x64 (d)));
2557         w--;
2558     }
2559
2560     /* call prefetch hint to optimize cache load*/
2561     cache_prefetch ((__m128i*)ps);
2562     cache_prefetch ((__m128i*)pd);
2563     cache_prefetch ((__m128i*)pm);
2564
2565     while (w >= 4)
2566     {
2567         /* fill cache line with next memory */
2568         cache_prefetch_next ((__m128i*)ps);
2569         cache_prefetch_next ((__m128i*)pd);
2570         cache_prefetch_next ((__m128i*)pm);
2571
2572         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2573         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2574         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2575
2576         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2577         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2578         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2579
2580         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2581                             &xmm_mask_lo, &xmm_mask_hi,
2582                             &xmm_src_lo, &xmm_src_hi);
2583
2584         save_128_aligned (
2585             (__m128i*)pd, pack_2x128_128 (
2586                 _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
2587                 _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
2588
2589         ps += 4;
2590         pd += 4;
2591         pm += 4;
2592         w -= 4;
2593     }
2594
2595     while (w)
2596     {
2597         s = *ps++;
2598         m = *pm++;
2599         d = *pd;
2600
2601         *pd++ = pack_1x64_32 (
2602             _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
2603                                              unpack_32_1x64 (m)),
2604                           unpack_32_1x64 (d)));
2605         w--;
2606     }
2607 }
2608
/* ---------------------------------------------------
 * fb_compose_setup_sse2: helpers that build mask values
 */
2612 static force_inline __m64
2613 create_mask_16_64 (uint16_t mask)
2614 {
2615     return _mm_set1_pi16 (mask);
2616 }
2617
2618 static force_inline __m128i
2619 create_mask_16_128 (uint16_t mask)
2620 {
2621     return _mm_set1_epi16 (mask);
2622 }
2623
2624 static force_inline __m64
2625 create_mask_2x32_64 (uint32_t mask0,
2626                      uint32_t mask1)
2627 {
2628     return _mm_set_pi32 (mask0, mask1);
2629 }
2630
2631 /* Work around a code generation bug in Sun Studio 12. */
2632 #if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
2633 # define create_mask_2x32_128(mask0, mask1) \
2634         (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
2635 #else
2636 static force_inline __m128i
2637 create_mask_2x32_128 (uint32_t mask0,
2638                       uint32_t mask1)
2639 {
2640     return _mm_set_epi32 (mask0, mask1, mask0, mask1);
2641 }
2642 #endif
2643
2644 /* SSE2 code patch for fbcompose.c */
2645
2646 static void
2647 sse2_combine_over_u (pixman_implementation_t *imp,
2648                      pixman_op_t              op,
2649                      uint32_t *               dst,
2650                      const uint32_t *         src,
2651                      const uint32_t *         mask,
2652                      int                      width)
2653 {
2654     core_combine_over_u_sse2 (dst, src, mask, width);
2655     _mm_empty ();
2656 }
2657
2658 static void
2659 sse2_combine_over_reverse_u (pixman_implementation_t *imp,
2660                              pixman_op_t              op,
2661                              uint32_t *               dst,
2662                              const uint32_t *         src,
2663                              const uint32_t *         mask,
2664                              int                      width)
2665 {
2666     core_combine_over_reverse_u_sse2 (dst, src, mask, width);
2667     _mm_empty ();
2668 }
2669
2670 static void
2671 sse2_combine_in_u (pixman_implementation_t *imp,
2672                    pixman_op_t              op,
2673                    uint32_t *               dst,
2674                    const uint32_t *         src,
2675                    const uint32_t *         mask,
2676                    int                      width)
2677 {
2678     core_combine_in_u_sse2 (dst, src, mask, width);
2679     _mm_empty ();
2680 }
2681
2682 static void
2683 sse2_combine_in_reverse_u (pixman_implementation_t *imp,
2684                            pixman_op_t              op,
2685                            uint32_t *               dst,
2686                            const uint32_t *         src,
2687                            const uint32_t *         mask,
2688                            int                      width)
2689 {
2690     core_combine_reverse_in_u_sse2 (dst, src, mask, width);
2691     _mm_empty ();
2692 }
2693
2694 static void
2695 sse2_combine_out_u (pixman_implementation_t *imp,
2696                     pixman_op_t              op,
2697                     uint32_t *               dst,
2698                     const uint32_t *         src,
2699                     const uint32_t *         mask,
2700                     int                      width)
2701 {
2702     core_combine_out_u_sse2 (dst, src, mask, width);
2703     _mm_empty ();
2704 }
2705
2706 static void
2707 sse2_combine_out_reverse_u (pixman_implementation_t *imp,
2708                             pixman_op_t              op,
2709                             uint32_t *               dst,
2710                             const uint32_t *         src,
2711                             const uint32_t *         mask,
2712                             int                      width)
2713 {
2714     core_combine_reverse_out_u_sse2 (dst, src, mask, width);
2715     _mm_empty ();
2716 }
2717
2718 static void
2719 sse2_combine_atop_u (pixman_implementation_t *imp,
2720                      pixman_op_t              op,
2721                      uint32_t *               dst,
2722                      const uint32_t *         src,
2723                      const uint32_t *         mask,
2724                      int                      width)
2725 {
2726     core_combine_atop_u_sse2 (dst, src, mask, width);
2727     _mm_empty ();
2728 }
2729
2730 static void
2731 sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
2732                              pixman_op_t              op,
2733                              uint32_t *               dst,
2734                              const uint32_t *         src,
2735                              const uint32_t *         mask,
2736                              int                      width)
2737 {
2738     core_combine_reverse_atop_u_sse2 (dst, src, mask, width);
2739     _mm_empty ();
2740 }
2741
2742 static void
2743 sse2_combine_xor_u (pixman_implementation_t *imp,
2744                     pixman_op_t              op,
2745                     uint32_t *               dst,
2746                     const uint32_t *         src,
2747                     const uint32_t *         mask,
2748                     int                      width)
2749 {
2750     core_combine_xor_u_sse2 (dst, src, mask, width);
2751     _mm_empty ();
2752 }
2753
2754 static void
2755 sse2_combine_add_u (pixman_implementation_t *imp,
2756                     pixman_op_t              op,
2757                     uint32_t *               dst,
2758                     const uint32_t *         src,
2759                     const uint32_t *         mask,
2760                     int                      width)
2761 {
2762     core_combine_add_u_sse2 (dst, src, mask, width);
2763     _mm_empty ();
2764 }
2765
2766 static void
2767 sse2_combine_saturate_u (pixman_implementation_t *imp,
2768                          pixman_op_t              op,
2769                          uint32_t *               dst,
2770                          const uint32_t *         src,
2771                          const uint32_t *         mask,
2772                          int                      width)
2773 {
2774     core_combine_saturate_u_sse2 (dst, src, mask, width);
2775     _mm_empty ();
2776 }
2777
2778 static void
2779 sse2_combine_src_ca (pixman_implementation_t *imp,
2780                      pixman_op_t              op,
2781                      uint32_t *               dst,
2782                      const uint32_t *         src,
2783                      const uint32_t *         mask,
2784                      int                      width)
2785 {
2786     core_combine_src_ca_sse2 (dst, src, mask, width);
2787     _mm_empty ();
2788 }
2789
2790 static void
2791 sse2_combine_over_ca (pixman_implementation_t *imp,
2792                       pixman_op_t              op,
2793                       uint32_t *               dst,
2794                       const uint32_t *         src,
2795                       const uint32_t *         mask,
2796                       int                      width)
2797 {
2798     core_combine_over_ca_sse2 (dst, src, mask, width);
2799     _mm_empty ();
2800 }
2801
2802 static void
2803 sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
2804                               pixman_op_t              op,
2805                               uint32_t *               dst,
2806                               const uint32_t *         src,
2807                               const uint32_t *         mask,
2808                               int                      width)
2809 {
2810     core_combine_over_reverse_ca_sse2 (dst, src, mask, width);
2811     _mm_empty ();
2812 }
2813
2814 static void
2815 sse2_combine_in_ca (pixman_implementation_t *imp,
2816                     pixman_op_t              op,
2817                     uint32_t *               dst,
2818                     const uint32_t *         src,
2819                     const uint32_t *         mask,
2820                     int                      width)
2821 {
2822     core_combine_in_ca_sse2 (dst, src, mask, width);
2823     _mm_empty ();
2824 }
2825
2826 static void
2827 sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
2828                             pixman_op_t              op,
2829                             uint32_t *               dst,
2830                             const uint32_t *         src,
2831                             const uint32_t *         mask,
2832                             int                      width)
2833 {
2834     core_combine_in_reverse_ca_sse2 (dst, src, mask, width);
2835     _mm_empty ();
2836 }
2837
2838 static void
2839 sse2_combine_out_ca (pixman_implementation_t *imp,
2840                      pixman_op_t              op,
2841                      uint32_t *               dst,
2842                      const uint32_t *         src,
2843                      const uint32_t *         mask,
2844                      int                      width)
2845 {
2846     core_combine_out_ca_sse2 (dst, src, mask, width);
2847     _mm_empty ();
2848 }
2849
2850 static void
2851 sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
2852                              pixman_op_t              op,
2853                              uint32_t *               dst,
2854                              const uint32_t *         src,
2855                              const uint32_t *         mask,
2856                              int                      width)
2857 {
2858     core_combine_out_reverse_ca_sse2 (dst, src, mask, width);
2859     _mm_empty ();
2860 }
2861
2862 static void
2863 sse2_combine_atop_ca (pixman_implementation_t *imp,
2864                       pixman_op_t              op,
2865                       uint32_t *               dst,
2866                       const uint32_t *         src,
2867                       const uint32_t *         mask,
2868                       int                      width)
2869 {
2870     core_combine_atop_ca_sse2 (dst, src, mask, width);
2871     _mm_empty ();
2872 }
2873
2874 static void
2875 sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
2876                               pixman_op_t              op,
2877                               uint32_t *               dst,
2878                               const uint32_t *         src,
2879                               const uint32_t *         mask,
2880                               int                      width)
2881 {
2882     core_combine_reverse_atop_ca_sse2 (dst, src, mask, width);
2883     _mm_empty ();
2884 }
2885
2886 static void
2887 sse2_combine_xor_ca (pixman_implementation_t *imp,
2888                      pixman_op_t              op,
2889                      uint32_t *               dst,
2890                      const uint32_t *         src,
2891                      const uint32_t *         mask,
2892                      int                      width)
2893 {
2894     core_combine_xor_ca_sse2 (dst, src, mask, width);
2895     _mm_empty ();
2896 }
2897
2898 static void
2899 sse2_combine_add_ca (pixman_implementation_t *imp,
2900                      pixman_op_t              op,
2901                      uint32_t *               dst,
2902                      const uint32_t *         src,
2903                      const uint32_t *         mask,
2904                      int                      width)
2905 {
2906     core_combine_add_ca_sse2 (dst, src, mask, width);
2907     _mm_empty ();
2908 }
2909
2910 /* -------------------------------------------------------------------
2911  * composite_over_n_8888
2912  */
2913
2914 static void
2915 sse2_composite_over_n_8888 (pixman_implementation_t *imp,
2916                             pixman_op_t              op,
2917                             pixman_image_t *         src_image,
2918                             pixman_image_t *         mask_image,
2919                             pixman_image_t *         dst_image,
2920                             int32_t                  src_x,
2921                             int32_t                  src_y,
2922                             int32_t                  mask_x,
2923                             int32_t                  mask_y,
2924                             int32_t                  dest_x,
2925                             int32_t                  dest_y,
2926                             int32_t                  width,
2927                             int32_t                  height)
2928 {
2929     uint32_t src;
2930     uint32_t    *dst_line, *dst, d;
2931     uint16_t w;
2932     int dst_stride;
2933     __m128i xmm_src, xmm_alpha;
2934     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2935
2936     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
2937
2938     if (src == 0)
2939         return;
2940
2941     PIXMAN_IMAGE_GET_LINE (
2942         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2943
2944     xmm_src = expand_pixel_32_1x128 (src);
2945     xmm_alpha = expand_alpha_1x128 (xmm_src);
2946
2947     while (height--)
2948     {
2949         dst = dst_line;
2950
2951         /* call prefetch hint to optimize cache load*/
2952         cache_prefetch ((__m128i*)dst);
2953
2954         dst_line += dst_stride;
2955         w = width;
2956
2957         while (w && (unsigned long)dst & 15)
2958         {
2959             d = *dst;
2960             *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
2961                                               _mm_movepi64_pi64 (xmm_alpha),
2962                                               unpack_32_1x64 (d)));
2963             w--;
2964         }
2965
2966         cache_prefetch ((__m128i*)dst);
2967
2968         while (w >= 4)
2969         {
2970             /* fill cache line with next memory */
2971             cache_prefetch_next ((__m128i*)dst);
2972
2973             xmm_dst = load_128_aligned ((__m128i*)dst);
2974
2975             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2976
2977             over_2x128 (&xmm_src, &xmm_src,
2978                         &xmm_alpha, &xmm_alpha,
2979                         &xmm_dst_lo, &xmm_dst_hi);
2980
2981             /* rebuid the 4 pixel data and save*/
2982             save_128_aligned (
2983                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2984
2985             w -= 4;
2986             dst += 4;
2987         }
2988
2989         while (w)
2990         {
2991             d = *dst;
2992             *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
2993                                               _mm_movepi64_pi64 (xmm_alpha),
2994                                               unpack_32_1x64 (d)));
2995             w--;
2996         }
2997
2998     }
2999     _mm_empty ();
3000 }
3001
3002 /* ---------------------------------------------------------------------
3003  * composite_over_n_0565
3004  */
3005 static void
3006 sse2_composite_over_n_0565 (pixman_implementation_t *imp,
3007                             pixman_op_t              op,
3008                             pixman_image_t *         src_image,
3009                             pixman_image_t *         mask_image,
3010                             pixman_image_t *         dst_image,
3011                             int32_t                  src_x,
3012                             int32_t                  src_y,
3013                             int32_t                  mask_x,
3014                             int32_t                  mask_y,
3015                             int32_t                  dest_x,
3016                             int32_t                  dest_y,
3017                             int32_t                  width,
3018                             int32_t                  height)
3019 {
3020     uint32_t src;
3021     uint16_t    *dst_line, *dst, d;
3022     uint16_t w;
3023     int dst_stride;
3024     __m128i xmm_src, xmm_alpha;
3025     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3026
3027     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3028
3029     if (src == 0)
3030         return;
3031
3032     PIXMAN_IMAGE_GET_LINE (
3033         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3034
3035     xmm_src = expand_pixel_32_1x128 (src);
3036     xmm_alpha = expand_alpha_1x128 (xmm_src);
3037
3038     while (height--)
3039     {
3040         dst = dst_line;
3041
3042         /* call prefetch hint to optimize cache load*/
3043         cache_prefetch ((__m128i*)dst);
3044
3045         dst_line += dst_stride;
3046         w = width;
3047
3048         while (w && (unsigned long)dst & 15)
3049         {
3050             d = *dst;
3051
3052             *dst++ = pack_565_32_16 (
3053                 pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
3054                                          _mm_movepi64_pi64 (xmm_alpha),
3055                                          expand565_16_1x64 (d))));
3056             w--;
3057         }
3058
3059         /* call prefetch hint to optimize cache load*/
3060         cache_prefetch ((__m128i*)dst);
3061
3062         while (w >= 8)
3063         {
3064             /* fill cache line with next memory */
3065             cache_prefetch_next ((__m128i*)dst);
3066
3067             xmm_dst = load_128_aligned ((__m128i*)dst);
3068
3069             unpack_565_128_4x128 (xmm_dst,
3070                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3071
3072             over_2x128 (&xmm_src, &xmm_src,
3073                         &xmm_alpha, &xmm_alpha,
3074                         &xmm_dst0, &xmm_dst1);
3075             over_2x128 (&xmm_src, &xmm_src,
3076                         &xmm_alpha, &xmm_alpha,
3077                         &xmm_dst2, &xmm_dst3);
3078
3079             xmm_dst = pack_565_4x128_128 (
3080                 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3081
3082             save_128_aligned ((__m128i*)dst, xmm_dst);
3083
3084             dst += 8;
3085             w -= 8;
3086         }
3087
3088         while (w--)
3089         {
3090             d = *dst;
3091             *dst++ = pack_565_32_16 (
3092                 pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
3093                                          _mm_movepi64_pi64 (xmm_alpha),
3094                                          expand565_16_1x64 (d))));
3095         }
3096     }
3097
3098     _mm_empty ();
3099 }
3100
3101 /* ------------------------------
3102  * composite_add_n_8888_8888_ca
3103  */
3104 static void
3105 sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
3106                                    pixman_op_t              op,
3107                                    pixman_image_t *         src_image,
3108                                    pixman_image_t *         mask_image,
3109                                    pixman_image_t *         dst_image,
3110                                    int32_t                  src_x,
3111                                    int32_t                  src_y,
3112                                    int32_t                  mask_x,
3113                                    int32_t                  mask_y,
3114                                    int32_t                  dest_x,
3115                                    int32_t                  dest_y,
3116                                    int32_t                  width,
3117                                    int32_t                  height)
3118 {
3119     uint32_t src, srca;
3120     uint32_t    *dst_line, d;
3121     uint32_t    *mask_line, m;
3122     uint32_t pack_cmp;
3123     int dst_stride, mask_stride;
3124
3125     __m128i xmm_src, xmm_alpha;
3126     __m128i xmm_dst;
3127     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3128
3129     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3130
3131     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3132     srca = src >> 24;
3133     
3134     if (src == 0)
3135         return;
3136
3137     PIXMAN_IMAGE_GET_LINE (
3138         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3139     PIXMAN_IMAGE_GET_LINE (
3140         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
3141
3142     xmm_src = _mm_unpacklo_epi8 (
3143         create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
3144     xmm_alpha = expand_alpha_1x128 (xmm_src);
3145     mmx_src   = _mm_movepi64_pi64 (xmm_src);
3146     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3147
3148     while (height--)
3149     {
3150         int w = width;
3151         const uint32_t *pm = (uint32_t *)mask_line;
3152         uint32_t *pd = (uint32_t *)dst_line;
3153
3154         dst_line += dst_stride;
3155         mask_line += mask_stride;
3156
3157         /* call prefetch hint to optimize cache load*/
3158         cache_prefetch ((__m128i*)pd);
3159         cache_prefetch ((__m128i*)pm);
3160
3161         while (w && (unsigned long)pd & 15)
3162         {
3163             m = *pm++;
3164
3165             if (m)
3166             {
3167                 d = *pd;
3168                 
3169                 mmx_mask = unpack_32_1x64 (m);
3170                 mmx_dest = unpack_32_1x64 (d);
3171
3172                 *pd = pack_1x64_32 (
3173                     _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
3174             }
3175
3176             pd++;
3177             w--;
3178         }
3179
3180         /* call prefetch hint to optimize cache load*/
3181         cache_prefetch ((__m128i*)pd);
3182         cache_prefetch ((__m128i*)pm);
3183
3184         while (w >= 4)
3185         {
3186             /* fill cache line with next memory */
3187             cache_prefetch_next ((__m128i*)pd);
3188             cache_prefetch_next ((__m128i*)pm);
3189
3190             xmm_mask = load_128_unaligned ((__m128i*)pm);
3191
3192             pack_cmp =
3193                 _mm_movemask_epi8 (
3194                     _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
3195
3196             /* if all bits in mask are zero, pack_cmp are equal to 0xffff */
3197             if (pack_cmp != 0xffff)
3198             {
3199                 xmm_dst = load_128_aligned ((__m128i*)pd);
3200
3201                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3202
3203                 pix_multiply_2x128 (&xmm_src, &xmm_src,
3204                                     &xmm_mask_lo, &xmm_mask_hi,
3205                                     &xmm_mask_lo, &xmm_mask_hi);
3206                 xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
3207                 
3208                 save_128_aligned (
3209                     (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
3210             }
3211
3212             pd += 4;
3213             pm += 4;
3214             w -= 4;
3215         }
3216
3217         while (w)
3218         {
3219             m = *pm++;
3220
3221             if (m)
3222             {
3223                 d = *pd;
3224                 
3225                 mmx_mask = unpack_32_1x64 (m);
3226                 mmx_dest = unpack_32_1x64 (d);
3227
3228                 *pd = pack_1x64_32 (
3229                     _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
3230             }
3231
3232             pd++;
3233             w--;
3234         }
3235     }
3236
3237     _mm_empty ();
3238 }
3239
3240 /* ---------------------------------------------------------------------------
3241  * composite_over_n_8888_8888_ca
3242  */
3243
3244 static void
3245 sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
3246                                     pixman_op_t              op,
3247                                     pixman_image_t *         src_image,
3248                                     pixman_image_t *         mask_image,
3249                                     pixman_image_t *         dst_image,
3250                                     int32_t                  src_x,
3251                                     int32_t                  src_y,
3252                                     int32_t                  mask_x,
3253                                     int32_t                  mask_y,
3254                                     int32_t                  dest_x,
3255                                     int32_t                  dest_y,
3256                                     int32_t                  width,
3257                                     int32_t                  height)
3258 {
3259     uint32_t src;
3260     uint32_t    *dst_line, d;
3261     uint32_t    *mask_line, m;
3262     uint32_t pack_cmp;
3263     int dst_stride, mask_stride;
3264
3265     __m128i xmm_src, xmm_alpha;
3266     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3267     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3268
3269     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3270
3271     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3272
3273     if (src == 0)
3274         return;
3275
3276     PIXMAN_IMAGE_GET_LINE (
3277         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3278     PIXMAN_IMAGE_GET_LINE (
3279         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
3280
3281     xmm_src = _mm_unpacklo_epi8 (
3282         create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
3283     xmm_alpha = expand_alpha_1x128 (xmm_src);
3284     mmx_src   = _mm_movepi64_pi64 (xmm_src);
3285     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3286
3287     while (height--)
3288     {
3289         int w = width;
3290         const uint32_t *pm = (uint32_t *)mask_line;
3291         uint32_t *pd = (uint32_t *)dst_line;
3292
3293         dst_line += dst_stride;
3294         mask_line += mask_stride;
3295
3296         /* call prefetch hint to optimize cache load*/
3297         cache_prefetch ((__m128i*)pd);
3298         cache_prefetch ((__m128i*)pm);
3299
3300         while (w && (unsigned long)pd & 15)
3301         {
3302             m = *pm++;
3303
3304             if (m)
3305             {
3306                 d = *pd;
3307                 mmx_mask = unpack_32_1x64 (m);
3308                 mmx_dest = unpack_32_1x64 (d);
3309
3310                 *pd = pack_1x64_32 (in_over_1x64 (&mmx_src,
3311                                                   &mmx_alpha,
3312                                                   &mmx_mask,
3313                                                   &mmx_dest));
3314             }
3315
3316             pd++;
3317             w--;
3318         }
3319
3320         /* call prefetch hint to optimize cache load*/
3321         cache_prefetch ((__m128i*)pd);
3322         cache_prefetch ((__m128i*)pm);
3323
3324         while (w >= 4)
3325         {
3326             /* fill cache line with next memory */
3327             cache_prefetch_next ((__m128i*)pd);
3328             cache_prefetch_next ((__m128i*)pm);
3329
3330             xmm_mask = load_128_unaligned ((__m128i*)pm);
3331
3332             pack_cmp =
3333                 _mm_movemask_epi8 (
3334                     _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
3335
3336             /* if all bits in mask are zero, pack_cmp are equal to 0xffff */
3337             if (pack_cmp != 0xffff)
3338             {
3339                 xmm_dst = load_128_aligned ((__m128i*)pd);
3340
3341                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3342                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3343
3344                 in_over_2x128 (&xmm_src, &xmm_src,
3345                                &xmm_alpha, &xmm_alpha,
3346                                &xmm_mask_lo, &xmm_mask_hi,
3347                                &xmm_dst_lo, &xmm_dst_hi);
3348
3349                 save_128_aligned (
3350                     (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3351             }
3352
3353             pd += 4;
3354             pm += 4;
3355             w -= 4;
3356         }
3357
3358         while (w)
3359         {
3360             m = *pm++;
3361
3362             if (m)
3363             {
3364                 d = *pd;
3365                 mmx_mask = unpack_32_1x64 (m);
3366                 mmx_dest = unpack_32_1x64 (d);
3367
3368                 *pd = pack_1x64_32 (
3369                     in_over_1x64 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
3370             }
3371
3372             pd++;
3373             w--;
3374         }
3375     }
3376
3377     _mm_empty ();
3378 }
3379
3380 /*---------------------------------------------------------------------
3381  * composite_over_8888_n_8888
3382  */
3383
3384 static void
3385 sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
3386                                  pixman_op_t              op,
3387                                  pixman_image_t *         src_image,
3388                                  pixman_image_t *         mask_image,
3389                                  pixman_image_t *         dst_image,
3390                                  int32_t                  src_x,
3391                                  int32_t                  src_y,
3392                                  int32_t                  mask_x,
3393                                  int32_t                  mask_y,
3394                                  int32_t                  dest_x,
3395                                  int32_t                  dest_y,
3396                                  int32_t                  width,
3397                                  int32_t                  height)
3398 {
3399     uint32_t    *dst_line, *dst;
3400     uint32_t    *src_line, *src;
3401     uint32_t mask;
3402     uint16_t w;
3403     int dst_stride, src_stride;
3404
3405     __m128i xmm_mask;
3406     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3407     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3408     __m128i xmm_alpha_lo, xmm_alpha_hi;
3409
3410     PIXMAN_IMAGE_GET_LINE (
3411         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3412     PIXMAN_IMAGE_GET_LINE (
3413         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3414
3415     mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
3416
3417     xmm_mask = create_mask_16_128 (mask >> 24);
3418
3419     while (height--)
3420     {
3421         dst = dst_line;
3422         dst_line += dst_stride;
3423         src = src_line;
3424         src_line += src_stride;
3425         w = width;
3426
3427         /* call prefetch hint to optimize cache load*/
3428         cache_prefetch ((__m128i*)dst);
3429         cache_prefetch ((__m128i*)src);
3430
3431         while (w && (unsigned long)dst & 15)
3432         {
3433             uint32_t s = *src++;
3434             uint32_t d = *dst;
3435
3436             __m64 ms = unpack_32_1x64 (s);
3437             __m64 alpha    = expand_alpha_1x64 (ms);
3438             __m64 dest     = _mm_movepi64_pi64 (xmm_mask);
3439             __m64 alpha_dst = unpack_32_1x64 (d);
3440
3441             *dst++ = pack_1x64_32 (
3442                 in_over_1x64 (&ms, &alpha, &dest, &alpha_dst));
3443
3444             w--;
3445         }
3446
3447         /* call prefetch hint to optimize cache load*/
3448         cache_prefetch ((__m128i*)dst);
3449         cache_prefetch ((__m128i*)src);
3450
3451         while (w >= 4)
3452         {
3453             /* fill cache line with next memory */
3454             cache_prefetch_next ((__m128i*)dst);
3455             cache_prefetch_next ((__m128i*)src);
3456
3457             xmm_src = load_128_unaligned ((__m128i*)src);
3458             xmm_dst = load_128_aligned ((__m128i*)dst);
3459
3460             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3461             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3462             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3463                                 &xmm_alpha_lo, &xmm_alpha_hi);
3464
3465             in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3466                            &xmm_alpha_lo, &xmm_alpha_hi,
3467                            &xmm_mask, &xmm_mask,
3468                            &xmm_dst_lo, &xmm_dst_hi);
3469
3470             save_128_aligned (
3471                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3472
3473             dst += 4;
3474             src += 4;
3475             w -= 4;
3476         }
3477
3478         while (w)
3479         {
3480             uint32_t s = *src++;
3481             uint32_t d = *dst;
3482
3483             __m64 ms = unpack_32_1x64 (s);
3484             __m64 alpha = expand_alpha_1x64 (ms);
3485             __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
3486             __m64 dest  = unpack_32_1x64 (d);
3487
3488             *dst++ = pack_1x64_32 (
3489                 in_over_1x64 (&ms, &alpha, &mask, &dest));
3490
3491             w--;
3492         }
3493     }
3494
3495     _mm_empty ();
3496 }
3497
3498 /* ---------------------------------------------------------------------
3499  * composite_over_x888_n_8888
3500  */
3501 static void
3502 sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
3503                                  pixman_op_t              op,
3504                                  pixman_image_t *         src_image,
3505                                  pixman_image_t *         mask_image,
3506                                  pixman_image_t *         dst_image,
3507                                  int32_t                  src_x,
3508                                  int32_t                  src_y,
3509                                  int32_t                  mask_x,
3510                                  int32_t                  mask_y,
3511                                  int32_t                  dest_x,
3512                                  int32_t                  dest_y,
3513                                  int32_t                  width,
3514                                  int32_t                  height)
3515 {
3516     uint32_t    *dst_line, *dst;
3517     uint32_t    *src_line, *src;
3518     uint32_t mask;
3519     int dst_stride, src_stride;
3520     uint16_t w;
3521
3522     __m128i xmm_mask, xmm_alpha;
3523     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3524     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3525
3526     PIXMAN_IMAGE_GET_LINE (
3527         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3528     PIXMAN_IMAGE_GET_LINE (
3529         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3530
3531     mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
3532
3533     xmm_mask = create_mask_16_128 (mask >> 24);
3534     xmm_alpha = mask_00ff;
3535
3536     while (height--)
3537     {
3538         dst = dst_line;
3539         dst_line += dst_stride;
3540         src = src_line;
3541         src_line += src_stride;
3542         w = width;
3543
3544         /* call prefetch hint to optimize cache load*/
3545         cache_prefetch ((__m128i*)dst);
3546         cache_prefetch ((__m128i*)src);
3547
3548         while (w && (unsigned long)dst & 15)
3549         {
3550             uint32_t s = (*src++) | 0xff000000;
3551             uint32_t d = *dst;
3552
3553             __m64 src   = unpack_32_1x64 (s);
3554             __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
3555             __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
3556             __m64 dest  = unpack_32_1x64 (d);
3557
3558             *dst++ = pack_1x64_32 (
3559                 in_over_1x64 (&src, &alpha, &mask, &dest));
3560
3561             w--;
3562         }
3563
3564         /* call prefetch hint to optimize cache load*/
3565         cache_prefetch ((__m128i*)dst);
3566         cache_prefetch ((__m128i*)src);
3567
3568         while (w >= 4)
3569         {
3570             /* fill cache line with next memory */
3571             cache_prefetch_next ((__m128i*)dst);
3572             cache_prefetch_next ((__m128i*)src);
3573
3574             xmm_src = _mm_or_si128 (
3575                 load_128_unaligned ((__m128i*)src), mask_ff000000);
3576             xmm_dst = load_128_aligned ((__m128i*)dst);
3577
3578             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3579             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3580
3581             in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3582                            &xmm_alpha, &xmm_alpha,
3583                            &xmm_mask, &xmm_mask,
3584                            &xmm_dst_lo, &xmm_dst_hi);
3585
3586             save_128_aligned (
3587                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3588
3589             dst += 4;
3590             src += 4;
3591             w -= 4;
3592
3593         }
3594
3595         while (w)
3596         {
3597             uint32_t s = (*src++) | 0xff000000;
3598             uint32_t d = *dst;
3599
3600             __m64 src  = unpack_32_1x64 (s);
3601             __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
3602             __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
3603             __m64 dest  = unpack_32_1x64 (d);
3604
3605             *dst++ = pack_1x64_32 (
3606                 in_over_1x64 (&src, &alpha, &mask, &dest));
3607
3608             w--;
3609         }
3610     }
3611
3612     _mm_empty ();
3613 }
3614
3615 /* --------------------------------------------------------------------
3616  * composite_over_8888_8888
3617  */
3618 static void
3619 sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
3620                                pixman_op_t              op,
3621                                pixman_image_t *         src_image,
3622                                pixman_image_t *         mask_image,
3623                                pixman_image_t *         dst_image,
3624                                int32_t                  src_x,
3625                                int32_t                  src_y,
3626                                int32_t                  mask_x,
3627                                int32_t                  mask_y,
3628                                int32_t                  dest_x,
3629                                int32_t                  dest_y,
3630                                int32_t                  width,
3631                                int32_t                  height)
3632 {
3633     int dst_stride, src_stride;
3634     uint32_t    *dst_line, *dst;
3635     uint32_t    *src_line, *src;
3636
3637     PIXMAN_IMAGE_GET_LINE (
3638         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3639     PIXMAN_IMAGE_GET_LINE (
3640         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3641
3642     dst = dst_line;
3643     src = src_line;
3644
3645     while (height--)
3646     {
3647         core_combine_over_u_sse2 (dst, src, NULL, width);
3648
3649         dst += dst_stride;
3650         src += src_stride;
3651     }
3652     _mm_empty ();
3653 }
3654
3655 /* ------------------------------------------------------------------
3656  * composite_over_8888_0565
3657  */
3658 static force_inline uint16_t
3659 composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
3660 {
3661     __m64 ms;
3662
3663     ms = unpack_32_1x64 (src);
3664     return pack_565_32_16 (
3665         pack_1x64_32 (
3666             over_1x64 (
3667                 ms, expand_alpha_1x64 (ms), expand565_16_1x64 (dst))));
3668 }
3669
3670 static void
3671 sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
3672                                pixman_op_t              op,
3673                                pixman_image_t *         src_image,
3674                                pixman_image_t *         mask_image,
3675                                pixman_image_t *         dst_image,
3676                                int32_t                  src_x,
3677                                int32_t                  src_y,
3678                                int32_t                  mask_x,
3679                                int32_t                  mask_y,
3680                                int32_t                  dest_x,
3681                                int32_t                  dest_y,
3682                                int32_t                  width,
3683                                int32_t                  height)
3684 {
3685     uint16_t    *dst_line, *dst, d;
3686     uint32_t    *src_line, *src, s;
3687     int dst_stride, src_stride;
3688     uint16_t w;
3689
3690     __m128i xmm_alpha_lo, xmm_alpha_hi;
3691     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3692     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3693
3694     PIXMAN_IMAGE_GET_LINE (
3695         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3696     PIXMAN_IMAGE_GET_LINE (
3697         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3698
3699 #if 0
3700     /* FIXME
3701      *
3702      * I copy the code from MMX one and keep the fixme.
3703      * If it's a problem there, probably is a problem here.
3704      */
3705     assert (src_image->drawable == mask_image->drawable);
3706 #endif
3707
3708     while (height--)
3709     {
3710         dst = dst_line;
3711         src = src_line;
3712
3713         /* call prefetch hint to optimize cache load*/
3714         cache_prefetch ((__m128i*)src);
3715         cache_prefetch ((__m128i*)dst);
3716
3717         dst_line += dst_stride;
3718         src_line += src_stride;
3719         w = width;
3720
3721         /* Align dst on a 16-byte boundary */
3722         while (w &&
3723                ((unsigned long)dst & 15))
3724         {
3725             s = *src++;
3726             d = *dst;
3727
3728             *dst++ = composite_over_8888_0565pixel (s, d);
3729             w--;
3730         }
3731
3732         /* call prefetch hint to optimize cache load*/
3733         cache_prefetch ((__m128i*)src);
3734         cache_prefetch ((__m128i*)dst);
3735
3736         /* It's a 8 pixel loop */
3737         while (w >= 8)
3738         {
3739             /* fill cache line with next memory */
3740             cache_prefetch_next ((__m128i*)src);
3741             cache_prefetch_next ((__m128i*)dst);
3742
3743             /* I'm loading unaligned because I'm not sure
3744              * about the address alignment.
3745              */
3746             xmm_src = load_128_unaligned ((__m128i*) src);
3747             xmm_dst = load_128_aligned ((__m128i*) dst);
3748
3749             /* Unpacking */
3750             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3751             unpack_565_128_4x128 (xmm_dst,
3752                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3753             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3754                                 &xmm_alpha_lo, &xmm_alpha_hi);
3755
3756             /* I'm loading next 4 pixels from memory
3757              * before to optimze the memory read.
3758              */
3759             xmm_src = load_128_unaligned ((__m128i*) (src + 4));
3760
3761             over_2x128 (&xmm_src_lo, &xmm_src_hi,
3762                         &xmm_alpha_lo, &xmm_alpha_hi,
3763                         &xmm_dst0, &xmm_dst1);
3764
3765             /* Unpacking */
3766             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3767             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3768                                 &xmm_alpha_lo, &xmm_alpha_hi);
3769
3770             over_2x128 (&xmm_src_lo, &xmm_src_hi,
3771                         &xmm_alpha_lo, &xmm_alpha_hi,
3772                         &xmm_dst2, &xmm_dst3);
3773
3774             save_128_aligned (
3775                 (__m128i*)dst, pack_565_4x128_128 (
3776                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3777
3778             w -= 8;
3779             dst += 8;
3780             src += 8;
3781         }
3782
3783         while (w--)
3784         {
3785             s = *src++;
3786             d = *dst;
3787
3788             *dst++ = composite_over_8888_0565pixel (s, d);
3789         }
3790     }
3791
3792     _mm_empty ();
3793 }
3794
3795 /* -----------------------------------------------------------------
3796  * composite_over_n_8_8888
3797  */
3798
3799 static void
3800 sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
3801                               pixman_op_t              op,
3802                               pixman_image_t *         src_image,
3803                               pixman_image_t *         mask_image,
3804                               pixman_image_t *         dst_image,
3805                               int32_t                  src_x,
3806                               int32_t                  src_y,
3807                               int32_t                  mask_x,
3808                               int32_t                  mask_y,
3809                               int32_t                  dest_x,
3810                               int32_t                  dest_y,
3811                               int32_t                  width,
3812                               int32_t                  height)
3813 {
3814     uint32_t src, srca;
3815     uint32_t *dst_line, *dst;
3816     uint8_t *mask_line, *mask;
3817     int dst_stride, mask_stride;
3818     uint16_t w;
3819     uint32_t m, d;
3820
3821     __m128i xmm_src, xmm_alpha, xmm_def;
3822     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3823     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3824
3825     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3826
3827     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3828
3829     srca = src >> 24;
3830     if (src == 0)
3831         return;
3832
3833     PIXMAN_IMAGE_GET_LINE (
3834         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3835     PIXMAN_IMAGE_GET_LINE (
3836         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3837
3838     xmm_def = create_mask_2x32_128 (src, src);
3839     xmm_src = expand_pixel_32_1x128 (src);
3840     xmm_alpha = expand_alpha_1x128 (xmm_src);
3841     mmx_src   = _mm_movepi64_pi64 (xmm_src);
3842     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3843
3844     while (height--)
3845     {
3846         dst = dst_line;
3847         dst_line += dst_stride;
3848         mask = mask_line;
3849         mask_line += mask_stride;
3850         w = width;
3851
3852         /* call prefetch hint to optimize cache load*/
3853         cache_prefetch ((__m128i*)mask);
3854         cache_prefetch ((__m128i*)dst);
3855
3856         while (w && (unsigned long)dst & 15)
3857         {
3858             uint8_t m = *mask++;
3859
3860             if (m)
3861             {
3862                 d = *dst;
3863                 mmx_mask = expand_pixel_8_1x64 (m);
3864                 mmx_dest = unpack_32_1x64 (d);
3865
3866                 *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
3867                                                    &mmx_alpha,
3868                                                    &mmx_mask,
3869                                                    &mmx_dest));
3870             }
3871
3872             w--;
3873             dst++;
3874         }
3875
3876         /* call prefetch hint to optimize cache load*/
3877         cache_prefetch ((__m128i*)mask);
3878         cache_prefetch ((__m128i*)dst);
3879
3880         while (w >= 4)
3881         {
3882             /* fill cache line with next memory */
3883             cache_prefetch_next ((__m128i*)mask);
3884             cache_prefetch_next ((__m128i*)dst);
3885
3886             m = *((uint32_t*)mask);
3887
3888             if (srca == 0xff && m == 0xffffffff)
3889             {
3890                 save_128_aligned ((__m128i*)dst, xmm_def);
3891             }
3892             else if (m)
3893             {
3894                 xmm_dst = load_128_aligned ((__m128i*) dst);
3895                 xmm_mask = unpack_32_1x128 (m);
3896                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3897
3898                 /* Unpacking */
3899                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3900                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3901
3902                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3903                                         &xmm_mask_lo, &xmm_mask_hi);
3904
3905                 in_over_2x128 (&xmm_src, &xmm_src,
3906                                &xmm_alpha, &xmm_alpha,
3907                                &xmm_mask_lo, &xmm_mask_hi,
3908                                &xmm_dst_lo, &xmm_dst_hi);
3909
3910                 save_128_aligned (
3911                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3912             }
3913
3914             w -= 4;
3915             dst += 4;
3916             mask += 4;
3917         }
3918
3919         while (w)
3920         {
3921             uint8_t m = *mask++;
3922
3923             if (m)
3924             {
3925                 d = *dst;
3926                 mmx_mask = expand_pixel_8_1x64 (m);
3927                 mmx_dest = unpack_32_1x64 (d);
3928
3929                 *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
3930                                                    &mmx_alpha,
3931                                                    &mmx_mask,
3932                                                    &mmx_dest));
3933             }
3934
3935             w--;
3936             dst++;
3937         }
3938     }
3939
3940     _mm_empty ();
3941 }
3942
/* ----------------------------------------------------------------
 * pixman_fill_sse2
 */
3946
3947 pixman_bool_t
3948 pixman_fill_sse2 (uint32_t *bits,
3949                   int       stride,
3950                   int       bpp,
3951                   int       x,
3952                   int       y,
3953                   int       width,
3954                   int       height,
3955                   uint32_t  data)
3956 {
3957     uint32_t byte_width;
3958     uint8_t         *byte_line;
3959
3960     __m128i xmm_def;
3961
3962     if (bpp == 16 && (data >> 16 != (data & 0xffff)))
3963         return FALSE;
3964
3965     if (bpp != 16 && bpp != 32)
3966         return FALSE;
3967
3968     if (bpp == 16)
3969     {
3970         stride = stride * (int) sizeof (uint32_t) / 2;
3971         byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
3972         byte_width = 2 * width;
3973         stride *= 2;
3974     }
3975     else
3976     {
3977         stride = stride * (int) sizeof (uint32_t) / 4;
3978         byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
3979         byte_width = 4 * width;
3980         stride *= 4;
3981     }
3982
3983     cache_prefetch ((__m128i*)byte_line);
3984     xmm_def = create_mask_2x32_128 (data, data);
3985
3986     while (height--)
3987     {
3988         int w;
3989         uint8_t *d = byte_line;
3990         byte_line += stride;
3991         w = byte_width;
3992
3993
3994         cache_prefetch_next ((__m128i*)d);
3995
3996         while (w >= 2 && ((unsigned long)d & 3))
3997         {
3998             *(uint16_t *)d = data;
3999             w -= 2;
4000             d += 2;
4001         }
4002
4003         while (w >= 4 && ((unsigned long)d & 15))
4004         {
4005             *(uint32_t *)d = data;
4006
4007             w -= 4;
4008             d += 4;
4009         }
4010
4011         cache_prefetch_next ((__m128i*)d);
4012
4013         while (w >= 128)
4014         {
4015             cache_prefetch (((__m128i*)d) + 12);
4016
4017             save_128_aligned ((__m128i*)(d),     xmm_def);
4018             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
4019             save_128_aligned ((__m128i*)(d + 32),  xmm_def);
4020             save_128_aligned ((__m128i*)(d + 48),  xmm_def);
4021             save_128_aligned ((__m128i*)(d + 64),  xmm_def);
4022             save_128_aligned ((__m128i*)(d + 80),  xmm_def);
4023             save_128_aligned ((__m128i*)(d + 96),  xmm_def);
4024             save_128_aligned ((__m128i*)(d + 112), xmm_def);
4025
4026             d += 128;
4027             w -= 128;
4028         }
4029
4030         if (w >= 64)
4031         {
4032             cache_prefetch (((__m128i*)d) + 8);
4033
4034             save_128_aligned ((__m128i*)(d),     xmm_def);
4035             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
4036             save_128_aligned ((__m128i*)(d + 32),  xmm_def);
4037             save_128_aligned ((__m128i*)(d + 48),  xmm_def);
4038
4039             d += 64;
4040             w -= 64;
4041         }
4042
4043         cache_prefetch_next ((__m128i*)d);
4044
4045         if (w >= 32)
4046         {
4047             save_128_aligned ((__m128i*)(d),     xmm_def);
4048             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
4049
4050             d += 32;
4051             w -= 32;
4052         }
4053
4054         if (w >= 16)
4055         {
4056             save_128_aligned ((__m128i*)(d),     xmm_def);
4057
4058             d += 16;
4059             w -= 16;
4060         }
4061
4062         cache_prefetch_next ((__m128i*)d);
4063
4064         while (w >= 4)
4065         {
4066             *(uint32_t *)d = data;
4067
4068             w -= 4;
4069             d += 4;
4070         }
4071
4072         if (w >= 2)
4073         {
4074             *(uint16_t *)d = data;
4075             w -= 2;
4076             d += 2;
4077         }
4078     }
4079
4080     _mm_empty ();
4081     return TRUE;
4082 }
4083
4084 static void
4085 sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
4086                              pixman_op_t              op,
4087                              pixman_image_t *         src_image,
4088                              pixman_image_t *         mask_image,
4089                              pixman_image_t *         dst_image,
4090                              int32_t                  src_x,
4091                              int32_t                  src_y,
4092                              int32_t                  mask_x,
4093                              int32_t                  mask_y,
4094                              int32_t                  dest_x,
4095                              int32_t                  dest_y,
4096                              int32_t                  width,
4097                              int32_t                  height)
4098 {
4099     uint32_t src, srca;
4100     uint32_t    *dst_line, *dst;
4101     uint8_t     *mask_line, *mask;
4102     int dst_stride, mask_stride;
4103     uint16_t w;
4104     uint32_t m;
4105
4106     __m128i xmm_src, xmm_def;
4107     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4108
4109     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4110
4111     srca = src >> 24;
4112     if (src == 0)
4113     {
4114         pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride,
4115                           PIXMAN_FORMAT_BPP (dst_image->bits.format),
4116                           dest_x, dest_y, width, height, 0);
4117         return;
4118     }
4119
4120     PIXMAN_IMAGE_GET_LINE (
4121         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4122     PIXMAN_IMAGE_GET_LINE (
4123         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4124
4125     xmm_def = create_mask_2x32_128 (src, src);
4126     xmm_src = expand_pixel_32_1x128 (src);
4127
4128     while (height--)
4129     {
4130         dst = dst_line;
4131         dst_line += dst_stride;
4132         mask = mask_line;
4133         mask_line += mask_stride;
4134         w = width;
4135
4136         /* call prefetch hint to optimize cache load*/
4137         cache_prefetch ((__m128i*)mask);
4138         cache_prefetch ((__m128i*)dst);
4139
4140         while (w && (unsigned long)dst & 15)
4141         {
4142             uint8_t m = *mask++;
4143
4144             if (m)
4145             {
4146                 *dst = pack_1x64_32 (
4147                     pix_multiply_1x64 (
4148                         _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
4149             }
4150             else
4151             {
4152                 *dst = 0;
4153             }
4154
4155             w--;
4156             dst++;
4157         }
4158
4159         /* call prefetch hint to optimize cache load*/
4160         cache_prefetch ((__m128i*)mask);
4161         cache_prefetch ((__m128i*)dst);
4162
4163         while (w >= 4)
4164         {
4165             /* fill cache line with next memory */
4166             cache_prefetch_next ((__m128i*)mask);
4167             cache_prefetch_next ((__m128i*)dst);
4168
4169             m = *((uint32_t*)mask);
4170
4171             if (srca == 0xff && m == 0xffffffff)
4172             {
4173                 save_128_aligned ((__m128i*)dst, xmm_def);
4174             }
4175             else if (m)
4176             {
4177                 xmm_mask = unpack_32_1x128 (m);
4178                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4179
4180                 /* Unpacking */
4181                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4182
4183                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4184                                         &xmm_mask_lo, &xmm_mask_hi);
4185
4186                 pix_multiply_2x128 (&xmm_src, &xmm_src,
4187                                     &xmm_mask_lo, &xmm_mask_hi,
4188                                     &xmm_mask_lo, &xmm_mask_hi);
4189
4190                 save_128_aligned (
4191                     (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
4192             }
4193             else
4194             {
4195                 save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
4196             }
4197
4198             w -= 4;
4199             dst += 4;
4200             mask += 4;
4201         }
4202
4203         while (w)
4204         {
4205             uint8_t m = *mask++;
4206
4207             if (m)
4208             {
4209                 *dst = pack_1x64_32 (
4210                     pix_multiply_1x64 (
4211                         _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
4212             }
4213             else
4214             {
4215                 *dst = 0;
4216             }
4217
4218             w--;
4219             dst++;
4220         }
4221     }
4222
4223     _mm_empty ();
4224 }
4225
4226 /*-----------------------------------------------------------------------
4227  * composite_over_n_8_0565
4228  */
4229
4230 static void
4231 sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
4232                               pixman_op_t              op,
4233                               pixman_image_t *         src_image,
4234                               pixman_image_t *         mask_image,
4235                               pixman_image_t *         dst_image,
4236                               int32_t                  src_x,
4237                               int32_t                  src_y,
4238                               int32_t                  mask_x,
4239                               int32_t                  mask_y,
4240                               int32_t                  dest_x,
4241                               int32_t                  dest_y,
4242                               int32_t                  width,
4243                               int32_t                  height)
4244 {
4245     uint32_t src, srca;
4246     uint16_t    *dst_line, *dst, d;
4247     uint8_t     *mask_line, *mask;
4248     int dst_stride, mask_stride;
4249     uint16_t w;
4250     uint32_t m;
4251     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
4252
4253     __m128i xmm_src, xmm_alpha;
4254     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4255     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4256
4257     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4258
4259     srca = src >> 24;
4260     if (src == 0)
4261         return;
4262
4263     PIXMAN_IMAGE_GET_LINE (
4264         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4265     PIXMAN_IMAGE_GET_LINE (
4266         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4267
4268     xmm_src = expand_pixel_32_1x128 (src);
4269     xmm_alpha = expand_alpha_1x128 (xmm_src);
4270     mmx_src = _mm_movepi64_pi64 (xmm_src);
4271     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
4272
4273     while (height--)
4274     {
4275         dst = dst_line;
4276         dst_line += dst_stride;
4277         mask = mask_line;
4278         mask_line += mask_stride;
4279         w = width;
4280
4281         /* call prefetch hint to optimize cache load*/
4282         cache_prefetch ((__m128i*)mask);
4283         cache_prefetch ((__m128i*)dst);
4284
4285         while (w && (unsigned long)dst & 15)
4286         {
4287             m = *mask++;
4288
4289             if (m)
4290             {
4291                 d = *dst;
4292                 mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
4293                 mmx_dest = expand565_16_1x64 (d);
4294
4295                 *dst = pack_565_32_16 (
4296                     pack_1x64_32 (
4297                         in_over_1x64 (
4298                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4299             }
4300
4301             w--;
4302             dst++;
4303         }
4304
4305         /* call prefetch hint to optimize cache load*/
4306         cache_prefetch ((__m128i*)mask);
4307         cache_prefetch ((__m128i*)dst);
4308
4309         while (w >= 8)
4310         {
4311             /* fill cache line with next memory */
4312             cache_prefetch_next ((__m128i*)mask);
4313             cache_prefetch_next ((__m128i*)dst);
4314
4315             xmm_dst = load_128_aligned ((__m128i*) dst);
4316             unpack_565_128_4x128 (xmm_dst,
4317                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4318
4319             m = *((uint32_t*)mask);
4320             mask += 4;
4321
4322             if (m)
4323             {
4324                 xmm_mask = unpack_32_1x128 (m);
4325                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4326
4327                 /* Unpacking */
4328                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4329
4330                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4331                                         &xmm_mask_lo, &xmm_mask_hi);
4332
4333                 in_over_2x128 (&xmm_src, &xmm_src,
4334                                &xmm_alpha, &xmm_alpha,
4335                                &xmm_mask_lo, &xmm_mask_hi,
4336                                &xmm_dst0, &xmm_dst1);
4337             }
4338
4339             m = *((uint32_t*)mask);
4340             mask += 4;
4341
4342             if (m)
4343             {
4344                 xmm_mask = unpack_32_1x128 (m);
4345                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4346
4347                 /* Unpacking */
4348                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4349
4350                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4351                                         &xmm_mask_lo, &xmm_mask_hi);
4352                 in_over_2x128 (&xmm_src, &xmm_src,
4353                                &xmm_alpha, &xmm_alpha,
4354                                &xmm_mask_lo, &xmm_mask_hi,
4355                                &xmm_dst2, &xmm_dst3);
4356             }
4357
4358             save_128_aligned (
4359                 (__m128i*)dst, pack_565_4x128_128 (
4360                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4361
4362             w -= 8;
4363             dst += 8;
4364         }
4365
4366         while (w)
4367         {
4368             m = *mask++;
4369
4370             if (m)
4371             {
4372                 d = *dst;
4373                 mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
4374                 mmx_dest = expand565_16_1x64 (d);
4375
4376                 *dst = pack_565_32_16 (
4377                     pack_1x64_32 (
4378                         in_over_1x64 (
4379                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4380             }
4381
4382             w--;
4383             dst++;
4384         }
4385     }
4386
4387     _mm_empty ();
4388 }
4389
4390 /* -----------------------------------------------------------------------
4391  * composite_over_pixbuf_0565
4392  */
4393
4394 static void
4395 sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
4396                                  pixman_op_t              op,
4397                                  pixman_image_t *         src_image,
4398                                  pixman_image_t *         mask_image,
4399                                  pixman_image_t *         dst_image,
4400                                  int32_t                  src_x,
4401                                  int32_t                  src_y,
4402                                  int32_t                  mask_x,
4403                                  int32_t                  mask_y,
4404                                  int32_t                  dest_x,
4405                                  int32_t                  dest_y,
4406                                  int32_t                  width,
4407                                  int32_t                  height)
4408 {
4409     uint16_t    *dst_line, *dst, d;
4410     uint32_t    *src_line, *src, s;
4411     int dst_stride, src_stride;
4412     uint16_t w;
4413     uint32_t opaque, zero;
4414
4415     __m64 ms;
4416     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4417     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4418
4419     PIXMAN_IMAGE_GET_LINE (
4420         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4421     PIXMAN_IMAGE_GET_LINE (
4422         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4423
4424 #if 0
4425     /* FIXME
4426      *
4427      * I copy the code from MMX one and keep the fixme.
4428      * If it's a problem there, probably is a problem here.
4429      */
4430     assert (src_image->drawable == mask_image->drawable);
4431 #endif
4432
4433     while (height--)
4434     {
4435         dst = dst_line;
4436         dst_line += dst_stride;
4437         src = src_line;
4438         src_line += src_stride;
4439         w = width;
4440
4441         /* call prefetch hint to optimize cache load*/
4442         cache_prefetch ((__m128i*)src);
4443         cache_prefetch ((__m128i*)dst);
4444
4445         while (w && (unsigned long)dst & 15)
4446         {
4447             s = *src++;
4448             d = *dst;
4449
4450             ms = unpack_32_1x64 (s);
4451
4452             *dst++ = pack_565_32_16 (
4453                 pack_1x64_32 (
4454                     over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
4455             w--;
4456         }
4457
4458         /* call prefetch hint to optimize cache load*/
4459         cache_prefetch ((__m128i*)src);
4460         cache_prefetch ((__m128i*)dst);
4461
4462         while (w >= 8)
4463         {
4464             /* fill cache line with next memory */
4465             cache_prefetch_next ((__m128i*)src);
4466             cache_prefetch_next ((__m128i*)dst);
4467
4468             /* First round */
4469             xmm_src = load_128_unaligned ((__m128i*)src);
4470             xmm_dst = load_128_aligned  ((__m128i*)dst);
4471
4472             opaque = is_opaque (xmm_src);
4473             zero = is_zero (xmm_src);
4474
4475             unpack_565_128_4x128 (xmm_dst,
4476                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4477             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4478
4479             /* preload next round*/
4480             xmm_src = load_128_unaligned ((__m128i*)(src + 4));
4481
4482             if (opaque)
4483             {
4484                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4485                                      &xmm_dst0, &xmm_dst1);
4486             }
4487             else if (!zero)
4488             {
4489                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4490                                         &xmm_dst0, &xmm_dst1);
4491             }
4492
4493             /* Second round */
4494             opaque = is_opaque (xmm_src);
4495             zero = is_zero (xmm_src);
4496
4497             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4498
4499             if (opaque)
4500             {
4501                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4502                                      &xmm_dst2, &xmm_dst3);
4503             }
4504             else if (!zero)
4505             {
4506                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4507                                         &xmm_dst2, &xmm_dst3);
4508             }
4509
4510             save_128_aligned (
4511                 (__m128i*)dst, pack_565_4x128_128 (
4512                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4513
4514             w -= 8;
4515             src += 8;
4516             dst += 8;
4517         }
4518
4519         while (w)
4520         {
4521             s = *src++;
4522             d = *dst;
4523
4524             ms = unpack_32_1x64 (s);
4525
4526             *dst++ = pack_565_32_16 (
4527                 pack_1x64_32 (
4528                     over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
4529             w--;
4530         }
4531     }
4532
4533     _mm_empty ();
4534 }
4535
4536 /* -------------------------------------------------------------------------
4537  * composite_over_pixbuf_8888
4538  */
4539
4540 static void
4541 sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
4542                                  pixman_op_t              op,
4543                                  pixman_image_t *         src_image,
4544                                  pixman_image_t *         mask_image,
4545                                  pixman_image_t *         dst_image,
4546                                  int32_t                  src_x,
4547                                  int32_t                  src_y,
4548                                  int32_t                  mask_x,
4549                                  int32_t                  mask_y,
4550                                  int32_t                  dest_x,
4551                                  int32_t                  dest_y,
4552                                  int32_t                  width,
4553                                  int32_t                  height)
4554 {
4555     uint32_t    *dst_line, *dst, d;
4556     uint32_t    *src_line, *src, s;
4557     int dst_stride, src_stride;
4558     uint16_t w;
4559     uint32_t opaque, zero;
4560
4561     __m128i xmm_src_lo, xmm_src_hi;
4562     __m128i xmm_dst_lo, xmm_dst_hi;
4563
4564     PIXMAN_IMAGE_GET_LINE (
4565         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4566     PIXMAN_IMAGE_GET_LINE (
4567         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4568
4569 #if 0
4570     /* FIXME
4571      *
4572      * I copy the code from MMX one and keep the fixme.
4573      * If it's a problem there, probably is a problem here.
4574      */
4575     assert (src_image->drawable == mask_image->drawable);
4576 #endif
4577
4578     while (height--)
4579     {
4580         dst = dst_line;
4581         dst_line += dst_stride;
4582         src = src_line;
4583         src_line += src_stride;
4584         w = width;
4585
4586         /* call prefetch hint to optimize cache load*/
4587         cache_prefetch ((__m128i*)src);
4588         cache_prefetch ((__m128i*)dst);
4589
4590         while (w && (unsigned long)dst & 15)
4591         {
4592             s = *src++;
4593             d = *dst;
4594
4595             *dst++ = pack_1x64_32 (
4596                 over_rev_non_pre_1x64 (
4597                     unpack_32_1x64 (s), unpack_32_1x64 (d)));
4598
4599             w--;
4600         }
4601
4602         /* call prefetch hint to optimize cache load*/
4603         cache_prefetch ((__m128i*)src);
4604         cache_prefetch ((__m128i*)dst);
4605
4606         while (w >= 4)
4607         {
4608             /* fill cache line with next memory */
4609             cache_prefetch_next ((__m128i*)src);
4610             cache_prefetch_next ((__m128i*)dst);
4611
4612             xmm_src_hi = load_128_unaligned ((__m128i*)src);
4613
4614             opaque = is_opaque (xmm_src_hi);
4615             zero = is_zero (xmm_src_hi);
4616
4617             unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
4618
4619             if (opaque)
4620             {
4621                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4622                                      &xmm_dst_lo, &xmm_dst_hi);
4623
4624                 save_128_aligned (
4625                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4626             }
4627             else if (!zero)
4628             {
4629                 xmm_dst_hi = load_128_aligned  ((__m128i*)dst);
4630
4631                 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
4632
4633                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4634                                         &xmm_dst_lo, &xmm_dst_hi);
4635
4636                 save_128_aligned (
4637                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4638             }
4639
4640             w -= 4;
4641             dst += 4;
4642             src += 4;
4643         }
4644
4645         while (w)
4646         {
4647             s = *src++;
4648             d = *dst;
4649
4650             *dst++ = pack_1x64_32 (
4651                 over_rev_non_pre_1x64 (
4652                     unpack_32_1x64 (s), unpack_32_1x64 (d)));
4653
4654             w--;
4655         }
4656     }
4657
4658     _mm_empty ();
4659 }
4660
4661 /* -------------------------------------------------------------------------------------------------
4662  * composite_over_n_8888_0565_ca
4663  */
4664
4665 static void
4666 sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
4667                                     pixman_op_t              op,
4668                                     pixman_image_t *         src_image,
4669                                     pixman_image_t *         mask_image,
4670                                     pixman_image_t *         dst_image,
4671                                     int32_t                  src_x,
4672                                     int32_t                  src_y,
4673                                     int32_t                  mask_x,
4674                                     int32_t                  mask_y,
4675                                     int32_t                  dest_x,
4676                                     int32_t                  dest_y,
4677                                     int32_t                  width,
4678                                     int32_t                  height)
4679 {
4680     uint32_t src;
4681     uint16_t    *dst_line, *dst, d;
4682     uint32_t    *mask_line, *mask, m;
4683     int dst_stride, mask_stride;
4684     int w;
4685     uint32_t pack_cmp;
4686
4687     __m128i xmm_src, xmm_alpha;
4688     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4689     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4690
4691     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
4692
4693     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4694
4695     if (src == 0)
4696         return;
4697
4698     PIXMAN_IMAGE_GET_LINE (
4699         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4700     PIXMAN_IMAGE_GET_LINE (
4701         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
4702
4703     xmm_src = expand_pixel_32_1x128 (src);
4704     xmm_alpha = expand_alpha_1x128 (xmm_src);
4705     mmx_src = _mm_movepi64_pi64 (xmm_src);
4706     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
4707
4708     while (height--)
4709     {
4710         w = width;
4711         mask = mask_line;
4712         dst = dst_line;
4713         mask_line += mask_stride;
4714         dst_line += dst_stride;
4715
4716         /* call prefetch hint to optimize cache load*/
4717         cache_prefetch ((__m128i*)mask);
4718         cache_prefetch ((__m128i*)dst);
4719
4720         while (w && ((unsigned long)dst & 15))
4721         {
4722             m = *(uint32_t *) mask;
4723
4724             if (m)
4725             {
4726                 d = *dst;
4727                 mmx_mask = unpack_32_1x64 (m);
4728                 mmx_dest = expand565_16_1x64 (d);
4729
4730                 *dst = pack_565_32_16 (
4731                     pack_1x64_32 (
4732                         in_over_1x64 (
4733                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4734             }
4735
4736             w--;
4737             dst++;
4738             mask++;
4739         }
4740
4741         /* call prefetch hint to optimize cache load*/
4742         cache_prefetch ((__m128i*)mask);
4743         cache_prefetch ((__m128i*)dst);
4744
4745         while (w >= 8)
4746         {
4747             /* fill cache line with next memory */
4748             cache_prefetch_next ((__m128i*)mask);
4749             cache_prefetch_next ((__m128i*)dst);
4750
4751             /* First round */
4752             xmm_mask = load_128_unaligned ((__m128i*)mask);
4753             xmm_dst = load_128_aligned ((__m128i*)dst);
4754
4755             pack_cmp = _mm_movemask_epi8 (
4756                 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4757
4758             unpack_565_128_4x128 (xmm_dst,
4759                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4760             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4761
4762             /* preload next round */
4763             xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
4764
4765             /* preload next round */
4766             if (pack_cmp != 0xffff)
4767             {
4768                 in_over_2x128 (&xmm_src, &xmm_src,
4769                                &xmm_alpha, &xmm_alpha,
4770                                &xmm_mask_lo, &xmm_mask_hi,
4771                                &xmm_dst0, &xmm_dst1);
4772             }
4773
4774             /* Second round */
4775             pack_cmp = _mm_movemask_epi8 (
4776                 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4777
4778             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4779
4780             if (pack_cmp != 0xffff)
4781             {
4782                 in_over_2x128 (&xmm_src, &xmm_src,
4783                                &xmm_alpha, &xmm_alpha,
4784                                &xmm_mask_lo, &xmm_mask_hi,
4785                                &xmm_dst2, &xmm_dst3);
4786             }
4787
4788             save_128_aligned (
4789                 (__m128i*)dst, pack_565_4x128_128 (
4790                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4791
4792             w -= 8;
4793             dst += 8;
4794             mask += 8;
4795         }
4796
4797         while (w)
4798         {
4799             m = *(uint32_t *) mask;
4800
4801             if (m)
4802             {
4803                 d = *dst;
4804                 mmx_mask = unpack_32_1x64 (m);
4805                 mmx_dest = expand565_16_1x64 (d);
4806
4807                 *dst = pack_565_32_16 (
4808                     pack_1x64_32 (
4809                         in_over_1x64 (
4810                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4811             }
4812
4813             w--;
4814             dst++;
4815             mask++;
4816         }
4817     }
4818
4819     _mm_empty ();
4820 }
4821
4822 /* -----------------------------------------------------------------------
4823  * composite_in_n_8_8
4824  */
4825
4826 static void
4827 sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
4828                          pixman_op_t              op,
4829                          pixman_image_t *         src_image,
4830                          pixman_image_t *         mask_image,
4831                          pixman_image_t *         dst_image,
4832                          int32_t                  src_x,
4833                          int32_t                  src_y,
4834                          int32_t                  mask_x,
4835                          int32_t                  mask_y,
4836                          int32_t                  dest_x,
4837                          int32_t                  dest_y,
4838                          int32_t                  width,
4839                          int32_t                  height)
4840 {
4841     uint8_t     *dst_line, *dst;
4842     uint8_t     *mask_line, *mask;
4843     int dst_stride, mask_stride;
4844     uint16_t w, d, m;
4845     uint32_t src;
4846     uint8_t sa;
4847
4848     __m128i xmm_alpha;
4849     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4850     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4851
4852     PIXMAN_IMAGE_GET_LINE (
4853         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4854     PIXMAN_IMAGE_GET_LINE (
4855         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4856
4857     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4858
4859     sa = src >> 24;
4860
4861     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4862
4863     while (height--)
4864     {
4865         dst = dst_line;
4866         dst_line += dst_stride;
4867         mask = mask_line;
4868         mask_line += mask_stride;
4869         w = width;
4870
4871         /* call prefetch hint to optimize cache load*/
4872         cache_prefetch ((__m128i*)mask);
4873         cache_prefetch ((__m128i*)dst);
4874
4875         while (w && ((unsigned long)dst & 15))
4876         {
4877             m = (uint32_t) *mask++;
4878             d = (uint32_t) *dst;
4879
4880             *dst++ = (uint8_t) pack_1x64_32 (
4881                 pix_multiply_1x64 (
4882                     pix_multiply_1x64 (_mm_movepi64_pi64 (xmm_alpha),
4883                                        unpack_32_1x64 (m)),
4884                     unpack_32_1x64 (d)));
4885             w--;
4886         }
4887
4888         /* call prefetch hint to optimize cache load*/
4889         cache_prefetch ((__m128i*)mask);
4890         cache_prefetch ((__m128i*)dst);
4891
4892         while (w >= 16)
4893         {
4894             /* fill cache line with next memory */
4895             cache_prefetch_next ((__m128i*)mask);
4896             cache_prefetch_next ((__m128i*)dst);
4897
4898             xmm_mask = load_128_unaligned ((__m128i*)mask);
4899             xmm_dst = load_128_aligned ((__m128i*)dst);
4900
4901             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4902             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4903
4904             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4905                                 &xmm_mask_lo, &xmm_mask_hi,
4906                                 &xmm_mask_lo, &xmm_mask_hi);
4907
4908             pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
4909                                 &xmm_dst_lo, &xmm_dst_hi,
4910                                 &xmm_dst_lo, &xmm_dst_hi);
4911
4912             save_128_aligned (
4913                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4914
4915             mask += 16;
4916             dst += 16;
4917             w -= 16;
4918         }
4919
4920         while (w)
4921         {
4922             m = (uint32_t) *mask++;
4923             d = (uint32_t) *dst;
4924
4925             *dst++ = (uint8_t) pack_1x64_32 (
4926                 pix_multiply_1x64 (
4927                     pix_multiply_1x64 (
4928                         _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
4929                     unpack_32_1x64 (d)));
4930             w--;
4931         }
4932     }
4933
4934     _mm_empty ();
4935 }
4936
4937 /* ---------------------------------------------------------------------------
4938  * composite_in_8_8
4939  */
4940
4941 static void
4942 sse2_composite_in_8_8 (pixman_implementation_t *imp,
4943                        pixman_op_t              op,
4944                        pixman_image_t *         src_image,
4945                        pixman_image_t *         mask_image,
4946                        pixman_image_t *         dst_image,
4947                        int32_t                  src_x,
4948                        int32_t                  src_y,
4949                        int32_t                  mask_x,
4950                        int32_t                  mask_y,
4951                        int32_t                  dest_x,
4952                        int32_t                  dest_y,
4953                        int32_t                  width,
4954                        int32_t                  height)
4955 {
4956     uint8_t     *dst_line, *dst;
4957     uint8_t     *src_line, *src;
4958     int src_stride, dst_stride;
4959     uint16_t w;
4960     uint32_t s, d;
4961
4962     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4963     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4964
4965     PIXMAN_IMAGE_GET_LINE (
4966         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4967     PIXMAN_IMAGE_GET_LINE (
4968         src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4969
4970     while (height--)
4971     {
4972         dst = dst_line;
4973         dst_line += dst_stride;
4974         src = src_line;
4975         src_line += src_stride;
4976         w = width;
4977
4978         /* call prefetch hint to optimize cache load*/
4979         cache_prefetch ((__m128i*)src);
4980         cache_prefetch ((__m128i*)dst);
4981
4982         while (w && ((unsigned long)dst & 15))
4983         {
4984             s = (uint32_t) *src++;
4985             d = (uint32_t) *dst;
4986
4987             *dst++ = (uint8_t) pack_1x64_32 (
4988                 pix_multiply_1x64 (
4989                     unpack_32_1x64 (s), unpack_32_1x64 (d)));
4990             w--;
4991         }
4992
4993         /* call prefetch hint to optimize cache load*/
4994         cache_prefetch ((__m128i*)src);
4995         cache_prefetch ((__m128i*)dst);
4996
4997         while (w >= 16)
4998         {
4999             /* fill cache line with next memory */
5000             cache_prefetch_next ((__m128i*)src);
5001             cache_prefetch_next ((__m128i*)dst);
5002
5003             xmm_src = load_128_unaligned ((__m128i*)src);
5004             xmm_dst = load_128_aligned ((__m128i*)dst);
5005
5006             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5007             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5008
5009             pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
5010                                 &xmm_dst_lo, &xmm_dst_hi,
5011                                 &xmm_dst_lo, &xmm_dst_hi);
5012
5013             save_128_aligned (
5014                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5015
5016             src += 16;
5017             dst += 16;
5018             w -= 16;
5019         }
5020
5021         while (w)
5022         {
5023             s = (uint32_t) *src++;
5024             d = (uint32_t) *dst;
5025
5026             *dst++ = (uint8_t) pack_1x64_32 (
5027                 pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
5028             w--;
5029         }
5030     }
5031
5032     _mm_empty ();
5033 }
5034
5035 /* -------------------------------------------------------------------------
5036  * composite_add_8888_8_8
5037  */
5038
5039 static void
5040 sse2_composite_add_8888_8_8 (pixman_implementation_t *imp,
5041                              pixman_op_t              op,
5042                              pixman_image_t *         src_image,
5043                              pixman_image_t *         mask_image,
5044                              pixman_image_t *         dst_image,
5045                              int32_t                  src_x,
5046                              int32_t                  src_y,
5047                              int32_t                  mask_x,
5048                              int32_t                  mask_y,
5049                              int32_t                  dest_x,
5050                              int32_t                  dest_y,
5051                              int32_t                  width,
5052                              int32_t                  height)
5053 {
5054     uint8_t     *dst_line, *dst;
5055     uint8_t     *mask_line, *mask;
5056     int dst_stride, mask_stride;
5057     uint16_t w;
5058     uint32_t src;
5059     uint8_t sa;
5060     uint32_t m, d;
5061
5062     __m128i xmm_alpha;
5063     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5064     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5065
5066     PIXMAN_IMAGE_GET_LINE (
5067         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
5068     PIXMAN_IMAGE_GET_LINE (
5069         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
5070
5071     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
5072
5073     sa = src >> 24;
5074
5075     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
5076
5077     while (height--)
5078     {
5079         dst = dst_line;
5080         dst_line += dst_stride;
5081         mask = mask_line;
5082         mask_line += mask_stride;
5083         w = width;
5084
5085         /* call prefetch hint to optimize cache load*/
5086         cache_prefetch ((__m128i*)mask);
5087         cache_prefetch ((__m128i*)dst);
5088
5089         while (w && ((unsigned long)dst & 15))
5090         {
5091             m = (uint32_t) *mask++;
5092             d = (uint32_t) *dst;
5093
5094             *dst++ = (uint8_t) pack_1x64_32 (
5095                 _mm_adds_pu16 (
5096                     pix_multiply_1x64 (
5097                         _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
5098                     unpack_32_1x64 (d)));
5099             w--;
5100         }
5101
5102         /* call prefetch hint to optimize cache load*/
5103         cache_prefetch ((__m128i*)mask);
5104         cache_prefetch ((__m128i*)dst);
5105
5106         while (w >= 16)
5107         {
5108             /* fill cache line with next memory */
5109             cache_prefetch_next ((__m128i*)mask);
5110             cache_prefetch_next ((__m128i*)dst);
5111
5112             xmm_mask = load_128_unaligned ((__m128i*)mask);
5113             xmm_dst = load_128_aligned ((__m128i*)dst);
5114
5115             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5116             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5117
5118             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
5119                                 &xmm_mask_lo, &xmm_mask_hi,
5120                                 &xmm_mask_lo, &xmm_mask_hi);
5121
5122             xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
5123             xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
5124
5125             save_128_aligned (
5126                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5127
5128             mask += 16;
5129             dst += 16;
5130             w -= 16;
5131         }
5132
5133         while (w)
5134         {
5135             m = (uint32_t) *mask++;
5136             d = (uint32_t) *dst;
5137
5138             *dst++ = (uint8_t) pack_1x64_32 (
5139                 _mm_adds_pu16 (
5140                     pix_multiply_1x64 (
5141                         _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
5142                     unpack_32_1x64 (d)));
5143
5144             w--;
5145         }
5146     }
5147
5148     _mm_empty ();
5149 }
5150
5151 /* ----------------------------------------------------------------------
5152  * composite_add_8000_8000
5153  */
5154
5155 static void
5156 sse2_composite_add_8000_8000 (pixman_implementation_t *imp,
5157                               pixman_op_t              op,
5158                               pixman_image_t *         src_image,
5159                               pixman_image_t *         mask_image,
5160                               pixman_image_t *         dst_image,
5161                               int32_t                  src_x,
5162                               int32_t                  src_y,
5163                               int32_t                  mask_x,
5164                               int32_t                  mask_y,
5165                               int32_t                  dest_x,
5166                               int32_t                  dest_y,
5167                               int32_t                  width,
5168                               int32_t                  height)
5169 {
5170     uint8_t     *dst_line, *dst;
5171     uint8_t     *src_line, *src;
5172     int dst_stride, src_stride;
5173     uint16_t w;
5174     uint16_t t;
5175
5176     PIXMAN_IMAGE_GET_LINE (
5177         src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
5178     PIXMAN_IMAGE_GET_LINE (
5179         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
5180
5181     while (height--)
5182     {
5183         dst = dst_line;
5184         src = src_line;
5185
5186         /* call prefetch hint to optimize cache load*/
5187         cache_prefetch ((__m128i*)src);
5188         cache_prefetch ((__m128i*)dst);
5189
5190         dst_line += dst_stride;
5191         src_line += src_stride;
5192         w = width;
5193
5194         /* Small head */
5195         while (w && (unsigned long)dst & 3)
5196         {
5197             t = (*dst) + (*src++);
5198             *dst++ = t | (0 - (t >> 8));
5199             w--;
5200         }
5201
5202         core_combine_add_u_sse2 ((uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
5203
5204         /* Small tail */
5205         dst += w & 0xfffc;
5206         src += w & 0xfffc;
5207
5208         w &= 3;
5209
5210         while (w)
5211         {
5212             t = (*dst) + (*src++);
5213             *dst++ = t | (0 - (t >> 8));
5214             w--;
5215         }
5216     }
5217
5218     _mm_empty ();
5219 }
5220
5221 /* ---------------------------------------------------------------------
5222  * composite_add_8888_8888
5223  */
5224 static void
5225 sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
5226                               pixman_op_t              op,
5227                               pixman_image_t *         src_image,
5228                               pixman_image_t *         mask_image,
5229                               pixman_image_t *         dst_image,
5230                               int32_t                  src_x,
5231                               int32_t                  src_y,
5232                               int32_t                  mask_x,
5233                               int32_t                  mask_y,
5234                               int32_t                  dest_x,
5235                               int32_t                  dest_y,
5236                               int32_t                  width,
5237                               int32_t                  height)
5238 {
5239     uint32_t    *dst_line, *dst;
5240     uint32_t    *src_line, *src;
5241     int dst_stride, src_stride;
5242
5243     PIXMAN_IMAGE_GET_LINE (
5244         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5245     PIXMAN_IMAGE_GET_LINE (
5246         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5247
5248     while (height--)
5249     {
5250         dst = dst_line;
5251         dst_line += dst_stride;
5252         src = src_line;
5253         src_line += src_stride;
5254
5255         core_combine_add_u_sse2 (dst, src, NULL, width);
5256     }
5257
5258     _mm_empty ();
5259 }
5260
5261 /* -------------------------------------------------------------------------------------------------
5262  * sse2_composite_copy_area
5263  */
5264
5265 static pixman_bool_t
5266 pixman_blt_sse2 (uint32_t *src_bits,
5267                  uint32_t *dst_bits,
5268                  int       src_stride,
5269                  int       dst_stride,
5270                  int       src_bpp,
5271                  int       dst_bpp,
5272                  int       src_x,
5273                  int       src_y,
5274                  int       dst_x,
5275                  int       dst_y,
5276                  int       width,
5277                  int       height)
5278 {
5279     uint8_t *   src_bytes;
5280     uint8_t *   dst_bytes;
5281     int byte_width;
5282
5283     if (src_bpp != dst_bpp)
5284         return FALSE;
5285
5286     if (src_bpp == 16)
5287     {
5288         src_stride = src_stride * (int) sizeof (uint32_t) / 2;
5289         dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
5290         src_bytes =(uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
5291         dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
5292         byte_width = 2 * width;
5293         src_stride *= 2;
5294         dst_stride *= 2;
5295     }
5296     else if (src_bpp == 32)
5297     {
5298         src_stride = src_stride * (int) sizeof (uint32_t) / 4;
5299         dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
5300         src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
5301         dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
5302         byte_width = 4 * width;
5303         src_stride *= 4;
5304         dst_stride *= 4;
5305     }
5306     else
5307     {
5308         return FALSE;
5309     }
5310
5311     cache_prefetch ((__m128i*)src_bytes);
5312     cache_prefetch ((__m128i*)dst_bytes);
5313
5314     while (height--)
5315     {
5316         int w;
5317         uint8_t *s = src_bytes;
5318         uint8_t *d = dst_bytes;
5319         src_bytes += src_stride;
5320         dst_bytes += dst_stride;
5321         w = byte_width;
5322
5323         cache_prefetch_next ((__m128i*)s);
5324         cache_prefetch_next ((__m128i*)d);
5325
5326         while (w >= 2 && ((unsigned long)d & 3))
5327         {
5328             *(uint16_t *)d = *(uint16_t *)s;
5329             w -= 2;
5330             s += 2;
5331             d += 2;
5332         }
5333
5334         while (w >= 4 && ((unsigned long)d & 15))
5335         {
5336             *(uint32_t *)d = *(uint32_t *)s;
5337
5338             w -= 4;
5339             s += 4;
5340             d += 4;
5341         }
5342
5343         cache_prefetch_next ((__m128i*)s);
5344         cache_prefetch_next ((__m128i*)d);
5345
5346         while (w >= 64)
5347         {
5348             __m128i xmm0, xmm1, xmm2, xmm3;
5349
5350             /* 128 bytes ahead */
5351             cache_prefetch (((__m128i*)s) + 8);
5352             cache_prefetch (((__m128i*)d) + 8);
5353
5354             xmm0 = load_128_unaligned ((__m128i*)(s));
5355             xmm1 = load_128_unaligned ((__m128i*)(s + 16));
5356             xmm2 = load_128_unaligned ((__m128i*)(s + 32));
5357             xmm3 = load_128_unaligned ((__m128i*)(s + 48));
5358
5359             save_128_aligned ((__m128i*)(d),    xmm0);
5360             save_128_aligned ((__m128i*)(d + 16), xmm1);
5361             save_128_aligned ((__m128i*)(d + 32), xmm2);
5362             save_128_aligned ((__m128i*)(d + 48), xmm3);
5363
5364             s += 64;
5365             d += 64;
5366             w -= 64;
5367         }
5368
5369         cache_prefetch_next ((__m128i*)s);
5370         cache_prefetch_next ((__m128i*)d);
5371
5372         while (w >= 16)
5373         {
5374             save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
5375
5376             w -= 16;
5377             d += 16;
5378             s += 16;
5379         }
5380
5381         cache_prefetch_next ((__m128i*)s);
5382         cache_prefetch_next ((__m128i*)d);
5383
5384         while (w >= 4)
5385         {
5386             *(uint32_t *)d = *(uint32_t *)s;
5387
5388             w -= 4;
5389             s += 4;
5390             d += 4;
5391         }
5392
5393         if (w >= 2)
5394         {
5395             *(uint16_t *)d = *(uint16_t *)s;
5396             w -= 2;
5397             s += 2;
5398             d += 2;
5399         }
5400     }
5401
5402     _mm_empty ();
5403
5404     return TRUE;
5405 }
5406
5407 static void
5408 sse2_composite_copy_area (pixman_implementation_t *imp,
5409                           pixman_op_t              op,
5410                           pixman_image_t *         src_image,
5411                           pixman_image_t *         mask_image,
5412                           pixman_image_t *         dst_image,
5413                           int32_t                  src_x,
5414                           int32_t                  src_y,
5415                           int32_t                  mask_x,
5416                           int32_t                  mask_y,
5417                           int32_t                  dest_x,
5418                           int32_t                  dest_y,
5419                           int32_t                  width,
5420                           int32_t                  height)
5421 {
5422     pixman_blt_sse2 (src_image->bits.bits,
5423                      dst_image->bits.bits,
5424                      src_image->bits.rowstride,
5425                      dst_image->bits.rowstride,
5426                      PIXMAN_FORMAT_BPP (src_image->bits.format),
5427                      PIXMAN_FORMAT_BPP (dst_image->bits.format),
5428                      src_x, src_y, dest_x, dest_y, width, height);
5429 }
5430
#if 0
/* This code was buggy in the MMX version, and the bug was carried over to
 * this SSE2 version, so the function is kept disabled.
 *
 * It composites a 32 bpp source whose alpha byte is forced to 0xff through
 * an 8-bit mask onto a 32 bpp destination: one pixel at a time until dst
 * is 16-byte aligned, four pixels at a time with SSE2, then one pixel at a
 * time for the remainder.
 *
 * The helper calls use the pointer-taking form of in_over_1x64 () and
 * in_over_2x128 () that the enabled functions in this file use, and the
 * per-row counter is an int32_t so a width above 65535 is not truncated.
 *
 * NOTE(review): the bug mentioned above is not identified here; confirm
 * the output against the general path before enabling this function.
 */
void
sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
                                 pixman_op_t              op,
                                 pixman_image_t *         src_image,
                                 pixman_image_t *         mask_image,
                                 pixman_image_t *         dst_image,
                                 int32_t                  src_x,
                                 int32_t                  src_y,
                                 int32_t                  mask_x,
                                 int32_t                  mask_y,
                                 int32_t                  dest_x,
                                 int32_t                  dest_y,
                                 int32_t                  width,
                                 int32_t                  height)
{
    uint32_t    *src, *src_line, s;
    uint32_t    *dst, *dst_line, d;
    uint8_t     *mask, *mask_line;
    uint32_t m;
    int src_stride, mask_stride, dst_stride;
    int32_t w;

    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

    /* Local copies of the "fully opaque" alpha constants so that their
     * addresses can be passed to the in_over helpers.
     */
    __m128i xmm_opaque = mask_00ff;
    __m64 mmx_opaque = mask_x00ff;

    PIXMAN_IMAGE_GET_LINE (
        dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
        src = src_line;
        src_line += src_stride;
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;

        w = width;

        /* call prefetch hint to optimize cache load */
        cache_prefetch ((__m128i*)src);
        cache_prefetch ((__m128i*)dst);
        cache_prefetch ((__m128i*)mask);

        /* Head: one pixel at a time until dst is 16-byte aligned. */
        while (w && ((unsigned long)dst & 15))
        {
            __m64 ms;

            s = 0xff000000 | *src++;
            m = (uint32_t) *mask++;
            d = *dst;

            ms = unpack_32_1x64 (s);

            if (m != 0xff)
            {
                __m64 ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
                __m64 md = unpack_32_1x64 (d);

                ms = in_over_1x64 (&ms, &mmx_opaque, &ma, &md);
            }

            *dst++ = pack_1x64_32 (ms);
            w--;
        }

        /* call prefetch hint to optimize cache load */
        cache_prefetch ((__m128i*)src);
        cache_prefetch ((__m128i*)dst);
        cache_prefetch ((__m128i*)mask);

        /* Body: four pixels per iteration. */
        while (w >= 4)
        {
            /* fill cache line with next memory */
            cache_prefetch_next ((__m128i*)src);
            cache_prefetch_next ((__m128i*)dst);
            cache_prefetch_next ((__m128i*)mask);

            /* four mask bytes read as one 32-bit word */
            m = *(uint32_t*) mask;
            xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);

            if (m == 0xffffffff)
            {
                /* all four mask bytes are 0xff: store the source as is */
                save_128_aligned ((__m128i*)dst, xmm_src);
            }
            else
            {
                xmm_dst = load_128_aligned ((__m128i*)dst);

                xmm_mask = _mm_unpacklo_epi16 (
                    unpack_32_1x128 (m), _mm_setzero_si128 ());

                unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
                unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
                unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

                expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
                                        &xmm_mask_lo, &xmm_mask_hi);

                in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
                               &xmm_opaque, &xmm_opaque,
                               &xmm_mask_lo, &xmm_mask_hi,
                               &xmm_dst_lo, &xmm_dst_hi);

                save_128_aligned (
                    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
            }

            src += 4;
            dst += 4;
            mask += 4;
            w -= 4;
        }

        /* Tail: remaining pixels, one at a time. */
        while (w)
        {
            m = (uint32_t) *mask++;

            if (m)
            {
                s = 0xff000000 | *src;

                if (m == 0xff)
                {
                    *dst = s;
                }
                else
                {
                    __m64 ma, md, ms;

                    d = *dst;

                    ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
                    md = unpack_32_1x64 (d);
                    ms = unpack_32_1x64 (s);

                    *dst = pack_1x64_32 (
                        in_over_1x64 (&ms, &mmx_opaque, &ma, &md));
                }

            }

            src++;
            dst++;
            w--;
        }
    }

    _mm_empty ();
}

#endif
5586
/*
 * Fast-path table of the SSE2 implementation.  sse2_composite() passes it
 * to _pixman_run_fast_path().
 *
 * Judging from the values, each entry reads: operator, source format, mask
 * format, destination format, composite routine, flags -- TODO confirm the
 * field order against the pixman_fast_path_t definition.
 *
 * The list ends with a { PIXMAN_OP_NONE } entry, presumably the terminator
 * that _pixman_run_fast_path() looks for (confirm in its implementation).
 */
5587 static const pixman_fast_path_t sse2_fast_paths[] =
5588 {
         /* PIXMAN_OP_OVER entries */
5589     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_r5g6b5,   sse2_composite_over_n_8_0565,       0 },
5590     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_b5g6r5,   sse2_composite_over_n_8_0565,       0 },
5591     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_null,     PIXMAN_a8r8g8b8, sse2_composite_over_n_8888,         0 },
5592     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_null,     PIXMAN_x8r8g8b8, sse2_composite_over_n_8888,         0 },
5593     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_null,     PIXMAN_r5g6b5,   sse2_composite_over_n_0565,         0 },
5594     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_a8r8g8b8, sse2_composite_over_8888_8888,      0 },
5595     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_x8r8g8b8, sse2_composite_over_8888_8888,      0 },
5596     { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_a8b8g8r8, sse2_composite_over_8888_8888,      0 },
5597     { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_x8b8g8r8, sse2_composite_over_8888_8888,      0 },
5598     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_r5g6b5,   sse2_composite_over_8888_0565,      0 },
5599     { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_b5g6r5,   sse2_composite_over_8888_0565,      0 },
5600     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8r8g8b8, sse2_composite_over_n_8_8888,       0 },
5601     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8r8g8b8, sse2_composite_over_n_8_8888,       0 },
5602     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8b8g8r8, sse2_composite_over_n_8_8888,       0 },
5603     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8b8g8r8, sse2_composite_over_n_8_8888,       0 },
         /* Compiled out: the only entries in this table that reference
          * sse2_composite_over_x888_8_8888. */
5604 #if 0
5605     /* FIXME: This code is buggy in the MMX version; the bug was carried over to the SSE2 version */
5606     { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8,       PIXMAN_x8r8g8b8, sse2_composite_over_x888_8_8888,    0 },
5607     { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8,       PIXMAN_a8r8g8b8, sse2_composite_over_x888_8_8888,    0 },
5608     { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8,       PIXMAN_x8b8g8r8, sse2_composite_over_x888_8_8888,    0 },
5609     { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8,       PIXMAN_a8b8g8r8, sse2_composite_over_x888_8_8888,    0 },
5610 #endif
         /* Entries flagged NEED_SOLID_MASK */
5611     { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8,       PIXMAN_a8r8g8b8, sse2_composite_over_x888_n_8888,    NEED_SOLID_MASK },
5612     { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8,       PIXMAN_x8r8g8b8, sse2_composite_over_x888_n_8888,    NEED_SOLID_MASK },
5613     { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8,       PIXMAN_a8b8g8r8, sse2_composite_over_x888_n_8888,    NEED_SOLID_MASK },
5614     { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8,       PIXMAN_x8b8g8r8, sse2_composite_over_x888_n_8888,    NEED_SOLID_MASK },
5615     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,       PIXMAN_a8r8g8b8, sse2_composite_over_8888_n_8888,    NEED_SOLID_MASK },
5616     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,       PIXMAN_x8r8g8b8, sse2_composite_over_8888_n_8888,    NEED_SOLID_MASK },
5617     { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8,       PIXMAN_a8b8g8r8, sse2_composite_over_8888_n_8888,    NEED_SOLID_MASK },
5618     { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8,       PIXMAN_x8b8g8r8, sse2_composite_over_8888_n_8888,    NEED_SOLID_MASK },
         /* Entries flagged NEED_COMPONENT_ALPHA */
5619     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
5620     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
5621     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
5622     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
5623     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8r8g8b8, PIXMAN_r5g6b5,   sse2_composite_over_n_8888_0565_ca, NEED_COMPONENT_ALPHA },
5624     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8b8g8r8, PIXMAN_b5g6r5,   sse2_composite_over_n_8888_0565_ca, NEED_COMPONENT_ALPHA },
         /* Entries flagged NEED_PIXBUF */
5625     { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
5626     { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_a8r8g8b8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
5627     { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
5628     { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_x8r8g8b8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
5629     { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_a8b8g8r8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
5630     { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
5631     { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_x8b8g8r8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
5632     { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
5633     { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_r5g6b5,   sse2_composite_over_pixbuf_0565,    NEED_PIXBUF },
5634     { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_r5g6b5,   sse2_composite_over_pixbuf_0565,    NEED_PIXBUF },
5635     { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_b5g6r5,   sse2_composite_over_pixbuf_0565,    NEED_PIXBUF },
5636     { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_b5g6r5,   sse2_composite_over_pixbuf_0565,    NEED_PIXBUF },
         /* OVER between identical x8 formats is routed to the plain copy routine */
5637     { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_null,     PIXMAN_x8r8g8b8, sse2_composite_copy_area,           0 },
5638     { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_null,     PIXMAN_x8b8g8r8, sse2_composite_copy_area,           0 },
5639
         /* PIXMAN_OP_ADD entries */
5640     { PIXMAN_OP_ADD,  PIXMAN_solid,    PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_add_n_8888_8888_ca,  NEED_COMPONENT_ALPHA },
5641     { PIXMAN_OP_ADD,  PIXMAN_a8,       PIXMAN_null,     PIXMAN_a8,       sse2_composite_add_8000_8000,       0 },
5642     { PIXMAN_OP_ADD,  PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_a8r8g8b8, sse2_composite_add_8888_8888,       0 },
5643     { PIXMAN_OP_ADD,  PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_a8b8g8r8, sse2_composite_add_8888_8888,       0 },
5644     { PIXMAN_OP_ADD,  PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8,       sse2_composite_add_8888_8_8,        0 },
5645
         /* PIXMAN_OP_SRC entries; all unmasked ones use sse2_composite_copy_area */
5646     { PIXMAN_OP_SRC, PIXMAN_solid,     PIXMAN_a8,       PIXMAN_a8r8g8b8, sse2_composite_src_n_8_8888,        0 },
5647     { PIXMAN_OP_SRC, PIXMAN_solid,     PIXMAN_a8,       PIXMAN_x8r8g8b8, sse2_composite_src_n_8_8888,        0 },
5648     { PIXMAN_OP_SRC, PIXMAN_solid,     PIXMAN_a8,       PIXMAN_a8b8g8r8, sse2_composite_src_n_8_8888,        0 },
5649     { PIXMAN_OP_SRC, PIXMAN_solid,     PIXMAN_a8,       PIXMAN_x8b8g8r8, sse2_composite_src_n_8_8888,        0 },
5650     { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8,  PIXMAN_null,     PIXMAN_a8r8g8b8, sse2_composite_copy_area,           0 },
5651     { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8,  PIXMAN_null,     PIXMAN_a8b8g8r8, sse2_composite_copy_area,           0 },
5652     { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8,  PIXMAN_null,     PIXMAN_x8r8g8b8, sse2_composite_copy_area,           0 },
5653     { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8,  PIXMAN_null,     PIXMAN_x8b8g8r8, sse2_composite_copy_area,           0 },
5654     { PIXMAN_OP_SRC, PIXMAN_x8r8g8b8,  PIXMAN_null,     PIXMAN_x8r8g8b8, sse2_composite_copy_area,           0 },
5655     { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8,  PIXMAN_null,     PIXMAN_x8b8g8r8, sse2_composite_copy_area,           0 },
5656     { PIXMAN_OP_SRC, PIXMAN_r5g6b5,    PIXMAN_null,     PIXMAN_r5g6b5,   sse2_composite_copy_area,           0 },
5657     { PIXMAN_OP_SRC, PIXMAN_b5g6r5,    PIXMAN_null,     PIXMAN_b5g6r5,   sse2_composite_copy_area,           0 },
5658
         /* PIXMAN_OP_IN entries */
5659     { PIXMAN_OP_IN,  PIXMAN_a8,        PIXMAN_null,     PIXMAN_a8,       sse2_composite_in_8_8,              0 },
5660     { PIXMAN_OP_IN,  PIXMAN_solid,     PIXMAN_a8,       PIXMAN_a8,       sse2_composite_in_n_8_8,            0 },
5661
         /* End-of-table entry; must stay last */
5662     { PIXMAN_OP_NONE },
5663 };
5664
5665 /*
5666  * Work around GCC bug causing crashes in Mozilla with SSE2
5667  *
5668  * When using -msse, gcc generates movdqa instructions assuming that
5669  * the stack is 16 byte aligned. Unfortunately some applications, such
5670  * as Mozilla and Mono, end up aligning the stack to 4 bytes, which
5671  * causes the movdqa instructions to fail.
5672  *
5673  * The __force_align_arg_pointer__ makes gcc generate a prologue that
5674  * realigns the stack pointer to 16 bytes.
5675  *
5676  * On x86-64 this is not necessary because the standard ABI already
5677  * calls for a 16 byte aligned stack.
5678  *
5679  * See https://bugs.freedesktop.org/show_bug.cgi?id=15693
5680  */
/*
 * Composite entry point installed as imp->composite by
 * _pixman_implementation_create_sse2().
 *
 * First offers the request to the routines listed in sse2_fast_paths via
 * _pixman_run_fast_path().  If that call reports that it handled the
 * request (non-zero result) nothing else is done; otherwise the same
 * arguments are forwarded unchanged to imp->delegate through
 * _pixman_implementation_composite().
 *
 * On GCC builds for targets other than x86-64 the function carries
 * __force_align_arg_pointer__ (see the #if just below).
 */
5681 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5682 __attribute__((__force_align_arg_pointer__))
5683 #endif
5684 static void
5685 sse2_composite (pixman_implementation_t *imp,
5686                 pixman_op_t              op,
5687                 pixman_image_t *         src,
5688                 pixman_image_t *         mask,
5689                 pixman_image_t *         dest,
5690                 int32_t                  src_x,
5691                 int32_t                  src_y,
5692                 int32_t                  mask_x,
5693                 int32_t                  mask_y,
5694                 int32_t                  dest_x,
5695                 int32_t                  dest_y,
5696                 int32_t                  width,
5697                 int32_t                  height)
5698 {
         /* Try the SSE2 fast paths first. */
5699     if (_pixman_run_fast_path (sse2_fast_paths, imp,
5700                                op, src, mask, dest,
5701                                src_x, src_y,
5702                                mask_x, mask_y,
5703                                dest_x, dest_y,
5704                                width, height))
5705     {
5706         return;
5707     }
5708
         /* No fast path took the request: hand it to the delegate. */
5709     _pixman_implementation_composite (imp->delegate, op,
5710                                       src, mask, dest,
5711                                       src_x, src_y,
5712                                       mask_x, mask_y,
5713                                       dest_x, dest_y,
5714                                       width, height);
5715 }
5716
/*
 * Blit entry point installed as imp->blt by
 * _pixman_implementation_create_sse2().
 *
 * Calls pixman_blt_sse2() with the given source/destination description.
 * If that returns false, the identical arguments are passed to
 * _pixman_implementation_blt() on imp->delegate and its result is returned.
 * If pixman_blt_sse2() succeeds, TRUE is returned.
 *
 * On GCC builds for targets other than x86-64 the function carries
 * __force_align_arg_pointer__ (see the #if just below).
 */
5717 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5718 __attribute__((__force_align_arg_pointer__))
5719 #endif
5720 static pixman_bool_t
5721 sse2_blt (pixman_implementation_t *imp,
5722           uint32_t *               src_bits,
5723           uint32_t *               dst_bits,
5724           int                      src_stride,
5725           int                      dst_stride,
5726           int                      src_bpp,
5727           int                      dst_bpp,
5728           int                      src_x,
5729           int                      src_y,
5730           int                      dst_x,
5731           int                      dst_y,
5732           int                      width,
5733           int                      height)
5734 {
5735     if (!pixman_blt_sse2 (
5736             src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
5737             src_x, src_y, dst_x, dst_y, width, height))
5738
5739     {
             /* The SSE2 blitter declined the request; let the delegate decide. */
5740         return _pixman_implementation_blt (
5741             imp->delegate,
5742             src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
5743             src_x, src_y, dst_x, dst_y, width, height);
5744     }
5745
5746     return TRUE;
5747 }
5748
/*
 * Fill entry point installed as imp->fill by
 * _pixman_implementation_create_sse2().
 *
 * Calls pixman_fill_sse2() with the given arguments.  If that returns
 * false, the identical arguments are passed to
 * _pixman_implementation_fill() on imp->delegate and its result is
 * returned.  If pixman_fill_sse2() succeeds, TRUE is returned.
 *
 * On GCC builds for targets other than x86-64 the function carries
 * __force_align_arg_pointer__ (see the #if just below).
 */
5749 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5750 __attribute__((__force_align_arg_pointer__))
5751 #endif
5752 static pixman_bool_t
5753 sse2_fill (pixman_implementation_t *imp,
5754            uint32_t *               bits,
5755            int                      stride,
5756            int                      bpp,
5757            int                      x,
5758            int                      y,
5759            int                      width,
5760            int                      height,
5761            uint32_t xor)
5762 {
5763     if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
5764     {
             /* The SSE2 fill declined the request; let the delegate decide. */
5765         return _pixman_implementation_fill (
5766             imp->delegate, bits, stride, bpp, x, y, width, height, xor);
5767     }
5768
5769     return TRUE;
5770 }
5771
/*
 * Builds and returns the SSE2 pixman implementation object.
 *
 * Steps, in order:
 *   1. create the MMX implementation and pass it to
 *      _pixman_implementation_create() (presumably it becomes
 *      imp->delegate, the fallback used by sse2_composite(), sse2_blt()
 *      and sse2_fill() -- TODO confirm in _pixman_implementation_create);
 *   2. initialise the mask_* constants assigned below (they are not
 *      declared in this function, so they live at a wider scope);
 *   3. call _mm_empty ();
 *   4. install the SSE2 combiners and the composite/blt/fill entry points.
 *
 * NOTE(review): neither creation call is checked for failure before imp is
 * dereferenced -- confirm whether those functions can return NULL.
 *
 * On GCC builds for targets other than x86-64 the function carries
 * __force_align_arg_pointer__ (see the #if just below).
 */
5772 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5773 __attribute__((__force_align_arg_pointer__))
5774 #endif
5775 pixman_implementation_t *
5776 _pixman_implementation_create_sse2 (void)
5777 {
5778     pixman_implementation_t *mmx = _pixman_implementation_create_mmx ();
5779     pixman_implementation_t *imp = _pixman_implementation_create (mmx);
5780
5781     /* SSE2 constants */
         /* Note: mask_565_r and mask_red are given the same value. */
5782     mask_565_r  = create_mask_2x32_128 (0x00f80000, 0x00f80000);
5783     mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
5784     mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
5785     mask_565_b  = create_mask_2x32_128 (0x0000001f, 0x0000001f);
5786     mask_red   = create_mask_2x32_128 (0x00f80000, 0x00f80000);
5787     mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
5788     mask_blue  = create_mask_2x32_128 (0x000000f8, 0x000000f8);
5789     mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
5790     mask_565_fix_g = create_mask_2x32_128  (0x0000c000, 0x0000c000);
5791     mask_0080 = create_mask_16_128 (0x0080);
5792     mask_00ff = create_mask_16_128 (0x00ff);
5793     mask_0101 = create_mask_16_128 (0x0101);
5794     mask_ffff = create_mask_16_128 (0xffff);
5795     mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
5796     mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
5797
5798     /* MMX constants */
5799     mask_x565_rgb = create_mask_2x32_64 (0x000001f0, 0x003f001f);
5800     mask_x565_unpack = create_mask_2x32_64 (0x00000084, 0x04100840);
5801
5802     mask_x0080 = create_mask_16_64 (0x0080);
5803     mask_x00ff = create_mask_16_64 (0x00ff);
5804     mask_x0101 = create_mask_16_64 (0x0101);
5805     mask_x_alpha = create_mask_2x32_64 (0x00ff0000, 0x00000000);
5806
         /* NOTE(review): presumably needed because the *_64 helpers above
          * work with MMX values -- confirm against their definitions. */
5807     _mm_empty ();
5808
5809     /* Set up function pointers */
5810
5811     /* SSE code patch for fbcompose.c */
         /* combine_32 table: the "_u" combiners.  No entry is assigned for
          * PIXMAN_OP_SRC here; slots not written keep whatever
          * _pixman_implementation_create() left in them. */
5812     imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
5813     imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
5814     imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
5815     imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
5816     imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
5817     imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
5818     imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
5819     imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
5820     imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
5821     imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
5822
5823     imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
5824
         /* combine_32_ca table: the "_ca" combiners (presumably the
          * component-alpha variants -- confirm).  Unlike combine_32 it gets a
          * PIXMAN_OP_SRC entry and no PIXMAN_OP_SATURATE entry. */
5825     imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
5826     imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
5827     imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
5828     imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
5829     imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
5830     imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
5831     imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
5832     imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
5833     imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
5834     imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
5835     imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
5836
         /* Entry points defined earlier in this file. */
5837     imp->composite = sse2_composite;
5838     imp->blt = sse2_blt;
5839     imp->fill = sse2_fill;
5840
5841     return imp;
5842 }
5843
5844 #endif /* USE_SSE2 */