Add sse2 version of add_n_8888_8888()
[profile/ivi/pixman.git] / pixman / pixman-sse2.c
1 /*
2  * Copyright © 2008 Rodrigo Kumpera
3  * Copyright © 2008 André Tupinambá
4  *
5  * Permission to use, copy, modify, distribute, and sell this software and its
6  * documentation for any purpose is hereby granted without fee, provided that
7  * the above copyright notice appear in all copies and that both that
8  * copyright notice and this permission notice appear in supporting
9  * documentation, and that the name of Red Hat not be used in advertising or
10  * publicity pertaining to distribution of the software without specific,
11  * written prior permission.  Red Hat makes no representations about the
12  * suitability of this software for any purpose.  It is provided "as is"
13  * without express or implied warranty.
14  *
15  * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
16  * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
18  * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
20  * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
21  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
22  * SOFTWARE.
23  *
24  * Author:  Rodrigo Kumpera (kumpera@gmail.com)
25  *          André Tupinambá (andrelrt@gmail.com)
26  *
27  * Based on work by Owen Taylor and Søren Sandmann
28  */
29 #ifdef HAVE_CONFIG_H
30 #include <config.h>
31 #endif
32
33 #include <mmintrin.h>
34 #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
35 #include <emmintrin.h> /* for SSE2 intrinsics */
36 #include "pixman-private.h"
37 #include "pixman-combine32.h"
38
39 #if defined(_MSC_VER) && defined(_M_AMD64)
40 /* Windows 64 doesn't allow MMX to be used, so
41  * the pixman-x64-mmx-emulation.h file contains
42  * implementations of those MMX intrinsics that
43  * are used in the SSE2 implementation.
44  */
45 #   include "pixman-x64-mmx-emulation.h"
46 #endif
47
48 #ifdef USE_SSE2
49
50 /* --------------------------------------------------------------------
51  * Locals
52  */
53
54 static __m64 mask_x0080;
55 static __m64 mask_x00ff;
56 static __m64 mask_x0101;
57 static __m64 mask_x_alpha;
58
59 static __m64 mask_x565_rgb;
60 static __m64 mask_x565_unpack;
61
62 static __m128i mask_0080;
63 static __m128i mask_00ff;
64 static __m128i mask_0101;
65 static __m128i mask_ffff;
66 static __m128i mask_ff000000;
67 static __m128i mask_alpha;
68
69 static __m128i mask_565_r;
70 static __m128i mask_565_g1, mask_565_g2;
71 static __m128i mask_565_b;
72 static __m128i mask_red;
73 static __m128i mask_green;
74 static __m128i mask_blue;
75
76 static __m128i mask_565_fix_rb;
77 static __m128i mask_565_fix_g;
78
79 /* ----------------------------------------------------------------------
80  * SSE2 Inlines
81  */
82 static force_inline __m128i
83 unpack_32_1x128 (uint32_t data)
84 {
85     return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
86 }
87
88 static force_inline void
89 unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
90 {
91     *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
92     *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
93 }
94
95 static force_inline __m128i
96 unpack_565_to_8888 (__m128i lo)
97 {
98     __m128i r, g, b, rb, t;
99
100     r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
101     g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
102     b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
103
104     rb = _mm_or_si128 (r, b);
105     t  = _mm_and_si128 (rb, mask_565_fix_rb);
106     t  = _mm_srli_epi32 (t, 5);
107     rb = _mm_or_si128 (rb, t);
108
109     t  = _mm_and_si128 (g, mask_565_fix_g);
110     t  = _mm_srli_epi32 (t, 6);
111     g  = _mm_or_si128 (g, t);
112
113     return _mm_or_si128 (rb, g);
114 }
115
116 static force_inline void
117 unpack_565_128_4x128 (__m128i  data,
118                       __m128i* data0,
119                       __m128i* data1,
120                       __m128i* data2,
121                       __m128i* data3)
122 {
123     __m128i lo, hi;
124
125     lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
126     hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
127
128     lo = unpack_565_to_8888 (lo);
129     hi = unpack_565_to_8888 (hi);
130
131     unpack_128_2x128 (lo, data0, data1);
132     unpack_128_2x128 (hi, data2, data3);
133 }
134
135 static force_inline uint16_t
136 pack_565_32_16 (uint32_t pixel)
137 {
138     return (uint16_t) (((pixel >> 8) & 0xf800) |
139                        ((pixel >> 5) & 0x07e0) |
140                        ((pixel >> 3) & 0x001f));
141 }
142
143 static force_inline __m128i
144 pack_2x128_128 (__m128i lo, __m128i hi)
145 {
146     return _mm_packus_epi16 (lo, hi);
147 }
148
149 static force_inline __m128i
150 pack_565_2x128_128 (__m128i lo, __m128i hi)
151 {
152     __m128i data;
153     __m128i r, g1, g2, b;
154
155     data = pack_2x128_128 (lo, hi);
156
157     r  = _mm_and_si128 (data, mask_565_r);
158     g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
159     g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
160     b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);
161
162     return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
163 }
164
165 static force_inline __m128i
166 pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
167 {
168     return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
169                              pack_565_2x128_128 (*xmm2, *xmm3));
170 }
171
172 static force_inline int
173 is_opaque (__m128i x)
174 {
175     __m128i ffs = _mm_cmpeq_epi8 (x, x);
176
177     return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
178 }
179
180 static force_inline int
181 is_zero (__m128i x)
182 {
183     return _mm_movemask_epi8 (
184         _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
185 }
186
187 static force_inline int
188 is_transparent (__m128i x)
189 {
190     return (_mm_movemask_epi8 (
191                 _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
192 }
193
194 static force_inline __m128i
195 expand_pixel_32_1x128 (uint32_t data)
196 {
197     return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
198 }
199
200 static force_inline __m128i
201 expand_alpha_1x128 (__m128i data)
202 {
203     return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
204                                                      _MM_SHUFFLE (3, 3, 3, 3)),
205                                 _MM_SHUFFLE (3, 3, 3, 3));
206 }
207
208 static force_inline void
209 expand_alpha_2x128 (__m128i  data_lo,
210                     __m128i  data_hi,
211                     __m128i* alpha_lo,
212                     __m128i* alpha_hi)
213 {
214     __m128i lo, hi;
215
216     lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
217     hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
218
219     *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
220     *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
221 }
222
223 static force_inline void
224 expand_alpha_rev_2x128 (__m128i  data_lo,
225                         __m128i  data_hi,
226                         __m128i* alpha_lo,
227                         __m128i* alpha_hi)
228 {
229     __m128i lo, hi;
230
231     lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
232     hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
233     *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
234     *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
235 }
236
237 static force_inline void
238 pix_multiply_2x128 (__m128i* data_lo,
239                     __m128i* data_hi,
240                     __m128i* alpha_lo,
241                     __m128i* alpha_hi,
242                     __m128i* ret_lo,
243                     __m128i* ret_hi)
244 {
245     __m128i lo, hi;
246
247     lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
248     hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
249     lo = _mm_adds_epu16 (lo, mask_0080);
250     hi = _mm_adds_epu16 (hi, mask_0080);
251     *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
252     *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
253 }
254
255 static force_inline void
256 pix_add_multiply_2x128 (__m128i* src_lo,
257                         __m128i* src_hi,
258                         __m128i* alpha_dst_lo,
259                         __m128i* alpha_dst_hi,
260                         __m128i* dst_lo,
261                         __m128i* dst_hi,
262                         __m128i* alpha_src_lo,
263                         __m128i* alpha_src_hi,
264                         __m128i* ret_lo,
265                         __m128i* ret_hi)
266 {
267     __m128i lo, hi;
268     __m128i mul_lo, mul_hi;
269
270     lo = _mm_mullo_epi16 (*src_lo, *alpha_dst_lo);
271     hi = _mm_mullo_epi16 (*src_hi, *alpha_dst_hi);
272     mul_lo = _mm_mullo_epi16 (*dst_lo, *alpha_src_lo);
273     mul_hi = _mm_mullo_epi16 (*dst_hi, *alpha_src_hi);
274     lo = _mm_adds_epu16 (lo, mask_0080);
275     hi = _mm_adds_epu16 (hi, mask_0080);
276     lo = _mm_adds_epu16 (lo, mul_lo);
277     hi = _mm_adds_epu16 (hi, mul_hi);
278     *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
279     *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
280 }
281
282 static force_inline void
283 negate_2x128 (__m128i  data_lo,
284               __m128i  data_hi,
285               __m128i* neg_lo,
286               __m128i* neg_hi)
287 {
288     *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
289     *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
290 }
291
292 static force_inline void
293 invert_colors_2x128 (__m128i  data_lo,
294                      __m128i  data_hi,
295                      __m128i* inv_lo,
296                      __m128i* inv_hi)
297 {
298     __m128i lo, hi;
299
300     lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
301     hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
302     *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
303     *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
304 }
305
306 static force_inline void
307 over_2x128 (__m128i* src_lo,
308             __m128i* src_hi,
309             __m128i* alpha_lo,
310             __m128i* alpha_hi,
311             __m128i* dst_lo,
312             __m128i* dst_hi)
313 {
314     __m128i t1, t2;
315
316     negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);
317
318     pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
319
320     *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
321     *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
322 }
323
324 static force_inline void
325 over_rev_non_pre_2x128 (__m128i  src_lo,
326                         __m128i  src_hi,
327                         __m128i* dst_lo,
328                         __m128i* dst_hi)
329 {
330     __m128i lo, hi;
331     __m128i alpha_lo, alpha_hi;
332
333     expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
334
335     lo = _mm_or_si128 (alpha_lo, mask_alpha);
336     hi = _mm_or_si128 (alpha_hi, mask_alpha);
337
338     invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);
339
340     pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);
341
342     over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
343 }
344
345 static force_inline void
346 in_over_2x128 (__m128i* src_lo,
347                __m128i* src_hi,
348                __m128i* alpha_lo,
349                __m128i* alpha_hi,
350                __m128i* mask_lo,
351                __m128i* mask_hi,
352                __m128i* dst_lo,
353                __m128i* dst_hi)
354 {
355     __m128i s_lo, s_hi;
356     __m128i a_lo, a_hi;
357
358     pix_multiply_2x128 (src_lo,   src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
359     pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
360
361     over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
362 }
363
364 static force_inline void
365 cache_prefetch (__m128i* addr)
366 {
367     _mm_prefetch (addr, _MM_HINT_T0);
368 }
369
370 static force_inline void
371 cache_prefetch_next (__m128i* addr)
372 {
373     _mm_prefetch (addr + 4, _MM_HINT_T0); /* 64 bytes ahead */
374 }
375
376 /* load 4 pixels from a 16-byte boundary aligned address */
377 static force_inline __m128i
378 load_128_aligned (__m128i* src)
379 {
380     return _mm_load_si128 (src);
381 }
382
383 /* load 4 pixels from a unaligned address */
384 static force_inline __m128i
385 load_128_unaligned (const __m128i* src)
386 {
387     return _mm_loadu_si128 (src);
388 }
389
390 /* save 4 pixels using Write Combining memory on a 16-byte
391  * boundary aligned address
392  */
393 static force_inline void
394 save_128_write_combining (__m128i* dst,
395                           __m128i  data)
396 {
397     _mm_stream_si128 (dst, data);
398 }
399
400 /* save 4 pixels on a 16-byte boundary aligned address */
401 static force_inline void
402 save_128_aligned (__m128i* dst,
403                   __m128i  data)
404 {
405     _mm_store_si128 (dst, data);
406 }
407
408 /* save 4 pixels on a unaligned address */
409 static force_inline void
410 save_128_unaligned (__m128i* dst,
411                     __m128i  data)
412 {
413     _mm_storeu_si128 (dst, data);
414 }
415
416 /* ------------------------------------------------------------------
417  * MMX inlines
418  */
419
420 static force_inline __m64
421 unpack_32_1x64 (uint32_t data)
422 {
423     return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (data), _mm_setzero_si64 ());
424 }
425
426 static force_inline __m64
427 expand_alpha_1x64 (__m64 data)
428 {
429     return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 3, 3, 3));
430 }
431
432 static force_inline __m64
433 expand_alpha_rev_1x64 (__m64 data)
434 {
435     return _mm_shuffle_pi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
436 }
437
438 static force_inline __m64
439 expand_pixel_8_1x64 (uint8_t data)
440 {
441     return _mm_shuffle_pi16 (
442         unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
443 }
444
445 static force_inline __m64
446 pix_multiply_1x64 (__m64 data,
447                    __m64 alpha)
448 {
449     return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (data, alpha),
450                                           mask_x0080),
451                            mask_x0101);
452 }
453
454 static force_inline __m64
455 pix_add_multiply_1x64 (__m64* src,
456                        __m64* alpha_dst,
457                        __m64* dst,
458                        __m64* alpha_src)
459 {
460     return _mm_mulhi_pu16 (
461         _mm_adds_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (*src, *alpha_dst),
462                                       mask_x0080),
463                        _mm_mullo_pi16 (*dst, *alpha_src)),
464         mask_x0101);
465 }
466
467 static force_inline __m64
468 negate_1x64 (__m64 data)
469 {
470     return _mm_xor_si64 (data, mask_x00ff);
471 }
472
473 static force_inline __m64
474 invert_colors_1x64 (__m64 data)
475 {
476     return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
477 }
478
479 static force_inline __m64
480 over_1x64 (__m64 src, __m64 alpha, __m64 dst)
481 {
482     return _mm_adds_pu8 (src, pix_multiply_1x64 (dst, negate_1x64 (alpha)));
483 }
484
485 static force_inline __m64
486 in_over_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst)
487 {
488     return over_1x64 (pix_multiply_1x64 (*src, *mask),
489                       pix_multiply_1x64 (*alpha, *mask),
490                       *dst);
491 }
492
493 static force_inline __m64
494 over_rev_non_pre_1x64 (__m64 src, __m64 dst)
495 {
496     __m64 alpha = expand_alpha_1x64 (src);
497
498     return over_1x64 (pix_multiply_1x64 (invert_colors_1x64 (src),
499                                          _mm_or_si64 (alpha, mask_x_alpha)),
500                       alpha,
501                       dst);
502 }
503
504 static force_inline uint32_t
505 pack_1x64_32 (__m64 data)
506 {
507     return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64 ()));
508 }
509
510 /* Expand 16 bits positioned at @pos (0-3) of a mmx register into
511  *
512  *    00RR00GG00BB
513  *
514  * --- Expanding 565 in the low word ---
515  *
516  * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
517  * m = m & (01f0003f001f);
518  * m = m * (008404100840);
519  * m = m >> 8;
520  *
521  * Note the trick here - the top word is shifted by another nibble to
522  * avoid it bumping into the middle word
523  */
524 static force_inline __m64
525 expand565_16_1x64 (uint16_t pixel)
526 {
527     __m64 p;
528     __m64 t1, t2;
529
530     p = _mm_cvtsi32_si64 ((uint32_t) pixel);
531
532     t1 = _mm_slli_si64 (p, 36 - 11);
533     t2 = _mm_slli_si64 (p, 16 - 5);
534
535     p = _mm_or_si64 (t1, p);
536     p = _mm_or_si64 (t2, p);
537     p = _mm_and_si64 (p, mask_x565_rgb);
538     p = _mm_mullo_pi16 (p, mask_x565_unpack);
539
540     return _mm_srli_pi16 (p, 8);
541 }
542
543 /* ----------------------------------------------------------------------------
544  * Compose Core transformations
545  */
546 static force_inline uint32_t
547 core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
548 {
549     uint8_t a;
550     __m64 ms;
551
552     a = src >> 24;
553
554     if (a == 0xff)
555     {
556         return src;
557     }
558     else if (src)
559     {
560         ms = unpack_32_1x64 (src);
561         return pack_1x64_32 (
562             over_1x64 (ms, expand_alpha_1x64 (ms), unpack_32_1x64 (dst)));
563     }
564
565     return dst;
566 }
567
568 static force_inline uint32_t
569 combine1 (const uint32_t *ps, const uint32_t *pm)
570 {
571     uint32_t s = *ps;
572
573     if (pm)
574     {
575         __m64 ms, mm;
576
577         mm = unpack_32_1x64 (*pm);
578         mm = expand_alpha_1x64 (mm);
579
580         ms = unpack_32_1x64 (s);
581         ms = pix_multiply_1x64 (ms, mm);
582
583         s = pack_1x64_32 (ms);
584     }
585
586     return s;
587 }
588
589 static force_inline __m128i
590 combine4 (const __m128i *ps, const __m128i *pm)
591 {
592     __m128i xmm_src_lo, xmm_src_hi;
593     __m128i xmm_msk_lo, xmm_msk_hi;
594     __m128i s;
595
596     if (pm)
597     {
598         xmm_msk_lo = load_128_unaligned (pm);
599
600         if (is_transparent (xmm_msk_lo))
601             return _mm_setzero_si128 ();
602     }
603
604     s = load_128_unaligned (ps);
605
606     if (pm)
607     {
608         unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
609         unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);
610
611         expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);
612
613         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
614                             &xmm_msk_lo, &xmm_msk_hi,
615                             &xmm_src_lo, &xmm_src_hi);
616
617         s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
618     }
619
620     return s;
621 }
622
623 static force_inline void
624 core_combine_over_u_sse2 (uint32_t*       pd,
625                           const uint32_t* ps,
626                           const uint32_t* pm,
627                           int             w)
628 {
629     uint32_t s, d;
630
631     __m128i xmm_dst_lo, xmm_dst_hi;
632     __m128i xmm_src_lo, xmm_src_hi;
633     __m128i xmm_alpha_lo, xmm_alpha_hi;
634
635     /* call prefetch hint to optimize cache load*/
636     cache_prefetch ((__m128i*)ps);
637     cache_prefetch ((__m128i*)pd);
638     cache_prefetch ((__m128i*)pm);
639
640     /* Align dst on a 16-byte boundary */
641     while (w && ((unsigned long)pd & 15))
642     {
643         d = *pd;
644         s = combine1 (ps, pm);
645
646         *pd++ = core_combine_over_u_pixel_sse2 (s, d);
647         ps++;
648         if (pm)
649             pm++;
650         w--;
651     }
652
653     /* call prefetch hint to optimize cache load*/
654     cache_prefetch ((__m128i*)ps);
655     cache_prefetch ((__m128i*)pd);
656     cache_prefetch ((__m128i*)pm);
657
658     while (w >= 4)
659     {
660         /* fill cache line with next memory */
661         cache_prefetch_next ((__m128i*)ps);
662         cache_prefetch_next ((__m128i*)pd);
663         cache_prefetch_next ((__m128i*)pm);
664
665         /* I'm loading unaligned because I'm not sure about
666          * the address alignment.
667          */
668         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
669
670         if (is_opaque (xmm_src_hi))
671         {
672             save_128_aligned ((__m128i*)pd, xmm_src_hi);
673         }
674         else if (!is_zero (xmm_src_hi))
675         {
676             xmm_dst_hi = load_128_aligned ((__m128i*) pd);
677
678             unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
679             unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
680
681             expand_alpha_2x128 (
682                 xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
683
684             over_2x128 (&xmm_src_lo, &xmm_src_hi,
685                         &xmm_alpha_lo, &xmm_alpha_hi,
686                         &xmm_dst_lo, &xmm_dst_hi);
687
688             /* rebuid the 4 pixel data and save*/
689             save_128_aligned ((__m128i*)pd,
690                               pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
691         }
692
693         w -= 4;
694         ps += 4;
695         pd += 4;
696         if (pm)
697             pm += 4;
698     }
699
700     while (w)
701     {
702         d = *pd;
703         s = combine1 (ps, pm);
704
705         *pd++ = core_combine_over_u_pixel_sse2 (s, d);
706         ps++;
707         if (pm)
708             pm++;
709
710         w--;
711     }
712 }
713
714 static force_inline void
715 core_combine_over_reverse_u_sse2 (uint32_t*       pd,
716                                   const uint32_t* ps,
717                                   const uint32_t* pm,
718                                   int             w)
719 {
720     uint32_t s, d;
721
722     __m128i xmm_dst_lo, xmm_dst_hi;
723     __m128i xmm_src_lo, xmm_src_hi;
724     __m128i xmm_alpha_lo, xmm_alpha_hi;
725
726     /* call prefetch hint to optimize cache load*/
727     cache_prefetch ((__m128i*)ps);
728     cache_prefetch ((__m128i*)pd);
729     cache_prefetch ((__m128i*)pm);
730
731     /* Align dst on a 16-byte boundary */
732     while (w &&
733            ((unsigned long)pd & 15))
734     {
735         d = *pd;
736         s = combine1 (ps, pm);
737
738         *pd++ = core_combine_over_u_pixel_sse2 (d, s);
739         w--;
740         ps++;
741         if (pm)
742             pm++;
743     }
744
745     /* call prefetch hint to optimize cache load*/
746     cache_prefetch ((__m128i*)ps);
747     cache_prefetch ((__m128i*)pd);
748     cache_prefetch ((__m128i*)pm);
749
750     while (w >= 4)
751     {
752         /* fill cache line with next memory */
753         cache_prefetch_next ((__m128i*)ps);
754         cache_prefetch_next ((__m128i*)pd);
755         cache_prefetch_next ((__m128i*)pm);
756
757         /* I'm loading unaligned because I'm not sure
758          * about the address alignment.
759          */
760         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
761         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
762
763         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
764         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
765
766         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
767                             &xmm_alpha_lo, &xmm_alpha_hi);
768
769         over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
770                     &xmm_alpha_lo, &xmm_alpha_hi,
771                     &xmm_src_lo, &xmm_src_hi);
772
773         /* rebuid the 4 pixel data and save*/
774         save_128_aligned ((__m128i*)pd,
775                           pack_2x128_128 (xmm_src_lo, xmm_src_hi));
776
777         w -= 4;
778         ps += 4;
779         pd += 4;
780
781         if (pm)
782             pm += 4;
783     }
784
785     while (w)
786     {
787         d = *pd;
788         s = combine1 (ps, pm);
789
790         *pd++ = core_combine_over_u_pixel_sse2 (d, s);
791         ps++;
792         w--;
793         if (pm)
794             pm++;
795     }
796 }
797
798 static force_inline uint32_t
799 core_combine_in_u_pixelsse2 (uint32_t src, uint32_t dst)
800 {
801     uint32_t maska = src >> 24;
802
803     if (maska == 0)
804     {
805         return 0;
806     }
807     else if (maska != 0xff)
808     {
809         return pack_1x64_32 (
810             pix_multiply_1x64 (unpack_32_1x64 (dst),
811                                expand_alpha_1x64 (unpack_32_1x64 (src))));
812     }
813
814     return dst;
815 }
816
817 static force_inline void
818 core_combine_in_u_sse2 (uint32_t*       pd,
819                         const uint32_t* ps,
820                         const uint32_t* pm,
821                         int             w)
822 {
823     uint32_t s, d;
824
825     __m128i xmm_src_lo, xmm_src_hi;
826     __m128i xmm_dst_lo, xmm_dst_hi;
827
828     /* call prefetch hint to optimize cache load*/
829     cache_prefetch ((__m128i*)ps);
830     cache_prefetch ((__m128i*)pd);
831     cache_prefetch ((__m128i*)pm);
832
833     while (w && ((unsigned long) pd & 15))
834     {
835         s = combine1 (ps, pm);
836         d = *pd;
837
838         *pd++ = core_combine_in_u_pixelsse2 (d, s);
839         w--;
840         ps++;
841         if (pm)
842             pm++;
843     }
844
845     /* call prefetch hint to optimize cache load*/
846     cache_prefetch ((__m128i*)ps);
847     cache_prefetch ((__m128i*)pd);
848     cache_prefetch ((__m128i*)pm);
849
850     while (w >= 4)
851     {
852         /* fill cache line with next memory */
853         cache_prefetch_next ((__m128i*)ps);
854         cache_prefetch_next ((__m128i*)pd);
855         cache_prefetch_next ((__m128i*)pm);
856
857         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
858         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
859
860         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
861         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
862
863         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
864         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
865                             &xmm_dst_lo, &xmm_dst_hi,
866                             &xmm_dst_lo, &xmm_dst_hi);
867
868         save_128_aligned ((__m128i*)pd,
869                           pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
870
871         ps += 4;
872         pd += 4;
873         w -= 4;
874         if (pm)
875             pm += 4;
876     }
877
878     while (w)
879     {
880         s = combine1 (ps, pm);
881         d = *pd;
882
883         *pd++ = core_combine_in_u_pixelsse2 (d, s);
884         w--;
885         ps++;
886         if (pm)
887             pm++;
888     }
889 }
890
891 static force_inline void
892 core_combine_reverse_in_u_sse2 (uint32_t*       pd,
893                                 const uint32_t* ps,
894                                 const uint32_t *pm,
895                                 int             w)
896 {
897     uint32_t s, d;
898
899     __m128i xmm_src_lo, xmm_src_hi;
900     __m128i xmm_dst_lo, xmm_dst_hi;
901
902     /* call prefetch hint to optimize cache load*/
903     cache_prefetch ((__m128i*)ps);
904     cache_prefetch ((__m128i*)pd);
905     cache_prefetch ((__m128i*)pm);
906
907     while (w && ((unsigned long) pd & 15))
908     {
909         s = combine1 (ps, pm);
910         d = *pd;
911
912         *pd++ = core_combine_in_u_pixelsse2 (s, d);
913         ps++;
914         w--;
915         if (pm)
916             pm++;
917     }
918
919     /* call prefetch hint to optimize cache load*/
920     cache_prefetch ((__m128i*)ps);
921     cache_prefetch ((__m128i*)pd);
922     cache_prefetch ((__m128i*)pm);
923
924     while (w >= 4)
925     {
926         /* fill cache line with next memory */
927         cache_prefetch_next ((__m128i*)ps);
928         cache_prefetch_next ((__m128i*)pd);
929         cache_prefetch_next ((__m128i*)pm);
930
931         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
932         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
933
934         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
935         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
936
937         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
938         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
939                             &xmm_src_lo, &xmm_src_hi,
940                             &xmm_dst_lo, &xmm_dst_hi);
941
942         save_128_aligned (
943             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
944
945         ps += 4;
946         pd += 4;
947         w -= 4;
948         if (pm)
949             pm += 4;
950     }
951
952     while (w)
953     {
954         s = combine1 (ps, pm);
955         d = *pd;
956
957         *pd++ = core_combine_in_u_pixelsse2 (s, d);
958         w--;
959         ps++;
960         if (pm)
961             pm++;
962     }
963 }
964
965 static force_inline void
966 core_combine_reverse_out_u_sse2 (uint32_t*       pd,
967                                  const uint32_t* ps,
968                                  const uint32_t* pm,
969                                  int             w)
970 {
971     /* call prefetch hint to optimize cache load*/
972     cache_prefetch ((__m128i*)ps);
973     cache_prefetch ((__m128i*)pd);
974     cache_prefetch ((__m128i*)pm);
975
976     while (w && ((unsigned long) pd & 15))
977     {
978         uint32_t s = combine1 (ps, pm);
979         uint32_t d = *pd;
980
981         *pd++ = pack_1x64_32 (
982             pix_multiply_1x64 (
983                 unpack_32_1x64 (d), negate_1x64 (
984                     expand_alpha_1x64 (unpack_32_1x64 (s)))));
985         
986         if (pm)
987             pm++;
988         ps++;
989         w--;
990     }
991
992     /* call prefetch hint to optimize cache load*/
993     cache_prefetch ((__m128i*)ps);
994     cache_prefetch ((__m128i*)pd);
995     cache_prefetch ((__m128i*)pm);
996
997     while (w >= 4)
998     {
999         __m128i xmm_src_lo, xmm_src_hi;
1000         __m128i xmm_dst_lo, xmm_dst_hi;
1001
1002         /* fill cache line with next memory */
1003         cache_prefetch_next ((__m128i*)ps);
1004         cache_prefetch_next ((__m128i*)pd);
1005         cache_prefetch_next ((__m128i*)pm);
1006
1007         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1008         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1009
1010         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1011         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1012
1013         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1014         negate_2x128       (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1015
1016         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1017                             &xmm_src_lo, &xmm_src_hi,
1018                             &xmm_dst_lo, &xmm_dst_hi);
1019
1020         save_128_aligned (
1021             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1022
1023         ps += 4;
1024         pd += 4;
1025         if (pm)
1026             pm += 4;
1027
1028         w -= 4;
1029     }
1030
1031     while (w)
1032     {
1033         uint32_t s = combine1 (ps, pm);
1034         uint32_t d = *pd;
1035
1036         *pd++ = pack_1x64_32 (
1037             pix_multiply_1x64 (
1038                 unpack_32_1x64 (d), negate_1x64 (
1039                     expand_alpha_1x64 (unpack_32_1x64 (s)))));
1040         ps++;
1041         if (pm)
1042             pm++;
1043         w--;
1044     }
1045 }
1046
1047 static force_inline void
1048 core_combine_out_u_sse2 (uint32_t*       pd,
1049                          const uint32_t* ps,
1050                          const uint32_t* pm,
1051                          int             w)
1052 {
1053     /* call prefetch hint to optimize cache load*/
1054     cache_prefetch ((__m128i*)ps);
1055     cache_prefetch ((__m128i*)pd);
1056     cache_prefetch ((__m128i*)pm);
1057
1058     while (w && ((unsigned long) pd & 15))
1059     {
1060         uint32_t s = combine1 (ps, pm);
1061         uint32_t d = *pd;
1062
1063         *pd++ = pack_1x64_32 (
1064             pix_multiply_1x64 (
1065                 unpack_32_1x64 (s), negate_1x64 (
1066                     expand_alpha_1x64 (unpack_32_1x64 (d)))));
1067         w--;
1068         ps++;
1069         if (pm)
1070             pm++;
1071     }
1072
1073     /* call prefetch hint to optimize cache load*/
1074     cache_prefetch ((__m128i*)ps);
1075     cache_prefetch ((__m128i*)pd);
1076     cache_prefetch ((__m128i*)pm);
1077
1078     while (w >= 4)
1079     {
1080         __m128i xmm_src_lo, xmm_src_hi;
1081         __m128i xmm_dst_lo, xmm_dst_hi;
1082
1083         /* fill cache line with next memory */
1084         cache_prefetch_next ((__m128i*)ps);
1085         cache_prefetch_next ((__m128i*)pd);
1086         cache_prefetch_next ((__m128i*)pm);
1087
1088         xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
1089         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1090
1091         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1092         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1093
1094         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1095         negate_2x128       (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1096
1097         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1098                             &xmm_dst_lo, &xmm_dst_hi,
1099                             &xmm_dst_lo, &xmm_dst_hi);
1100
1101         save_128_aligned (
1102             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1103
1104         ps += 4;
1105         pd += 4;
1106         w -= 4;
1107         if (pm)
1108             pm += 4;
1109     }
1110
1111     while (w)
1112     {
1113         uint32_t s = combine1 (ps, pm);
1114         uint32_t d = *pd;
1115
1116         *pd++ = pack_1x64_32 (
1117             pix_multiply_1x64 (
1118                 unpack_32_1x64 (s), negate_1x64 (
1119                     expand_alpha_1x64 (unpack_32_1x64 (d)))));
1120         w--;
1121         ps++;
1122         if (pm)
1123             pm++;
1124     }
1125 }
1126
1127 static force_inline uint32_t
1128 core_combine_atop_u_pixel_sse2 (uint32_t src,
1129                                 uint32_t dst)
1130 {
1131     __m64 s = unpack_32_1x64 (src);
1132     __m64 d = unpack_32_1x64 (dst);
1133
1134     __m64 sa = negate_1x64 (expand_alpha_1x64 (s));
1135     __m64 da = expand_alpha_1x64 (d);
1136
1137     return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
1138 }
1139
1140 static force_inline void
1141 core_combine_atop_u_sse2 (uint32_t*       pd,
1142                           const uint32_t* ps,
1143                           const uint32_t* pm,
1144                           int             w)
1145 {
1146     uint32_t s, d;
1147
1148     __m128i xmm_src_lo, xmm_src_hi;
1149     __m128i xmm_dst_lo, xmm_dst_hi;
1150     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1151     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1152
1153     /* call prefetch hint to optimize cache load*/
1154     cache_prefetch ((__m128i*)ps);
1155     cache_prefetch ((__m128i*)pd);
1156     cache_prefetch ((__m128i*)pm);
1157
1158     while (w && ((unsigned long) pd & 15))
1159     {
1160         s = combine1 (ps, pm);
1161         d = *pd;
1162
1163         *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1164         w--;
1165         ps++;
1166         if (pm)
1167             pm++;
1168     }
1169
1170     /* call prefetch hint to optimize cache load*/
1171     cache_prefetch ((__m128i*)ps);
1172     cache_prefetch ((__m128i*)pd);
1173     cache_prefetch ((__m128i*)pm);
1174
1175     while (w >= 4)
1176     {
1177         /* fill cache line with next memory */
1178         cache_prefetch_next ((__m128i*)ps);
1179         cache_prefetch_next ((__m128i*)pd);
1180         cache_prefetch_next ((__m128i*)pm);
1181
1182         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1183         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1184
1185         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1186         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1187
1188         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1189                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1190         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1191                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1192
1193         negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1194                       &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1195
1196         pix_add_multiply_2x128 (
1197             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1198             &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1199             &xmm_dst_lo, &xmm_dst_hi);
1200
1201         save_128_aligned (
1202             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1203
1204         ps += 4;
1205         pd += 4;
1206         w -= 4;
1207         if (pm)
1208             pm += 4;
1209     }
1210
1211     while (w)
1212     {
1213         s = combine1 (ps, pm);
1214         d = *pd;
1215
1216         *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1217         w--;
1218         ps++;
1219         if (pm)
1220             pm++;
1221     }
1222 }
1223
1224 static force_inline uint32_t
1225 core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
1226                                         uint32_t dst)
1227 {
1228     __m64 s = unpack_32_1x64 (src);
1229     __m64 d = unpack_32_1x64 (dst);
1230
1231     __m64 sa = expand_alpha_1x64 (s);
1232     __m64 da = negate_1x64 (expand_alpha_1x64 (d));
1233
1234     return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
1235 }
1236
1237 static force_inline void
1238 core_combine_reverse_atop_u_sse2 (uint32_t*       pd,
1239                                   const uint32_t* ps,
1240                                   const uint32_t* pm,
1241                                   int             w)
1242 {
1243     uint32_t s, d;
1244
1245     __m128i xmm_src_lo, xmm_src_hi;
1246     __m128i xmm_dst_lo, xmm_dst_hi;
1247     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1248     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1249
1250     /* call prefetch hint to optimize cache load*/
1251     cache_prefetch ((__m128i*)ps);
1252     cache_prefetch ((__m128i*)pd);
1253     cache_prefetch ((__m128i*)pm);
1254
1255     while (w && ((unsigned long) pd & 15))
1256     {
1257         s = combine1 (ps, pm);
1258         d = *pd;
1259
1260         *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1261         ps++;
1262         w--;
1263         if (pm)
1264             pm++;
1265     }
1266
1267     /* call prefetch hint to optimize cache load*/
1268     cache_prefetch ((__m128i*)ps);
1269     cache_prefetch ((__m128i*)pd);
1270     cache_prefetch ((__m128i*)pm);
1271
1272     while (w >= 4)
1273     {
1274         /* fill cache line with next memory */
1275         cache_prefetch_next ((__m128i*)ps);
1276         cache_prefetch_next ((__m128i*)pd);
1277         cache_prefetch_next ((__m128i*)pm);
1278
1279         xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1280         xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1281
1282         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1283         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1284
1285         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1286                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1287         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1288                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1289
1290         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1291                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1292
1293         pix_add_multiply_2x128 (
1294             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1295             &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1296             &xmm_dst_lo, &xmm_dst_hi);
1297
1298         save_128_aligned (
1299             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1300
1301         ps += 4;
1302         pd += 4;
1303         w -= 4;
1304         if (pm)
1305             pm += 4;
1306     }
1307
1308     while (w)
1309     {
1310         s = combine1 (ps, pm);
1311         d = *pd;
1312
1313         *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1314         ps++;
1315         w--;
1316         if (pm)
1317             pm++;
1318     }
1319 }
1320
1321 static force_inline uint32_t
1322 core_combine_xor_u_pixel_sse2 (uint32_t src,
1323                                uint32_t dst)
1324 {
1325     __m64 s = unpack_32_1x64 (src);
1326     __m64 d = unpack_32_1x64 (dst);
1327
1328     __m64 neg_d = negate_1x64 (expand_alpha_1x64 (d));
1329     __m64 neg_s = negate_1x64 (expand_alpha_1x64 (s));
1330
1331     return pack_1x64_32 (pix_add_multiply_1x64 (&s, &neg_d, &d, &neg_s));
1332 }
1333
1334 static force_inline void
1335 core_combine_xor_u_sse2 (uint32_t*       dst,
1336                          const uint32_t* src,
1337                          const uint32_t *mask,
1338                          int             width)
1339 {
1340     int w = width;
1341     uint32_t s, d;
1342     uint32_t* pd = dst;
1343     const uint32_t* ps = src;
1344     const uint32_t* pm = mask;
1345
1346     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
1347     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
1348     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1349     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1350
1351     /* call prefetch hint to optimize cache load*/
1352     cache_prefetch ((__m128i*)ps);
1353     cache_prefetch ((__m128i*)pd);
1354     cache_prefetch ((__m128i*)pm);
1355
1356     while (w && ((unsigned long) pd & 15))
1357     {
1358         s = combine1 (ps, pm);
1359         d = *pd;
1360
1361         *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1362         w--;
1363         ps++;
1364         if (pm)
1365             pm++;
1366     }
1367
1368     /* call prefetch hint to optimize cache load*/
1369     cache_prefetch ((__m128i*)ps);
1370     cache_prefetch ((__m128i*)pd);
1371     cache_prefetch ((__m128i*)pm);
1372
1373     while (w >= 4)
1374     {
1375         /* fill cache line with next memory */
1376         cache_prefetch_next ((__m128i*)ps);
1377         cache_prefetch_next ((__m128i*)pd);
1378         cache_prefetch_next ((__m128i*)pm);
1379
1380         xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
1381         xmm_dst = load_128_aligned ((__m128i*) pd);
1382
1383         unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
1384         unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
1385
1386         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1387                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1388         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1389                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1390
1391         negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1392                       &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1393         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1394                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1395
1396         pix_add_multiply_2x128 (
1397             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1398             &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1399             &xmm_dst_lo, &xmm_dst_hi);
1400
1401         save_128_aligned (
1402             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1403
1404         ps += 4;
1405         pd += 4;
1406         w -= 4;
1407         if (pm)
1408             pm += 4;
1409     }
1410
1411     while (w)
1412     {
1413         s = combine1 (ps, pm);
1414         d = *pd;
1415
1416         *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1417         w--;
1418         ps++;
1419         if (pm)
1420             pm++;
1421     }
1422 }
1423
1424 static force_inline void
1425 core_combine_add_u_sse2 (uint32_t*       dst,
1426                          const uint32_t* src,
1427                          const uint32_t* mask,
1428                          int             width)
1429 {
1430     int w = width;
1431     uint32_t s, d;
1432     uint32_t* pd = dst;
1433     const uint32_t* ps = src;
1434     const uint32_t* pm = mask;
1435
1436     /* call prefetch hint to optimize cache load*/
1437     cache_prefetch ((__m128i*)ps);
1438     cache_prefetch ((__m128i*)pd);
1439     cache_prefetch ((__m128i*)pm);
1440
1441     while (w && (unsigned long)pd & 15)
1442     {
1443         s = combine1 (ps, pm);
1444         d = *pd;
1445
1446         ps++;
1447         if (pm)
1448             pm++;
1449         *pd++ = _mm_cvtsi64_si32 (
1450             _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
1451         w--;
1452     }
1453
1454     /* call prefetch hint to optimize cache load*/
1455     cache_prefetch ((__m128i*)ps);
1456     cache_prefetch ((__m128i*)pd);
1457     cache_prefetch ((__m128i*)pm);
1458
1459     while (w >= 4)
1460     {
1461         __m128i s;
1462
1463         /* fill cache line with next memory */
1464         cache_prefetch_next ((__m128i*)ps);
1465         cache_prefetch_next ((__m128i*)pd);
1466         cache_prefetch_next ((__m128i*)pm);
1467
1468         s = combine4 ((__m128i*)ps, (__m128i*)pm);
1469
1470         save_128_aligned (
1471             (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned  ((__m128i*)pd)));
1472
1473         pd += 4;
1474         ps += 4;
1475         if (pm)
1476             pm += 4;
1477         w -= 4;
1478     }
1479
1480     while (w--)
1481     {
1482         s = combine1 (ps, pm);
1483         d = *pd;
1484
1485         ps++;
1486         *pd++ = _mm_cvtsi64_si32 (
1487             _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
1488         if (pm)
1489             pm++;
1490     }
1491 }
1492
1493 static force_inline uint32_t
1494 core_combine_saturate_u_pixel_sse2 (uint32_t src,
1495                                     uint32_t dst)
1496 {
1497     __m64 ms = unpack_32_1x64 (src);
1498     __m64 md = unpack_32_1x64 (dst);
1499     uint32_t sa = src >> 24;
1500     uint32_t da = ~dst >> 24;
1501
1502     if (sa > da)
1503     {
1504         ms = pix_multiply_1x64 (
1505             ms, expand_alpha_1x64 (unpack_32_1x64 (DIV_UN8 (da, sa) << 24)));
1506     }
1507
1508     return pack_1x64_32 (_mm_adds_pu16 (md, ms));
1509 }
1510
1511 static force_inline void
1512 core_combine_saturate_u_sse2 (uint32_t *      pd,
1513                               const uint32_t *ps,
1514                               const uint32_t *pm,
1515                               int             w)
1516 {
1517     uint32_t s, d;
1518
1519     uint32_t pack_cmp;
1520     __m128i xmm_src, xmm_dst;
1521
1522     /* call prefetch hint to optimize cache load*/
1523     cache_prefetch ((__m128i*)ps);
1524     cache_prefetch ((__m128i*)pd);
1525     cache_prefetch ((__m128i*)pm);
1526
1527     while (w && (unsigned long)pd & 15)
1528     {
1529         s = combine1 (ps, pm);
1530         d = *pd;
1531
1532         *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1533         w--;
1534         ps++;
1535         if (pm)
1536             pm++;
1537     }
1538
1539     /* call prefetch hint to optimize cache load*/
1540     cache_prefetch ((__m128i*)ps);
1541     cache_prefetch ((__m128i*)pd);
1542     cache_prefetch ((__m128i*)pm);
1543
1544     while (w >= 4)
1545     {
1546         /* fill cache line with next memory */
1547         cache_prefetch_next ((__m128i*)ps);
1548         cache_prefetch_next ((__m128i*)pd);
1549         cache_prefetch_next ((__m128i*)pm);
1550
1551         xmm_dst = load_128_aligned  ((__m128i*)pd);
1552         xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
1553
1554         pack_cmp = _mm_movemask_epi8 (
1555             _mm_cmpgt_epi32 (
1556                 _mm_srli_epi32 (xmm_src, 24),
1557                 _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));
1558
1559         /* if some alpha src is grater than respective ~alpha dst */
1560         if (pack_cmp)
1561         {
1562             s = combine1 (ps++, pm);
1563             d = *pd;
1564             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1565             if (pm)
1566                 pm++;
1567
1568             s = combine1 (ps++, pm);
1569             d = *pd;
1570             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1571             if (pm)
1572                 pm++;
1573
1574             s = combine1 (ps++, pm);
1575             d = *pd;
1576             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1577             if (pm)
1578                 pm++;
1579
1580             s = combine1 (ps++, pm);
1581             d = *pd;
1582             *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1583             if (pm)
1584                 pm++;
1585         }
1586         else
1587         {
1588             save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));
1589
1590             pd += 4;
1591             ps += 4;
1592             if (pm)
1593                 pm += 4;
1594         }
1595
1596         w -= 4;
1597     }
1598
1599     while (w--)
1600     {
1601         s = combine1 (ps, pm);
1602         d = *pd;
1603
1604         *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1605         ps++;
1606         if (pm)
1607             pm++;
1608     }
1609 }
1610
1611 static force_inline void
1612 core_combine_src_ca_sse2 (uint32_t*       pd,
1613                           const uint32_t* ps,
1614                           const uint32_t *pm,
1615                           int             w)
1616 {
1617     uint32_t s, m;
1618
1619     __m128i xmm_src_lo, xmm_src_hi;
1620     __m128i xmm_mask_lo, xmm_mask_hi;
1621     __m128i xmm_dst_lo, xmm_dst_hi;
1622
1623     /* call prefetch hint to optimize cache load*/
1624     cache_prefetch ((__m128i*)ps);
1625     cache_prefetch ((__m128i*)pd);
1626     cache_prefetch ((__m128i*)pm);
1627
1628     while (w && (unsigned long)pd & 15)
1629     {
1630         s = *ps++;
1631         m = *pm++;
1632         *pd++ = pack_1x64_32 (
1633             pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
1634         w--;
1635     }
1636
1637     /* call prefetch hint to optimize cache load*/
1638     cache_prefetch ((__m128i*)ps);
1639     cache_prefetch ((__m128i*)pd);
1640     cache_prefetch ((__m128i*)pm);
1641
1642     while (w >= 4)
1643     {
1644         /* fill cache line with next memory */
1645         cache_prefetch_next ((__m128i*)ps);
1646         cache_prefetch_next ((__m128i*)pd);
1647         cache_prefetch_next ((__m128i*)pm);
1648
1649         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1650         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1651
1652         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1653         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1654
1655         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1656                             &xmm_mask_lo, &xmm_mask_hi,
1657                             &xmm_dst_lo, &xmm_dst_hi);
1658
1659         save_128_aligned (
1660             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1661
1662         ps += 4;
1663         pd += 4;
1664         pm += 4;
1665         w -= 4;
1666     }
1667
1668     while (w)
1669     {
1670         s = *ps++;
1671         m = *pm++;
1672         *pd++ = pack_1x64_32 (
1673             pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
1674         w--;
1675     }
1676 }
1677
1678 static force_inline uint32_t
1679 core_combine_over_ca_pixel_sse2 (uint32_t src,
1680                                  uint32_t mask,
1681                                  uint32_t dst)
1682 {
1683     __m64 s = unpack_32_1x64 (src);
1684     __m64 expAlpha = expand_alpha_1x64 (s);
1685     __m64 unpk_mask = unpack_32_1x64 (mask);
1686     __m64 unpk_dst  = unpack_32_1x64 (dst);
1687
1688     return pack_1x64_32 (in_over_1x64 (&s, &expAlpha, &unpk_mask, &unpk_dst));
1689 }
1690
1691 static force_inline void
1692 core_combine_over_ca_sse2 (uint32_t*       pd,
1693                            const uint32_t* ps,
1694                            const uint32_t *pm,
1695                            int             w)
1696 {
1697     uint32_t s, m, d;
1698
1699     __m128i xmm_alpha_lo, xmm_alpha_hi;
1700     __m128i xmm_src_lo, xmm_src_hi;
1701     __m128i xmm_dst_lo, xmm_dst_hi;
1702     __m128i xmm_mask_lo, xmm_mask_hi;
1703
1704     /* call prefetch hint to optimize cache load*/
1705     cache_prefetch ((__m128i*)ps);
1706     cache_prefetch ((__m128i*)pd);
1707     cache_prefetch ((__m128i*)pm);
1708
1709     while (w && (unsigned long)pd & 15)
1710     {
1711         s = *ps++;
1712         m = *pm++;
1713         d = *pd;
1714
1715         *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1716         w--;
1717     }
1718
1719     /* call prefetch hint to optimize cache load*/
1720     cache_prefetch ((__m128i*)ps);
1721     cache_prefetch ((__m128i*)pd);
1722     cache_prefetch ((__m128i*)pm);
1723
1724     while (w >= 4)
1725     {
1726         /* fill cache line with next memory */
1727         cache_prefetch_next ((__m128i*)ps);
1728         cache_prefetch_next ((__m128i*)pd);
1729         cache_prefetch_next ((__m128i*)pm);
1730
1731         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1732         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1733         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1734
1735         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1736         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1737         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1738
1739         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1740                             &xmm_alpha_lo, &xmm_alpha_hi);
1741
1742         in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
1743                        &xmm_alpha_lo, &xmm_alpha_hi,
1744                        &xmm_mask_lo, &xmm_mask_hi,
1745                        &xmm_dst_lo, &xmm_dst_hi);
1746
1747         save_128_aligned (
1748             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1749
1750         ps += 4;
1751         pd += 4;
1752         pm += 4;
1753         w -= 4;
1754     }
1755
1756     while (w)
1757     {
1758         s = *ps++;
1759         m = *pm++;
1760         d = *pd;
1761
1762         *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1763         w--;
1764     }
1765 }
1766
1767 static force_inline uint32_t
1768 core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
1769                                          uint32_t mask,
1770                                          uint32_t dst)
1771 {
1772     __m64 d = unpack_32_1x64 (dst);
1773
1774     return pack_1x64_32 (
1775         over_1x64 (d, expand_alpha_1x64 (d),
1776                    pix_multiply_1x64 (unpack_32_1x64 (src),
1777                                       unpack_32_1x64 (mask))));
1778 }
1779
1780 static force_inline void
1781 core_combine_over_reverse_ca_sse2 (uint32_t*       pd,
1782                                    const uint32_t* ps,
1783                                    const uint32_t *pm,
1784                                    int             w)
1785 {
1786     uint32_t s, m, d;
1787
1788     __m128i xmm_alpha_lo, xmm_alpha_hi;
1789     __m128i xmm_src_lo, xmm_src_hi;
1790     __m128i xmm_dst_lo, xmm_dst_hi;
1791     __m128i xmm_mask_lo, xmm_mask_hi;
1792
1793     /* call prefetch hint to optimize cache load*/
1794     cache_prefetch ((__m128i*)ps);
1795     cache_prefetch ((__m128i*)pd);
1796     cache_prefetch ((__m128i*)pm);
1797
1798     while (w && (unsigned long)pd & 15)
1799     {
1800         s = *ps++;
1801         m = *pm++;
1802         d = *pd;
1803
1804         *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1805         w--;
1806     }
1807
1808     /* call prefetch hint to optimize cache load*/
1809     cache_prefetch ((__m128i*)ps);
1810     cache_prefetch ((__m128i*)pd);
1811     cache_prefetch ((__m128i*)pm);
1812
1813     while (w >= 4)
1814     {
1815         /* fill cache line with next memory */
1816         cache_prefetch_next ((__m128i*)ps);
1817         cache_prefetch_next ((__m128i*)pd);
1818         cache_prefetch_next ((__m128i*)pm);
1819
1820         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1821         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1822         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1823
1824         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1825         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1826         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1827
1828         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1829                             &xmm_alpha_lo, &xmm_alpha_hi);
1830         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1831                             &xmm_mask_lo, &xmm_mask_hi,
1832                             &xmm_mask_lo, &xmm_mask_hi);
1833
1834         over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1835                     &xmm_alpha_lo, &xmm_alpha_hi,
1836                     &xmm_mask_lo, &xmm_mask_hi);
1837
1838         save_128_aligned (
1839             (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
1840
1841         ps += 4;
1842         pd += 4;
1843         pm += 4;
1844         w -= 4;
1845     }
1846
1847     while (w)
1848     {
1849         s = *ps++;
1850         m = *pm++;
1851         d = *pd;
1852
1853         *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1854         w--;
1855     }
1856 }
1857
1858 static force_inline void
1859 core_combine_in_ca_sse2 (uint32_t *      pd,
1860                          const uint32_t *ps,
1861                          const uint32_t *pm,
1862                          int             w)
1863 {
1864     uint32_t s, m, d;
1865
1866     __m128i xmm_alpha_lo, xmm_alpha_hi;
1867     __m128i xmm_src_lo, xmm_src_hi;
1868     __m128i xmm_dst_lo, xmm_dst_hi;
1869     __m128i xmm_mask_lo, xmm_mask_hi;
1870
1871     /* call prefetch hint to optimize cache load*/
1872     cache_prefetch ((__m128i*)ps);
1873     cache_prefetch ((__m128i*)pd);
1874     cache_prefetch ((__m128i*)pm);
1875
1876     while (w && (unsigned long)pd & 15)
1877     {
1878         s = *ps++;
1879         m = *pm++;
1880         d = *pd;
1881
1882         *pd++ = pack_1x64_32 (
1883             pix_multiply_1x64 (
1884                 pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
1885                 expand_alpha_1x64 (unpack_32_1x64 (d))));
1886
1887         w--;
1888     }
1889
1890     /* call prefetch hint to optimize cache load*/
1891     cache_prefetch ((__m128i*)ps);
1892     cache_prefetch ((__m128i*)pd);
1893     cache_prefetch ((__m128i*)pm);
1894
1895     while (w >= 4)
1896     {
1897         /* fill cache line with next memory */
1898         cache_prefetch_next ((__m128i*)ps);
1899         cache_prefetch_next ((__m128i*)pd);
1900         cache_prefetch_next ((__m128i*)pm);
1901
1902         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1903         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1904         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1905
1906         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1907         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1908         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1909
1910         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1911                             &xmm_alpha_lo, &xmm_alpha_hi);
1912
1913         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1914                             &xmm_mask_lo, &xmm_mask_hi,
1915                             &xmm_dst_lo, &xmm_dst_hi);
1916
1917         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1918                             &xmm_alpha_lo, &xmm_alpha_hi,
1919                             &xmm_dst_lo, &xmm_dst_hi);
1920
1921         save_128_aligned (
1922             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1923
1924         ps += 4;
1925         pd += 4;
1926         pm += 4;
1927         w -= 4;
1928     }
1929
1930     while (w)
1931     {
1932         s = *ps++;
1933         m = *pm++;
1934         d = *pd;
1935
1936         *pd++ = pack_1x64_32 (
1937             pix_multiply_1x64 (
1938                 pix_multiply_1x64 (
1939                     unpack_32_1x64 (s), unpack_32_1x64 (m)),
1940                 expand_alpha_1x64 (unpack_32_1x64 (d))));
1941
1942         w--;
1943     }
1944 }
1945
1946 static force_inline void
1947 core_combine_in_reverse_ca_sse2 (uint32_t *      pd,
1948                                  const uint32_t *ps,
1949                                  const uint32_t *pm,
1950                                  int             w)
1951 {
1952     uint32_t s, m, d;
1953
1954     __m128i xmm_alpha_lo, xmm_alpha_hi;
1955     __m128i xmm_src_lo, xmm_src_hi;
1956     __m128i xmm_dst_lo, xmm_dst_hi;
1957     __m128i xmm_mask_lo, xmm_mask_hi;
1958
1959     /* call prefetch hint to optimize cache load*/
1960     cache_prefetch ((__m128i*)ps);
1961     cache_prefetch ((__m128i*)pd);
1962     cache_prefetch ((__m128i*)pm);
1963
1964     while (w && (unsigned long)pd & 15)
1965     {
1966         s = *ps++;
1967         m = *pm++;
1968         d = *pd;
1969
1970         *pd++ = pack_1x64_32 (
1971             pix_multiply_1x64 (
1972                 unpack_32_1x64 (d),
1973                 pix_multiply_1x64 (unpack_32_1x64 (m),
1974                                    expand_alpha_1x64 (unpack_32_1x64 (s)))));
1975         w--;
1976     }
1977
1978     /* call prefetch hint to optimize cache load*/
1979     cache_prefetch ((__m128i*)ps);
1980     cache_prefetch ((__m128i*)pd);
1981     cache_prefetch ((__m128i*)pm);
1982
1983     while (w >= 4)
1984     {
1985         /* fill cache line with next memory */
1986         cache_prefetch_next ((__m128i*)ps);
1987         cache_prefetch_next ((__m128i*)pd);
1988         cache_prefetch_next ((__m128i*)pm);
1989
1990         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1991         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1992         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1993
1994         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1995         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1996         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1997
1998         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1999                             &xmm_alpha_lo, &xmm_alpha_hi);
2000         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2001                             &xmm_alpha_lo, &xmm_alpha_hi,
2002                             &xmm_alpha_lo, &xmm_alpha_hi);
2003
2004         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
2005                             &xmm_alpha_lo, &xmm_alpha_hi,
2006                             &xmm_dst_lo, &xmm_dst_hi);
2007
2008         save_128_aligned (
2009             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2010
2011         ps += 4;
2012         pd += 4;
2013         pm += 4;
2014         w -= 4;
2015     }
2016
2017     while (w)
2018     {
2019         s = *ps++;
2020         m = *pm++;
2021         d = *pd;
2022
2023         *pd++ = pack_1x64_32 (
2024             pix_multiply_1x64 (
2025                 unpack_32_1x64 (d),
2026                 pix_multiply_1x64 (unpack_32_1x64 (m),
2027                                    expand_alpha_1x64 (unpack_32_1x64 (s)))));
2028         w--;
2029     }
2030 }
2031
2032 static force_inline void
2033 core_combine_out_ca_sse2 (uint32_t *      pd,
2034                           const uint32_t *ps,
2035                           const uint32_t *pm,
2036                           int             w)
2037 {
2038     uint32_t s, m, d;
2039
2040     __m128i xmm_alpha_lo, xmm_alpha_hi;
2041     __m128i xmm_src_lo, xmm_src_hi;
2042     __m128i xmm_dst_lo, xmm_dst_hi;
2043     __m128i xmm_mask_lo, xmm_mask_hi;
2044
2045     /* call prefetch hint to optimize cache load*/
2046     cache_prefetch ((__m128i*)ps);
2047     cache_prefetch ((__m128i*)pd);
2048     cache_prefetch ((__m128i*)pm);
2049
2050     while (w && (unsigned long)pd & 15)
2051     {
2052         s = *ps++;
2053         m = *pm++;
2054         d = *pd;
2055
2056         *pd++ = pack_1x64_32 (
2057             pix_multiply_1x64 (
2058                 pix_multiply_1x64 (
2059                     unpack_32_1x64 (s), unpack_32_1x64 (m)),
2060                 negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
2061         w--;
2062     }
2063
2064     /* call prefetch hint to optimize cache load*/
2065     cache_prefetch ((__m128i*)ps);
2066     cache_prefetch ((__m128i*)pd);
2067     cache_prefetch ((__m128i*)pm);
2068
2069     while (w >= 4)
2070     {
2071         /* fill cache line with next memory */
2072         cache_prefetch_next ((__m128i*)ps);
2073         cache_prefetch_next ((__m128i*)pd);
2074         cache_prefetch_next ((__m128i*)pm);
2075
2076         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2077         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2078         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2079
2080         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2081         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2082         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2083
2084         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2085                             &xmm_alpha_lo, &xmm_alpha_hi);
2086         negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
2087                       &xmm_alpha_lo, &xmm_alpha_hi);
2088
2089         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2090                             &xmm_mask_lo, &xmm_mask_hi,
2091                             &xmm_dst_lo, &xmm_dst_hi);
2092         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
2093                             &xmm_alpha_lo, &xmm_alpha_hi,
2094                             &xmm_dst_lo, &xmm_dst_hi);
2095
2096         save_128_aligned (
2097             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2098
2099         ps += 4;
2100         pd += 4;
2101         pm += 4;
2102         w -= 4;
2103     }
2104
2105     while (w)
2106     {
2107         s = *ps++;
2108         m = *pm++;
2109         d = *pd;
2110
2111         *pd++ = pack_1x64_32 (
2112             pix_multiply_1x64 (
2113                 pix_multiply_1x64 (
2114                     unpack_32_1x64 (s), unpack_32_1x64 (m)),
2115                 negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
2116
2117         w--;
2118     }
2119 }
2120
2121 static force_inline void
2122 core_combine_out_reverse_ca_sse2 (uint32_t *      pd,
2123                                   const uint32_t *ps,
2124                                   const uint32_t *pm,
2125                                   int             w)
2126 {
2127     uint32_t s, m, d;
2128
2129     __m128i xmm_alpha_lo, xmm_alpha_hi;
2130     __m128i xmm_src_lo, xmm_src_hi;
2131     __m128i xmm_dst_lo, xmm_dst_hi;
2132     __m128i xmm_mask_lo, xmm_mask_hi;
2133
2134     /* call prefetch hint to optimize cache load*/
2135     cache_prefetch ((__m128i*)ps);
2136     cache_prefetch ((__m128i*)pd);
2137     cache_prefetch ((__m128i*)pm);
2138
2139     while (w && (unsigned long)pd & 15)
2140     {
2141         s = *ps++;
2142         m = *pm++;
2143         d = *pd;
2144
2145         *pd++ = pack_1x64_32 (
2146             pix_multiply_1x64 (
2147                 unpack_32_1x64 (d),
2148                 negate_1x64 (pix_multiply_1x64 (
2149                                  unpack_32_1x64 (m),
2150                                  expand_alpha_1x64 (unpack_32_1x64 (s))))));
2151         w--;
2152     }
2153
2154     /* call prefetch hint to optimize cache load*/
2155     cache_prefetch ((__m128i*)ps);
2156     cache_prefetch ((__m128i*)pd);
2157     cache_prefetch ((__m128i*)pm);
2158
2159     while (w >= 4)
2160     {
2161         /* fill cache line with next memory */
2162         cache_prefetch_next ((__m128i*)ps);
2163         cache_prefetch_next ((__m128i*)pd);
2164         cache_prefetch_next ((__m128i*)pm);
2165
2166         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2167         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2168         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2169
2170         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2171         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2172         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2173
2174         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2175                             &xmm_alpha_lo, &xmm_alpha_hi);
2176
2177         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2178                             &xmm_alpha_lo, &xmm_alpha_hi,
2179                             &xmm_mask_lo, &xmm_mask_hi);
2180
2181         negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2182                       &xmm_mask_lo, &xmm_mask_hi);
2183
2184         pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
2185                             &xmm_mask_lo, &xmm_mask_hi,
2186                             &xmm_dst_lo, &xmm_dst_hi);
2187
2188         save_128_aligned (
2189             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2190
2191         ps += 4;
2192         pd += 4;
2193         pm += 4;
2194         w -= 4;
2195     }
2196
2197     while (w)
2198     {
2199         s = *ps++;
2200         m = *pm++;
2201         d = *pd;
2202
2203         *pd++ = pack_1x64_32 (
2204             pix_multiply_1x64 (
2205                 unpack_32_1x64 (d),
2206                 negate_1x64 (pix_multiply_1x64 (
2207                                  unpack_32_1x64 (m),
2208                                  expand_alpha_1x64 (unpack_32_1x64 (s))))));
2209         w--;
2210     }
2211 }
2212
2213 static force_inline uint32_t
2214 core_combine_atop_ca_pixel_sse2 (uint32_t src,
2215                                  uint32_t mask,
2216                                  uint32_t dst)
2217 {
2218     __m64 m = unpack_32_1x64 (mask);
2219     __m64 s = unpack_32_1x64 (src);
2220     __m64 d = unpack_32_1x64 (dst);
2221     __m64 sa = expand_alpha_1x64 (s);
2222     __m64 da = expand_alpha_1x64 (d);
2223
2224     s = pix_multiply_1x64 (s, m);
2225     m = negate_1x64 (pix_multiply_1x64 (m, sa));
2226
2227     return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
2228 }
2229
2230 static force_inline void
2231 core_combine_atop_ca_sse2 (uint32_t *      pd,
2232                            const uint32_t *ps,
2233                            const uint32_t *pm,
2234                            int             w)
2235 {
2236     uint32_t s, m, d;
2237
2238     __m128i xmm_src_lo, xmm_src_hi;
2239     __m128i xmm_dst_lo, xmm_dst_hi;
2240     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2241     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2242     __m128i xmm_mask_lo, xmm_mask_hi;
2243
2244     /* call prefetch hint to optimize cache load*/
2245     cache_prefetch ((__m128i*)ps);
2246     cache_prefetch ((__m128i*)pd);
2247     cache_prefetch ((__m128i*)pm);
2248
2249     while (w && (unsigned long)pd & 15)
2250     {
2251         s = *ps++;
2252         m = *pm++;
2253         d = *pd;
2254
2255         *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2256         w--;
2257     }
2258
2259     /* call prefetch hint to optimize cache load*/
2260     cache_prefetch ((__m128i*)ps);
2261     cache_prefetch ((__m128i*)pd);
2262     cache_prefetch ((__m128i*)pm);
2263
2264     while (w >= 4)
2265     {
2266         /* fill cache line with next memory */
2267         cache_prefetch_next ((__m128i*)ps);
2268         cache_prefetch_next ((__m128i*)pd);
2269         cache_prefetch_next ((__m128i*)pm);
2270
2271         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2272         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2273         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2274
2275         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2276         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2277         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2278
2279         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2280                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2281         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2282                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2283
2284         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2285                             &xmm_mask_lo, &xmm_mask_hi,
2286                             &xmm_src_lo, &xmm_src_hi);
2287         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2288                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2289                             &xmm_mask_lo, &xmm_mask_hi);
2290
2291         negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2292
2293         pix_add_multiply_2x128 (
2294             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2295             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2296             &xmm_dst_lo, &xmm_dst_hi);
2297
2298         save_128_aligned (
2299             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2300
2301         ps += 4;
2302         pd += 4;
2303         pm += 4;
2304         w -= 4;
2305     }
2306
2307     while (w)
2308     {
2309         s = *ps++;
2310         m = *pm++;
2311         d = *pd;
2312
2313         *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2314         w--;
2315     }
2316 }
2317
2318 static force_inline uint32_t
2319 core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
2320                                          uint32_t mask,
2321                                          uint32_t dst)
2322 {
2323     __m64 m = unpack_32_1x64 (mask);
2324     __m64 s = unpack_32_1x64 (src);
2325     __m64 d = unpack_32_1x64 (dst);
2326
2327     __m64 da = negate_1x64 (expand_alpha_1x64 (d));
2328     __m64 sa = expand_alpha_1x64 (s);
2329
2330     s = pix_multiply_1x64 (s, m);
2331     m = pix_multiply_1x64 (m, sa);
2332
2333     return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
2334 }
2335
2336 static force_inline void
2337 core_combine_reverse_atop_ca_sse2 (uint32_t *      pd,
2338                                    const uint32_t *ps,
2339                                    const uint32_t *pm,
2340                                    int             w)
2341 {
2342     uint32_t s, m, d;
2343
2344     __m128i xmm_src_lo, xmm_src_hi;
2345     __m128i xmm_dst_lo, xmm_dst_hi;
2346     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2347     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2348     __m128i xmm_mask_lo, xmm_mask_hi;
2349
2350     /* call prefetch hint to optimize cache load*/
2351     cache_prefetch ((__m128i*)ps);
2352     cache_prefetch ((__m128i*)pd);
2353     cache_prefetch ((__m128i*)pm);
2354
2355     while (w && (unsigned long)pd & 15)
2356     {
2357         s = *ps++;
2358         m = *pm++;
2359         d = *pd;
2360
2361         *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2362         w--;
2363     }
2364
2365     /* call prefetch hint to optimize cache load*/
2366     cache_prefetch ((__m128i*)ps);
2367     cache_prefetch ((__m128i*)pd);
2368     cache_prefetch ((__m128i*)pm);
2369
2370     while (w >= 4)
2371     {
2372         /* fill cache line with next memory */
2373         cache_prefetch_next ((__m128i*)ps);
2374         cache_prefetch_next ((__m128i*)pd);
2375         cache_prefetch_next ((__m128i*)pm);
2376
2377         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2378         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2379         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2380
2381         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2382         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2383         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2384
2385         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2386                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2387         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2388                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2389
2390         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2391                             &xmm_mask_lo, &xmm_mask_hi,
2392                             &xmm_src_lo, &xmm_src_hi);
2393         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2394                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2395                             &xmm_mask_lo, &xmm_mask_hi);
2396
2397         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2398                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2399
2400         pix_add_multiply_2x128 (
2401             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2402             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2403             &xmm_dst_lo, &xmm_dst_hi);
2404
2405         save_128_aligned (
2406             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2407
2408         ps += 4;
2409         pd += 4;
2410         pm += 4;
2411         w -= 4;
2412     }
2413
2414     while (w)
2415     {
2416         s = *ps++;
2417         m = *pm++;
2418         d = *pd;
2419
2420         *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2421         w--;
2422     }
2423 }
2424
2425 static force_inline uint32_t
2426 core_combine_xor_ca_pixel_sse2 (uint32_t src,
2427                                 uint32_t mask,
2428                                 uint32_t dst)
2429 {
2430     __m64 a = unpack_32_1x64 (mask);
2431     __m64 s = unpack_32_1x64 (src);
2432     __m64 d = unpack_32_1x64 (dst);
2433
2434     __m64 alpha_dst = negate_1x64 (pix_multiply_1x64 (
2435                                        a, expand_alpha_1x64 (s)));
2436     __m64 dest      = pix_multiply_1x64 (s, a);
2437     __m64 alpha_src = negate_1x64 (expand_alpha_1x64 (d));
2438
2439     return pack_1x64_32 (pix_add_multiply_1x64 (&d,
2440                                                 &alpha_dst,
2441                                                 &dest,
2442                                                 &alpha_src));
2443 }
2444
2445 static force_inline void
2446 core_combine_xor_ca_sse2 (uint32_t *      pd,
2447                           const uint32_t *ps,
2448                           const uint32_t *pm,
2449                           int             w)
2450 {
2451     uint32_t s, m, d;
2452
2453     __m128i xmm_src_lo, xmm_src_hi;
2454     __m128i xmm_dst_lo, xmm_dst_hi;
2455     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2456     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2457     __m128i xmm_mask_lo, xmm_mask_hi;
2458
2459     /* call prefetch hint to optimize cache load*/
2460     cache_prefetch ((__m128i*)ps);
2461     cache_prefetch ((__m128i*)pd);
2462     cache_prefetch ((__m128i*)pm);
2463
2464     while (w && (unsigned long)pd & 15)
2465     {
2466         s = *ps++;
2467         m = *pm++;
2468         d = *pd;
2469
2470         *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2471         w--;
2472     }
2473
2474     /* call prefetch hint to optimize cache load*/
2475     cache_prefetch ((__m128i*)ps);
2476     cache_prefetch ((__m128i*)pd);
2477     cache_prefetch ((__m128i*)pm);
2478
2479     while (w >= 4)
2480     {
2481         /* fill cache line with next memory */
2482         cache_prefetch_next ((__m128i*)ps);
2483         cache_prefetch_next ((__m128i*)pd);
2484         cache_prefetch_next ((__m128i*)pm);
2485
2486         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2487         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2488         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2489
2490         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2491         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2492         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2493
2494         expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2495                             &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2496         expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2497                             &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2498
2499         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2500                             &xmm_mask_lo, &xmm_mask_hi,
2501                             &xmm_src_lo, &xmm_src_hi);
2502         pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2503                             &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2504                             &xmm_mask_lo, &xmm_mask_hi);
2505
2506         negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2507                       &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2508         negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2509                       &xmm_mask_lo, &xmm_mask_hi);
2510
2511         pix_add_multiply_2x128 (
2512             &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2513             &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2514             &xmm_dst_lo, &xmm_dst_hi);
2515
2516         save_128_aligned (
2517             (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2518
2519         ps += 4;
2520         pd += 4;
2521         pm += 4;
2522         w -= 4;
2523     }
2524
2525     while (w)
2526     {
2527         s = *ps++;
2528         m = *pm++;
2529         d = *pd;
2530
2531         *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2532         w--;
2533     }
2534 }
2535
2536 static force_inline void
2537 core_combine_add_ca_sse2 (uint32_t *      pd,
2538                           const uint32_t *ps,
2539                           const uint32_t *pm,
2540                           int             w)
2541 {
2542     uint32_t s, m, d;
2543
2544     __m128i xmm_src_lo, xmm_src_hi;
2545     __m128i xmm_dst_lo, xmm_dst_hi;
2546     __m128i xmm_mask_lo, xmm_mask_hi;
2547
2548     /* call prefetch hint to optimize cache load*/
2549     cache_prefetch ((__m128i*)ps);
2550     cache_prefetch ((__m128i*)pd);
2551     cache_prefetch ((__m128i*)pm);
2552
2553     while (w && (unsigned long)pd & 15)
2554     {
2555         s = *ps++;
2556         m = *pm++;
2557         d = *pd;
2558
2559         *pd++ = pack_1x64_32 (
2560             _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
2561                                              unpack_32_1x64 (m)),
2562                           unpack_32_1x64 (d)));
2563         w--;
2564     }
2565
2566     /* call prefetch hint to optimize cache load*/
2567     cache_prefetch ((__m128i*)ps);
2568     cache_prefetch ((__m128i*)pd);
2569     cache_prefetch ((__m128i*)pm);
2570
2571     while (w >= 4)
2572     {
2573         /* fill cache line with next memory */
2574         cache_prefetch_next ((__m128i*)ps);
2575         cache_prefetch_next ((__m128i*)pd);
2576         cache_prefetch_next ((__m128i*)pm);
2577
2578         xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2579         xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2580         xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2581
2582         unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2583         unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2584         unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2585
2586         pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2587                             &xmm_mask_lo, &xmm_mask_hi,
2588                             &xmm_src_lo, &xmm_src_hi);
2589
2590         save_128_aligned (
2591             (__m128i*)pd, pack_2x128_128 (
2592                 _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
2593                 _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
2594
2595         ps += 4;
2596         pd += 4;
2597         pm += 4;
2598         w -= 4;
2599     }
2600
2601     while (w)
2602     {
2603         s = *ps++;
2604         m = *pm++;
2605         d = *pd;
2606
2607         *pd++ = pack_1x64_32 (
2608             _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
2609                                              unpack_32_1x64 (m)),
2610                           unpack_32_1x64 (d)));
2611         w--;
2612     }
2613 }
2614
2615 /* ---------------------------------------------------
2616  * fb_compose_setup_sSE2
2617  */
2618 static force_inline __m64
2619 create_mask_16_64 (uint16_t mask)
2620 {
2621     return _mm_set1_pi16 (mask);
2622 }
2623
2624 static force_inline __m128i
2625 create_mask_16_128 (uint16_t mask)
2626 {
2627     return _mm_set1_epi16 (mask);
2628 }
2629
2630 static force_inline __m64
2631 create_mask_2x32_64 (uint32_t mask0,
2632                      uint32_t mask1)
2633 {
2634     return _mm_set_pi32 (mask0, mask1);
2635 }
2636
2637 static force_inline __m128i
2638 create_mask_2x32_128 (uint32_t mask0,
2639                       uint32_t mask1)
2640 {
2641     return _mm_set_epi32 (mask0, mask1, mask0, mask1);
2642 }
2643
2644 /* SSE2 code patch for fbcompose.c */
2645
2646 static void
2647 sse2_combine_over_u (pixman_implementation_t *imp,
2648                      pixman_op_t              op,
2649                      uint32_t *               dst,
2650                      const uint32_t *         src,
2651                      const uint32_t *         mask,
2652                      int                      width)
2653 {
2654     core_combine_over_u_sse2 (dst, src, mask, width);
2655     _mm_empty ();
2656 }
2657
2658 static void
2659 sse2_combine_over_reverse_u (pixman_implementation_t *imp,
2660                              pixman_op_t              op,
2661                              uint32_t *               dst,
2662                              const uint32_t *         src,
2663                              const uint32_t *         mask,
2664                              int                      width)
2665 {
2666     core_combine_over_reverse_u_sse2 (dst, src, mask, width);
2667     _mm_empty ();
2668 }
2669
2670 static void
2671 sse2_combine_in_u (pixman_implementation_t *imp,
2672                    pixman_op_t              op,
2673                    uint32_t *               dst,
2674                    const uint32_t *         src,
2675                    const uint32_t *         mask,
2676                    int                      width)
2677 {
2678     core_combine_in_u_sse2 (dst, src, mask, width);
2679     _mm_empty ();
2680 }
2681
2682 static void
2683 sse2_combine_in_reverse_u (pixman_implementation_t *imp,
2684                            pixman_op_t              op,
2685                            uint32_t *               dst,
2686                            const uint32_t *         src,
2687                            const uint32_t *         mask,
2688                            int                      width)
2689 {
2690     core_combine_reverse_in_u_sse2 (dst, src, mask, width);
2691     _mm_empty ();
2692 }
2693
2694 static void
2695 sse2_combine_out_u (pixman_implementation_t *imp,
2696                     pixman_op_t              op,
2697                     uint32_t *               dst,
2698                     const uint32_t *         src,
2699                     const uint32_t *         mask,
2700                     int                      width)
2701 {
2702     core_combine_out_u_sse2 (dst, src, mask, width);
2703     _mm_empty ();
2704 }
2705
2706 static void
2707 sse2_combine_out_reverse_u (pixman_implementation_t *imp,
2708                             pixman_op_t              op,
2709                             uint32_t *               dst,
2710                             const uint32_t *         src,
2711                             const uint32_t *         mask,
2712                             int                      width)
2713 {
2714     core_combine_reverse_out_u_sse2 (dst, src, mask, width);
2715     _mm_empty ();
2716 }
2717
2718 static void
2719 sse2_combine_atop_u (pixman_implementation_t *imp,
2720                      pixman_op_t              op,
2721                      uint32_t *               dst,
2722                      const uint32_t *         src,
2723                      const uint32_t *         mask,
2724                      int                      width)
2725 {
2726     core_combine_atop_u_sse2 (dst, src, mask, width);
2727     _mm_empty ();
2728 }
2729
2730 static void
2731 sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
2732                              pixman_op_t              op,
2733                              uint32_t *               dst,
2734                              const uint32_t *         src,
2735                              const uint32_t *         mask,
2736                              int                      width)
2737 {
2738     core_combine_reverse_atop_u_sse2 (dst, src, mask, width);
2739     _mm_empty ();
2740 }
2741
2742 static void
2743 sse2_combine_xor_u (pixman_implementation_t *imp,
2744                     pixman_op_t              op,
2745                     uint32_t *               dst,
2746                     const uint32_t *         src,
2747                     const uint32_t *         mask,
2748                     int                      width)
2749 {
2750     core_combine_xor_u_sse2 (dst, src, mask, width);
2751     _mm_empty ();
2752 }
2753
2754 static void
2755 sse2_combine_add_u (pixman_implementation_t *imp,
2756                     pixman_op_t              op,
2757                     uint32_t *               dst,
2758                     const uint32_t *         src,
2759                     const uint32_t *         mask,
2760                     int                      width)
2761 {
2762     core_combine_add_u_sse2 (dst, src, mask, width);
2763     _mm_empty ();
2764 }
2765
2766 static void
2767 sse2_combine_saturate_u (pixman_implementation_t *imp,
2768                          pixman_op_t              op,
2769                          uint32_t *               dst,
2770                          const uint32_t *         src,
2771                          const uint32_t *         mask,
2772                          int                      width)
2773 {
2774     core_combine_saturate_u_sse2 (dst, src, mask, width);
2775     _mm_empty ();
2776 }
2777
2778 static void
2779 sse2_combine_src_ca (pixman_implementation_t *imp,
2780                      pixman_op_t              op,
2781                      uint32_t *               dst,
2782                      const uint32_t *         src,
2783                      const uint32_t *         mask,
2784                      int                      width)
2785 {
2786     core_combine_src_ca_sse2 (dst, src, mask, width);
2787     _mm_empty ();
2788 }
2789
2790 static void
2791 sse2_combine_over_ca (pixman_implementation_t *imp,
2792                       pixman_op_t              op,
2793                       uint32_t *               dst,
2794                       const uint32_t *         src,
2795                       const uint32_t *         mask,
2796                       int                      width)
2797 {
2798     core_combine_over_ca_sse2 (dst, src, mask, width);
2799     _mm_empty ();
2800 }
2801
2802 static void
2803 sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
2804                               pixman_op_t              op,
2805                               uint32_t *               dst,
2806                               const uint32_t *         src,
2807                               const uint32_t *         mask,
2808                               int                      width)
2809 {
2810     core_combine_over_reverse_ca_sse2 (dst, src, mask, width);
2811     _mm_empty ();
2812 }
2813
2814 static void
2815 sse2_combine_in_ca (pixman_implementation_t *imp,
2816                     pixman_op_t              op,
2817                     uint32_t *               dst,
2818                     const uint32_t *         src,
2819                     const uint32_t *         mask,
2820                     int                      width)
2821 {
2822     core_combine_in_ca_sse2 (dst, src, mask, width);
2823     _mm_empty ();
2824 }
2825
2826 static void
2827 sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
2828                             pixman_op_t              op,
2829                             uint32_t *               dst,
2830                             const uint32_t *         src,
2831                             const uint32_t *         mask,
2832                             int                      width)
2833 {
2834     core_combine_in_reverse_ca_sse2 (dst, src, mask, width);
2835     _mm_empty ();
2836 }
2837
2838 static void
2839 sse2_combine_out_ca (pixman_implementation_t *imp,
2840                      pixman_op_t              op,
2841                      uint32_t *               dst,
2842                      const uint32_t *         src,
2843                      const uint32_t *         mask,
2844                      int                      width)
2845 {
2846     core_combine_out_ca_sse2 (dst, src, mask, width);
2847     _mm_empty ();
2848 }
2849
2850 static void
2851 sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
2852                              pixman_op_t              op,
2853                              uint32_t *               dst,
2854                              const uint32_t *         src,
2855                              const uint32_t *         mask,
2856                              int                      width)
2857 {
2858     core_combine_out_reverse_ca_sse2 (dst, src, mask, width);
2859     _mm_empty ();
2860 }
2861
2862 static void
2863 sse2_combine_atop_ca (pixman_implementation_t *imp,
2864                       pixman_op_t              op,
2865                       uint32_t *               dst,
2866                       const uint32_t *         src,
2867                       const uint32_t *         mask,
2868                       int                      width)
2869 {
2870     core_combine_atop_ca_sse2 (dst, src, mask, width);
2871     _mm_empty ();
2872 }
2873
2874 static void
2875 sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
2876                               pixman_op_t              op,
2877                               uint32_t *               dst,
2878                               const uint32_t *         src,
2879                               const uint32_t *         mask,
2880                               int                      width)
2881 {
2882     core_combine_reverse_atop_ca_sse2 (dst, src, mask, width);
2883     _mm_empty ();
2884 }
2885
2886 static void
2887 sse2_combine_xor_ca (pixman_implementation_t *imp,
2888                      pixman_op_t              op,
2889                      uint32_t *               dst,
2890                      const uint32_t *         src,
2891                      const uint32_t *         mask,
2892                      int                      width)
2893 {
2894     core_combine_xor_ca_sse2 (dst, src, mask, width);
2895     _mm_empty ();
2896 }
2897
2898 static void
2899 sse2_combine_add_ca (pixman_implementation_t *imp,
2900                      pixman_op_t              op,
2901                      uint32_t *               dst,
2902                      const uint32_t *         src,
2903                      const uint32_t *         mask,
2904                      int                      width)
2905 {
2906     core_combine_add_ca_sse2 (dst, src, mask, width);
2907     _mm_empty ();
2908 }
2909
2910 /* -------------------------------------------------------------------
2911  * composite_over_n_8888
2912  */
2913
2914 static void
2915 sse2_composite_over_n_8888 (pixman_implementation_t *imp,
2916                             pixman_op_t              op,
2917                             pixman_image_t *         src_image,
2918                             pixman_image_t *         mask_image,
2919                             pixman_image_t *         dst_image,
2920                             int32_t                  src_x,
2921                             int32_t                  src_y,
2922                             int32_t                  mask_x,
2923                             int32_t                  mask_y,
2924                             int32_t                  dest_x,
2925                             int32_t                  dest_y,
2926                             int32_t                  width,
2927                             int32_t                  height)
2928 {
2929     uint32_t src;
2930     uint32_t    *dst_line, *dst, d;
2931     uint16_t w;
2932     int dst_stride;
2933     __m128i xmm_src, xmm_alpha;
2934     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2935
2936     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
2937
2938     if (src == 0)
2939         return;
2940
2941     PIXMAN_IMAGE_GET_LINE (
2942         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2943
2944     xmm_src = expand_pixel_32_1x128 (src);
2945     xmm_alpha = expand_alpha_1x128 (xmm_src);
2946
2947     while (height--)
2948     {
2949         dst = dst_line;
2950
2951         /* call prefetch hint to optimize cache load*/
2952         cache_prefetch ((__m128i*)dst);
2953
2954         dst_line += dst_stride;
2955         w = width;
2956
2957         while (w && (unsigned long)dst & 15)
2958         {
2959             d = *dst;
2960             *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
2961                                               _mm_movepi64_pi64 (xmm_alpha),
2962                                               unpack_32_1x64 (d)));
2963             w--;
2964         }
2965
2966         cache_prefetch ((__m128i*)dst);
2967
2968         while (w >= 4)
2969         {
2970             /* fill cache line with next memory */
2971             cache_prefetch_next ((__m128i*)dst);
2972
2973             xmm_dst = load_128_aligned ((__m128i*)dst);
2974
2975             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2976
2977             over_2x128 (&xmm_src, &xmm_src,
2978                         &xmm_alpha, &xmm_alpha,
2979                         &xmm_dst_lo, &xmm_dst_hi);
2980
2981             /* rebuid the 4 pixel data and save*/
2982             save_128_aligned (
2983                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2984
2985             w -= 4;
2986             dst += 4;
2987         }
2988
2989         while (w)
2990         {
2991             d = *dst;
2992             *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
2993                                               _mm_movepi64_pi64 (xmm_alpha),
2994                                               unpack_32_1x64 (d)));
2995             w--;
2996         }
2997
2998     }
2999     _mm_empty ();
3000 }
3001
3002 /* ---------------------------------------------------------------------
3003  * composite_over_n_0565
3004  */
3005 static void
3006 sse2_composite_over_n_0565 (pixman_implementation_t *imp,
3007                             pixman_op_t              op,
3008                             pixman_image_t *         src_image,
3009                             pixman_image_t *         mask_image,
3010                             pixman_image_t *         dst_image,
3011                             int32_t                  src_x,
3012                             int32_t                  src_y,
3013                             int32_t                  mask_x,
3014                             int32_t                  mask_y,
3015                             int32_t                  dest_x,
3016                             int32_t                  dest_y,
3017                             int32_t                  width,
3018                             int32_t                  height)
3019 {
3020     uint32_t src;
3021     uint16_t    *dst_line, *dst, d;
3022     uint16_t w;
3023     int dst_stride;
3024     __m128i xmm_src, xmm_alpha;
3025     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3026
3027     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3028
3029     if (src == 0)
3030         return;
3031
3032     PIXMAN_IMAGE_GET_LINE (
3033         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3034
3035     xmm_src = expand_pixel_32_1x128 (src);
3036     xmm_alpha = expand_alpha_1x128 (xmm_src);
3037
3038     while (height--)
3039     {
3040         dst = dst_line;
3041
3042         /* call prefetch hint to optimize cache load*/
3043         cache_prefetch ((__m128i*)dst);
3044
3045         dst_line += dst_stride;
3046         w = width;
3047
3048         while (w && (unsigned long)dst & 15)
3049         {
3050             d = *dst;
3051
3052             *dst++ = pack_565_32_16 (
3053                 pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
3054                                          _mm_movepi64_pi64 (xmm_alpha),
3055                                          expand565_16_1x64 (d))));
3056             w--;
3057         }
3058
3059         /* call prefetch hint to optimize cache load*/
3060         cache_prefetch ((__m128i*)dst);
3061
3062         while (w >= 8)
3063         {
3064             /* fill cache line with next memory */
3065             cache_prefetch_next ((__m128i*)dst);
3066
3067             xmm_dst = load_128_aligned ((__m128i*)dst);
3068
3069             unpack_565_128_4x128 (xmm_dst,
3070                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3071
3072             over_2x128 (&xmm_src, &xmm_src,
3073                         &xmm_alpha, &xmm_alpha,
3074                         &xmm_dst0, &xmm_dst1);
3075             over_2x128 (&xmm_src, &xmm_src,
3076                         &xmm_alpha, &xmm_alpha,
3077                         &xmm_dst2, &xmm_dst3);
3078
3079             xmm_dst = pack_565_4x128_128 (
3080                 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3081
3082             save_128_aligned ((__m128i*)dst, xmm_dst);
3083
3084             dst += 8;
3085             w -= 8;
3086         }
3087
3088         while (w--)
3089         {
3090             d = *dst;
3091             *dst++ = pack_565_32_16 (
3092                 pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
3093                                          _mm_movepi64_pi64 (xmm_alpha),
3094                                          expand565_16_1x64 (d))));
3095         }
3096     }
3097
3098     _mm_empty ();
3099 }
3100
3101 /* ------------------------------
3102  * composite_add_n_8888_8888_ca
3103  */
3104 static void
3105 sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
3106                                    pixman_op_t              op,
3107                                    pixman_image_t *         src_image,
3108                                    pixman_image_t *         mask_image,
3109                                    pixman_image_t *         dst_image,
3110                                    int32_t                  src_x,
3111                                    int32_t                  src_y,
3112                                    int32_t                  mask_x,
3113                                    int32_t                  mask_y,
3114                                    int32_t                  dest_x,
3115                                    int32_t                  dest_y,
3116                                    int32_t                  width,
3117                                    int32_t                  height)
3118 {
3119     uint32_t src, srca;
3120     uint32_t    *dst_line, d;
3121     uint32_t    *mask_line, m;
3122     uint32_t pack_cmp;
3123     int dst_stride, mask_stride;
3124
3125     __m128i xmm_src, xmm_alpha;
3126     __m128i xmm_dst;
3127     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3128
3129     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3130
3131     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3132     srca = src >> 24;
3133     
3134     if (src == 0)
3135         return;
3136
3137     PIXMAN_IMAGE_GET_LINE (
3138         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3139     PIXMAN_IMAGE_GET_LINE (
3140         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
3141
3142     xmm_src = _mm_unpacklo_epi8 (
3143         create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
3144     xmm_alpha = expand_alpha_1x128 (xmm_src);
3145     mmx_src   = _mm_movepi64_pi64 (xmm_src);
3146     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3147
3148     while (height--)
3149     {
3150         int w = width;
3151         const uint32_t *pm = (uint32_t *)mask_line;
3152         uint32_t *pd = (uint32_t *)dst_line;
3153
3154         dst_line += dst_stride;
3155         mask_line += mask_stride;
3156
3157         /* call prefetch hint to optimize cache load*/
3158         cache_prefetch ((__m128i*)pd);
3159         cache_prefetch ((__m128i*)pm);
3160
3161         while (w && (unsigned long)pd & 15)
3162         {
3163             m = *pm++;
3164
3165             if (m)
3166             {
3167                 d = *pd;
3168                 
3169                 mmx_mask = unpack_32_1x64 (m);
3170                 mmx_dest = unpack_32_1x64 (d);
3171
3172                 *pd = pack_1x64_32 (
3173                     _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
3174             }
3175
3176             pd++;
3177             w--;
3178         }
3179
3180         /* call prefetch hint to optimize cache load*/
3181         cache_prefetch ((__m128i*)pd);
3182         cache_prefetch ((__m128i*)pm);
3183
3184         while (w >= 4)
3185         {
3186             /* fill cache line with next memory */
3187             cache_prefetch_next ((__m128i*)pd);
3188             cache_prefetch_next ((__m128i*)pm);
3189
3190             xmm_mask = load_128_unaligned ((__m128i*)pm);
3191
3192             pack_cmp =
3193                 _mm_movemask_epi8 (
3194                     _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
3195
3196             /* if all bits in mask are zero, pack_cmp are equal to 0xffff */
3197             if (pack_cmp != 0xffff)
3198             {
3199                 xmm_dst = load_128_aligned ((__m128i*)pd);
3200
3201                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3202
3203                 pix_multiply_2x128 (&xmm_src, &xmm_src,
3204                                     &xmm_mask_lo, &xmm_mask_hi,
3205                                     &xmm_mask_lo, &xmm_mask_hi);
3206                 xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
3207                 
3208                 save_128_aligned (
3209                     (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
3210             }
3211
3212             pd += 4;
3213             pm += 4;
3214             w -= 4;
3215         }
3216
3217         while (w)
3218         {
3219             m = *pm++;
3220
3221             if (m)
3222             {
3223                 d = *pd;
3224                 
3225                 mmx_mask = unpack_32_1x64 (m);
3226                 mmx_dest = unpack_32_1x64 (d);
3227
3228                 *pd = pack_1x64_32 (
3229                     _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
3230             }
3231
3232             pd++;
3233             w--;
3234         }
3235     }
3236
3237     _mm_empty ();
3238 }
3239
3240 /* ---------------------------------------------------------------------------
3241  * composite_over_n_8888_8888_ca
3242  */
3243
3244 static void
3245 sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
3246                                     pixman_op_t              op,
3247                                     pixman_image_t *         src_image,
3248                                     pixman_image_t *         mask_image,
3249                                     pixman_image_t *         dst_image,
3250                                     int32_t                  src_x,
3251                                     int32_t                  src_y,
3252                                     int32_t                  mask_x,
3253                                     int32_t                  mask_y,
3254                                     int32_t                  dest_x,
3255                                     int32_t                  dest_y,
3256                                     int32_t                  width,
3257                                     int32_t                  height)
3258 {
3259     uint32_t src;
3260     uint32_t    *dst_line, d;
3261     uint32_t    *mask_line, m;
3262     uint32_t pack_cmp;
3263     int dst_stride, mask_stride;
3264
3265     __m128i xmm_src, xmm_alpha;
3266     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3267     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3268
3269     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3270
3271     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3272
3273     if (src == 0)
3274         return;
3275
3276     PIXMAN_IMAGE_GET_LINE (
3277         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3278     PIXMAN_IMAGE_GET_LINE (
3279         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
3280
3281     xmm_src = _mm_unpacklo_epi8 (
3282         create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
3283     xmm_alpha = expand_alpha_1x128 (xmm_src);
3284     mmx_src   = _mm_movepi64_pi64 (xmm_src);
3285     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3286
3287     while (height--)
3288     {
3289         int w = width;
3290         const uint32_t *pm = (uint32_t *)mask_line;
3291         uint32_t *pd = (uint32_t *)dst_line;
3292
3293         dst_line += dst_stride;
3294         mask_line += mask_stride;
3295
3296         /* call prefetch hint to optimize cache load*/
3297         cache_prefetch ((__m128i*)pd);
3298         cache_prefetch ((__m128i*)pm);
3299
3300         while (w && (unsigned long)pd & 15)
3301         {
3302             m = *pm++;
3303
3304             if (m)
3305             {
3306                 d = *pd;
3307                 mmx_mask = unpack_32_1x64 (m);
3308                 mmx_dest = unpack_32_1x64 (d);
3309
3310                 *pd = pack_1x64_32 (in_over_1x64 (&mmx_src,
3311                                                   &mmx_alpha,
3312                                                   &mmx_mask,
3313                                                   &mmx_dest));
3314             }
3315
3316             pd++;
3317             w--;
3318         }
3319
3320         /* call prefetch hint to optimize cache load*/
3321         cache_prefetch ((__m128i*)pd);
3322         cache_prefetch ((__m128i*)pm);
3323
3324         while (w >= 4)
3325         {
3326             /* fill cache line with next memory */
3327             cache_prefetch_next ((__m128i*)pd);
3328             cache_prefetch_next ((__m128i*)pm);
3329
3330             xmm_mask = load_128_unaligned ((__m128i*)pm);
3331
3332             pack_cmp =
3333                 _mm_movemask_epi8 (
3334                     _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
3335
3336             /* if all bits in mask are zero, pack_cmp are equal to 0xffff */
3337             if (pack_cmp != 0xffff)
3338             {
3339                 xmm_dst = load_128_aligned ((__m128i*)pd);
3340
3341                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3342                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3343
3344                 in_over_2x128 (&xmm_src, &xmm_src,
3345                                &xmm_alpha, &xmm_alpha,
3346                                &xmm_mask_lo, &xmm_mask_hi,
3347                                &xmm_dst_lo, &xmm_dst_hi);
3348
3349                 save_128_aligned (
3350                     (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3351             }
3352
3353             pd += 4;
3354             pm += 4;
3355             w -= 4;
3356         }
3357
3358         while (w)
3359         {
3360             m = *pm++;
3361
3362             if (m)
3363             {
3364                 d = *pd;
3365                 mmx_mask = unpack_32_1x64 (m);
3366                 mmx_dest = unpack_32_1x64 (d);
3367
3368                 *pd = pack_1x64_32 (
3369                     in_over_1x64 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
3370             }
3371
3372             pd++;
3373             w--;
3374         }
3375     }
3376
3377     _mm_empty ();
3378 }
3379
3380 /*---------------------------------------------------------------------
3381  * composite_over_8888_n_8888
3382  */
3383
3384 static void
3385 sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
3386                                  pixman_op_t              op,
3387                                  pixman_image_t *         src_image,
3388                                  pixman_image_t *         mask_image,
3389                                  pixman_image_t *         dst_image,
3390                                  int32_t                  src_x,
3391                                  int32_t                  src_y,
3392                                  int32_t                  mask_x,
3393                                  int32_t                  mask_y,
3394                                  int32_t                  dest_x,
3395                                  int32_t                  dest_y,
3396                                  int32_t                  width,
3397                                  int32_t                  height)
3398 {
3399     uint32_t    *dst_line, *dst;
3400     uint32_t    *src_line, *src;
3401     uint32_t mask;
3402     uint16_t w;
3403     int dst_stride, src_stride;
3404
3405     __m128i xmm_mask;
3406     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3407     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3408     __m128i xmm_alpha_lo, xmm_alpha_hi;
3409
3410     PIXMAN_IMAGE_GET_LINE (
3411         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3412     PIXMAN_IMAGE_GET_LINE (
3413         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3414
3415     mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
3416
3417     xmm_mask = create_mask_16_128 (mask >> 24);
3418
3419     while (height--)
3420     {
3421         dst = dst_line;
3422         dst_line += dst_stride;
3423         src = src_line;
3424         src_line += src_stride;
3425         w = width;
3426
3427         /* call prefetch hint to optimize cache load*/
3428         cache_prefetch ((__m128i*)dst);
3429         cache_prefetch ((__m128i*)src);
3430
3431         while (w && (unsigned long)dst & 15)
3432         {
3433             uint32_t s = *src++;
3434             uint32_t d = *dst;
3435
3436             __m64 ms = unpack_32_1x64 (s);
3437             __m64 alpha    = expand_alpha_1x64 (ms);
3438             __m64 dest     = _mm_movepi64_pi64 (xmm_mask);
3439             __m64 alpha_dst = unpack_32_1x64 (d);
3440
3441             *dst++ = pack_1x64_32 (
3442                 in_over_1x64 (&ms, &alpha, &dest, &alpha_dst));
3443
3444             w--;
3445         }
3446
3447         /* call prefetch hint to optimize cache load*/
3448         cache_prefetch ((__m128i*)dst);
3449         cache_prefetch ((__m128i*)src);
3450
3451         while (w >= 4)
3452         {
3453             /* fill cache line with next memory */
3454             cache_prefetch_next ((__m128i*)dst);
3455             cache_prefetch_next ((__m128i*)src);
3456
3457             xmm_src = load_128_unaligned ((__m128i*)src);
3458             xmm_dst = load_128_aligned ((__m128i*)dst);
3459
3460             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3461             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3462             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3463                                 &xmm_alpha_lo, &xmm_alpha_hi);
3464
3465             in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3466                            &xmm_alpha_lo, &xmm_alpha_hi,
3467                            &xmm_mask, &xmm_mask,
3468                            &xmm_dst_lo, &xmm_dst_hi);
3469
3470             save_128_aligned (
3471                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3472
3473             dst += 4;
3474             src += 4;
3475             w -= 4;
3476         }
3477
3478         while (w)
3479         {
3480             uint32_t s = *src++;
3481             uint32_t d = *dst;
3482
3483             __m64 ms = unpack_32_1x64 (s);
3484             __m64 alpha = expand_alpha_1x64 (ms);
3485             __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
3486             __m64 dest  = unpack_32_1x64 (d);
3487
3488             *dst++ = pack_1x64_32 (
3489                 in_over_1x64 (&ms, &alpha, &mask, &dest));
3490
3491             w--;
3492         }
3493     }
3494
3495     _mm_empty ();
3496 }
3497
3498 /* ---------------------------------------------------------------------
3499  * composite_over_x888_n_8888
3500  */
3501 static void
3502 sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
3503                                  pixman_op_t              op,
3504                                  pixman_image_t *         src_image,
3505                                  pixman_image_t *         mask_image,
3506                                  pixman_image_t *         dst_image,
3507                                  int32_t                  src_x,
3508                                  int32_t                  src_y,
3509                                  int32_t                  mask_x,
3510                                  int32_t                  mask_y,
3511                                  int32_t                  dest_x,
3512                                  int32_t                  dest_y,
3513                                  int32_t                  width,
3514                                  int32_t                  height)
3515 {
3516     uint32_t    *dst_line, *dst;
3517     uint32_t    *src_line, *src;
3518     uint32_t mask;
3519     int dst_stride, src_stride;
3520     uint16_t w;
3521
3522     __m128i xmm_mask, xmm_alpha;
3523     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3524     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3525
3526     PIXMAN_IMAGE_GET_LINE (
3527         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3528     PIXMAN_IMAGE_GET_LINE (
3529         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3530
3531     mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
3532
3533     xmm_mask = create_mask_16_128 (mask >> 24);
3534     xmm_alpha = mask_00ff;
3535
3536     while (height--)
3537     {
3538         dst = dst_line;
3539         dst_line += dst_stride;
3540         src = src_line;
3541         src_line += src_stride;
3542         w = width;
3543
3544         /* call prefetch hint to optimize cache load*/
3545         cache_prefetch ((__m128i*)dst);
3546         cache_prefetch ((__m128i*)src);
3547
3548         while (w && (unsigned long)dst & 15)
3549         {
3550             uint32_t s = (*src++) | 0xff000000;
3551             uint32_t d = *dst;
3552
3553             __m64 src   = unpack_32_1x64 (s);
3554             __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
3555             __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
3556             __m64 dest  = unpack_32_1x64 (d);
3557
3558             *dst++ = pack_1x64_32 (
3559                 in_over_1x64 (&src, &alpha, &mask, &dest));
3560
3561             w--;
3562         }
3563
3564         /* call prefetch hint to optimize cache load*/
3565         cache_prefetch ((__m128i*)dst);
3566         cache_prefetch ((__m128i*)src);
3567
3568         while (w >= 4)
3569         {
3570             /* fill cache line with next memory */
3571             cache_prefetch_next ((__m128i*)dst);
3572             cache_prefetch_next ((__m128i*)src);
3573
3574             xmm_src = _mm_or_si128 (
3575                 load_128_unaligned ((__m128i*)src), mask_ff000000);
3576             xmm_dst = load_128_aligned ((__m128i*)dst);
3577
3578             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3579             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3580
3581             in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3582                            &xmm_alpha, &xmm_alpha,
3583                            &xmm_mask, &xmm_mask,
3584                            &xmm_dst_lo, &xmm_dst_hi);
3585
3586             save_128_aligned (
3587                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3588
3589             dst += 4;
3590             src += 4;
3591             w -= 4;
3592
3593         }
3594
3595         while (w)
3596         {
3597             uint32_t s = (*src++) | 0xff000000;
3598             uint32_t d = *dst;
3599
3600             __m64 src  = unpack_32_1x64 (s);
3601             __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
3602             __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
3603             __m64 dest  = unpack_32_1x64 (d);
3604
3605             *dst++ = pack_1x64_32 (
3606                 in_over_1x64 (&src, &alpha, &mask, &dest));
3607
3608             w--;
3609         }
3610     }
3611
3612     _mm_empty ();
3613 }
3614
3615 /* --------------------------------------------------------------------
3616  * composite_over_8888_8888
3617  */
3618 static void
3619 sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
3620                                pixman_op_t              op,
3621                                pixman_image_t *         src_image,
3622                                pixman_image_t *         mask_image,
3623                                pixman_image_t *         dst_image,
3624                                int32_t                  src_x,
3625                                int32_t                  src_y,
3626                                int32_t                  mask_x,
3627                                int32_t                  mask_y,
3628                                int32_t                  dest_x,
3629                                int32_t                  dest_y,
3630                                int32_t                  width,
3631                                int32_t                  height)
3632 {
3633     int dst_stride, src_stride;
3634     uint32_t    *dst_line, *dst;
3635     uint32_t    *src_line, *src;
3636
3637     PIXMAN_IMAGE_GET_LINE (
3638         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3639     PIXMAN_IMAGE_GET_LINE (
3640         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3641
3642     dst = dst_line;
3643     src = src_line;
3644
3645     while (height--)
3646     {
3647         core_combine_over_u_sse2 (dst, src, NULL, width);
3648
3649         dst += dst_stride;
3650         src += src_stride;
3651     }
3652     _mm_empty ();
3653 }
3654
3655 /* ------------------------------------------------------------------
3656  * composite_over_8888_0565
3657  */
3658 static force_inline uint16_t
3659 composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
3660 {
3661     __m64 ms;
3662
3663     ms = unpack_32_1x64 (src);
3664     return pack_565_32_16 (
3665         pack_1x64_32 (
3666             over_1x64 (
3667                 ms, expand_alpha_1x64 (ms), expand565_16_1x64 (dst))));
3668 }
3669
3670 static void
3671 sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
3672                                pixman_op_t              op,
3673                                pixman_image_t *         src_image,
3674                                pixman_image_t *         mask_image,
3675                                pixman_image_t *         dst_image,
3676                                int32_t                  src_x,
3677                                int32_t                  src_y,
3678                                int32_t                  mask_x,
3679                                int32_t                  mask_y,
3680                                int32_t                  dest_x,
3681                                int32_t                  dest_y,
3682                                int32_t                  width,
3683                                int32_t                  height)
3684 {
3685     uint16_t    *dst_line, *dst, d;
3686     uint32_t    *src_line, *src, s;
3687     int dst_stride, src_stride;
3688     uint16_t w;
3689
3690     __m128i xmm_alpha_lo, xmm_alpha_hi;
3691     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3692     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3693
3694     PIXMAN_IMAGE_GET_LINE (
3695         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3696     PIXMAN_IMAGE_GET_LINE (
3697         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3698
3699 #if 0
3700     /* FIXME
3701      *
3702      * I copy the code from MMX one and keep the fixme.
3703      * If it's a problem there, probably is a problem here.
3704      */
3705     assert (src_image->drawable == mask_image->drawable);
3706 #endif
3707
3708     while (height--)
3709     {
3710         dst = dst_line;
3711         src = src_line;
3712
3713         /* call prefetch hint to optimize cache load*/
3714         cache_prefetch ((__m128i*)src);
3715         cache_prefetch ((__m128i*)dst);
3716
3717         dst_line += dst_stride;
3718         src_line += src_stride;
3719         w = width;
3720
3721         /* Align dst on a 16-byte boundary */
3722         while (w &&
3723                ((unsigned long)dst & 15))
3724         {
3725             s = *src++;
3726             d = *dst;
3727
3728             *dst++ = composite_over_8888_0565pixel (s, d);
3729             w--;
3730         }
3731
3732         /* call prefetch hint to optimize cache load*/
3733         cache_prefetch ((__m128i*)src);
3734         cache_prefetch ((__m128i*)dst);
3735
3736         /* It's a 8 pixel loop */
3737         while (w >= 8)
3738         {
3739             /* fill cache line with next memory */
3740             cache_prefetch_next ((__m128i*)src);
3741             cache_prefetch_next ((__m128i*)dst);
3742
3743             /* I'm loading unaligned because I'm not sure
3744              * about the address alignment.
3745              */
3746             xmm_src = load_128_unaligned ((__m128i*) src);
3747             xmm_dst = load_128_aligned ((__m128i*) dst);
3748
3749             /* Unpacking */
3750             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3751             unpack_565_128_4x128 (xmm_dst,
3752                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3753             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3754                                 &xmm_alpha_lo, &xmm_alpha_hi);
3755
3756             /* I'm loading next 4 pixels from memory
3757              * before to optimze the memory read.
3758              */
3759             xmm_src = load_128_unaligned ((__m128i*) (src + 4));
3760
3761             over_2x128 (&xmm_src_lo, &xmm_src_hi,
3762                         &xmm_alpha_lo, &xmm_alpha_hi,
3763                         &xmm_dst0, &xmm_dst1);
3764
3765             /* Unpacking */
3766             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3767             expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3768                                 &xmm_alpha_lo, &xmm_alpha_hi);
3769
3770             over_2x128 (&xmm_src_lo, &xmm_src_hi,
3771                         &xmm_alpha_lo, &xmm_alpha_hi,
3772                         &xmm_dst2, &xmm_dst3);
3773
3774             save_128_aligned (
3775                 (__m128i*)dst, pack_565_4x128_128 (
3776                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3777
3778             w -= 8;
3779             dst += 8;
3780             src += 8;
3781         }
3782
3783         while (w--)
3784         {
3785             s = *src++;
3786             d = *dst;
3787
3788             *dst++ = composite_over_8888_0565pixel (s, d);
3789         }
3790     }
3791
3792     _mm_empty ();
3793 }
3794
3795 /* -----------------------------------------------------------------
3796  * composite_over_n_8_8888
3797  */
3798
3799 static void
3800 sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
3801                               pixman_op_t              op,
3802                               pixman_image_t *         src_image,
3803                               pixman_image_t *         mask_image,
3804                               pixman_image_t *         dst_image,
3805                               int32_t                  src_x,
3806                               int32_t                  src_y,
3807                               int32_t                  mask_x,
3808                               int32_t                  mask_y,
3809                               int32_t                  dest_x,
3810                               int32_t                  dest_y,
3811                               int32_t                  width,
3812                               int32_t                  height)
3813 {
3814     uint32_t src, srca;
3815     uint32_t *dst_line, *dst;
3816     uint8_t *mask_line, *mask;
3817     int dst_stride, mask_stride;
3818     uint16_t w;
3819     uint32_t m, d;
3820
3821     __m128i xmm_src, xmm_alpha, xmm_def;
3822     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3823     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3824
3825     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3826
3827     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
3828
3829     srca = src >> 24;
3830     if (src == 0)
3831         return;
3832
3833     PIXMAN_IMAGE_GET_LINE (
3834         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3835     PIXMAN_IMAGE_GET_LINE (
3836         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3837
3838     xmm_def = create_mask_2x32_128 (src, src);
3839     xmm_src = expand_pixel_32_1x128 (src);
3840     xmm_alpha = expand_alpha_1x128 (xmm_src);
3841     mmx_src   = _mm_movepi64_pi64 (xmm_src);
3842     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
3843
3844     while (height--)
3845     {
3846         dst = dst_line;
3847         dst_line += dst_stride;
3848         mask = mask_line;
3849         mask_line += mask_stride;
3850         w = width;
3851
3852         /* call prefetch hint to optimize cache load*/
3853         cache_prefetch ((__m128i*)mask);
3854         cache_prefetch ((__m128i*)dst);
3855
3856         while (w && (unsigned long)dst & 15)
3857         {
3858             uint8_t m = *mask++;
3859
3860             if (m)
3861             {
3862                 d = *dst;
3863                 mmx_mask = expand_pixel_8_1x64 (m);
3864                 mmx_dest = unpack_32_1x64 (d);
3865
3866                 *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
3867                                                    &mmx_alpha,
3868                                                    &mmx_mask,
3869                                                    &mmx_dest));
3870             }
3871
3872             w--;
3873             dst++;
3874         }
3875
3876         /* call prefetch hint to optimize cache load*/
3877         cache_prefetch ((__m128i*)mask);
3878         cache_prefetch ((__m128i*)dst);
3879
3880         while (w >= 4)
3881         {
3882             /* fill cache line with next memory */
3883             cache_prefetch_next ((__m128i*)mask);
3884             cache_prefetch_next ((__m128i*)dst);
3885
3886             m = *((uint32_t*)mask);
3887
3888             if (srca == 0xff && m == 0xffffffff)
3889             {
3890                 save_128_aligned ((__m128i*)dst, xmm_def);
3891             }
3892             else if (m)
3893             {
3894                 xmm_dst = load_128_aligned ((__m128i*) dst);
3895                 xmm_mask = unpack_32_1x128 (m);
3896                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3897
3898                 /* Unpacking */
3899                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3900                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3901
3902                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3903                                         &xmm_mask_lo, &xmm_mask_hi);
3904
3905                 in_over_2x128 (&xmm_src, &xmm_src,
3906                                &xmm_alpha, &xmm_alpha,
3907                                &xmm_mask_lo, &xmm_mask_hi,
3908                                &xmm_dst_lo, &xmm_dst_hi);
3909
3910                 save_128_aligned (
3911                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3912             }
3913
3914             w -= 4;
3915             dst += 4;
3916             mask += 4;
3917         }
3918
3919         while (w)
3920         {
3921             uint8_t m = *mask++;
3922
3923             if (m)
3924             {
3925                 d = *dst;
3926                 mmx_mask = expand_pixel_8_1x64 (m);
3927                 mmx_dest = unpack_32_1x64 (d);
3928
3929                 *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
3930                                                    &mmx_alpha,
3931                                                    &mmx_mask,
3932                                                    &mmx_dest));
3933             }
3934
3935             w--;
3936             dst++;
3937         }
3938     }
3939
3940     _mm_empty ();
3941 }
3942
3943 /* ----------------------------------------------------------------
3944  * composite_over_n_8_8888
3945  */
3946
3947 pixman_bool_t
3948 pixman_fill_sse2 (uint32_t *bits,
3949                   int       stride,
3950                   int       bpp,
3951                   int       x,
3952                   int       y,
3953                   int       width,
3954                   int       height,
3955                   uint32_t  data)
3956 {
3957     uint32_t byte_width;
3958     uint8_t         *byte_line;
3959
3960     __m128i xmm_def;
3961
3962     if (bpp == 16 && (data >> 16 != (data & 0xffff)))
3963         return FALSE;
3964
3965     if (bpp != 16 && bpp != 32)
3966         return FALSE;
3967
3968     if (bpp == 16)
3969     {
3970         stride = stride * (int) sizeof (uint32_t) / 2;
3971         byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
3972         byte_width = 2 * width;
3973         stride *= 2;
3974     }
3975     else
3976     {
3977         stride = stride * (int) sizeof (uint32_t) / 4;
3978         byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
3979         byte_width = 4 * width;
3980         stride *= 4;
3981     }
3982
3983     cache_prefetch ((__m128i*)byte_line);
3984     xmm_def = create_mask_2x32_128 (data, data);
3985
3986     while (height--)
3987     {
3988         int w;
3989         uint8_t *d = byte_line;
3990         byte_line += stride;
3991         w = byte_width;
3992
3993
3994         cache_prefetch_next ((__m128i*)d);
3995
3996         while (w >= 2 && ((unsigned long)d & 3))
3997         {
3998             *(uint16_t *)d = data;
3999             w -= 2;
4000             d += 2;
4001         }
4002
4003         while (w >= 4 && ((unsigned long)d & 15))
4004         {
4005             *(uint32_t *)d = data;
4006
4007             w -= 4;
4008             d += 4;
4009         }
4010
4011         cache_prefetch_next ((__m128i*)d);
4012
4013         while (w >= 128)
4014         {
4015             cache_prefetch (((__m128i*)d) + 12);
4016
4017             save_128_aligned ((__m128i*)(d),     xmm_def);
4018             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
4019             save_128_aligned ((__m128i*)(d + 32),  xmm_def);
4020             save_128_aligned ((__m128i*)(d + 48),  xmm_def);
4021             save_128_aligned ((__m128i*)(d + 64),  xmm_def);
4022             save_128_aligned ((__m128i*)(d + 80),  xmm_def);
4023             save_128_aligned ((__m128i*)(d + 96),  xmm_def);
4024             save_128_aligned ((__m128i*)(d + 112), xmm_def);
4025
4026             d += 128;
4027             w -= 128;
4028         }
4029
4030         if (w >= 64)
4031         {
4032             cache_prefetch (((__m128i*)d) + 8);
4033
4034             save_128_aligned ((__m128i*)(d),     xmm_def);
4035             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
4036             save_128_aligned ((__m128i*)(d + 32),  xmm_def);
4037             save_128_aligned ((__m128i*)(d + 48),  xmm_def);
4038
4039             d += 64;
4040             w -= 64;
4041         }
4042
4043         cache_prefetch_next ((__m128i*)d);
4044
4045         if (w >= 32)
4046         {
4047             save_128_aligned ((__m128i*)(d),     xmm_def);
4048             save_128_aligned ((__m128i*)(d + 16),  xmm_def);
4049
4050             d += 32;
4051             w -= 32;
4052         }
4053
4054         if (w >= 16)
4055         {
4056             save_128_aligned ((__m128i*)(d),     xmm_def);
4057
4058             d += 16;
4059             w -= 16;
4060         }
4061
4062         cache_prefetch_next ((__m128i*)d);
4063
4064         while (w >= 4)
4065         {
4066             *(uint32_t *)d = data;
4067
4068             w -= 4;
4069             d += 4;
4070         }
4071
4072         if (w >= 2)
4073         {
4074             *(uint16_t *)d = data;
4075             w -= 2;
4076             d += 2;
4077         }
4078     }
4079
4080     _mm_empty ();
4081     return TRUE;
4082 }
4083
4084 static void
4085 sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
4086                              pixman_op_t              op,
4087                              pixman_image_t *         src_image,
4088                              pixman_image_t *         mask_image,
4089                              pixman_image_t *         dst_image,
4090                              int32_t                  src_x,
4091                              int32_t                  src_y,
4092                              int32_t                  mask_x,
4093                              int32_t                  mask_y,
4094                              int32_t                  dest_x,
4095                              int32_t                  dest_y,
4096                              int32_t                  width,
4097                              int32_t                  height)
4098 {
4099     uint32_t src, srca;
4100     uint32_t    *dst_line, *dst;
4101     uint8_t     *mask_line, *mask;
4102     int dst_stride, mask_stride;
4103     uint16_t w;
4104     uint32_t m;
4105
4106     __m128i xmm_src, xmm_def;
4107     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4108
4109     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4110
4111     srca = src >> 24;
4112     if (src == 0)
4113     {
4114         pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride,
4115                           PIXMAN_FORMAT_BPP (dst_image->bits.format),
4116                           dest_x, dest_y, width, height, 0);
4117         return;
4118     }
4119
4120     PIXMAN_IMAGE_GET_LINE (
4121         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4122     PIXMAN_IMAGE_GET_LINE (
4123         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4124
4125     xmm_def = create_mask_2x32_128 (src, src);
4126     xmm_src = expand_pixel_32_1x128 (src);
4127
4128     while (height--)
4129     {
4130         dst = dst_line;
4131         dst_line += dst_stride;
4132         mask = mask_line;
4133         mask_line += mask_stride;
4134         w = width;
4135
4136         /* call prefetch hint to optimize cache load*/
4137         cache_prefetch ((__m128i*)mask);
4138         cache_prefetch ((__m128i*)dst);
4139
4140         while (w && (unsigned long)dst & 15)
4141         {
4142             uint8_t m = *mask++;
4143
4144             if (m)
4145             {
4146                 *dst = pack_1x64_32 (
4147                     pix_multiply_1x64 (
4148                         _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
4149             }
4150             else
4151             {
4152                 *dst = 0;
4153             }
4154
4155             w--;
4156             dst++;
4157         }
4158
4159         /* call prefetch hint to optimize cache load*/
4160         cache_prefetch ((__m128i*)mask);
4161         cache_prefetch ((__m128i*)dst);
4162
4163         while (w >= 4)
4164         {
4165             /* fill cache line with next memory */
4166             cache_prefetch_next ((__m128i*)mask);
4167             cache_prefetch_next ((__m128i*)dst);
4168
4169             m = *((uint32_t*)mask);
4170
4171             if (srca == 0xff && m == 0xffffffff)
4172             {
4173                 save_128_aligned ((__m128i*)dst, xmm_def);
4174             }
4175             else if (m)
4176             {
4177                 xmm_mask = unpack_32_1x128 (m);
4178                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4179
4180                 /* Unpacking */
4181                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4182
4183                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4184                                         &xmm_mask_lo, &xmm_mask_hi);
4185
4186                 pix_multiply_2x128 (&xmm_src, &xmm_src,
4187                                     &xmm_mask_lo, &xmm_mask_hi,
4188                                     &xmm_mask_lo, &xmm_mask_hi);
4189
4190                 save_128_aligned (
4191                     (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
4192             }
4193             else
4194             {
4195                 save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
4196             }
4197
4198             w -= 4;
4199             dst += 4;
4200             mask += 4;
4201         }
4202
4203         while (w)
4204         {
4205             uint8_t m = *mask++;
4206
4207             if (m)
4208             {
4209                 *dst = pack_1x64_32 (
4210                     pix_multiply_1x64 (
4211                         _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
4212             }
4213             else
4214             {
4215                 *dst = 0;
4216             }
4217
4218             w--;
4219             dst++;
4220         }
4221     }
4222
4223     _mm_empty ();
4224 }
4225
4226 /*-----------------------------------------------------------------------
4227  * composite_over_n_8_0565
4228  */
4229
4230 static void
4231 sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
4232                               pixman_op_t              op,
4233                               pixman_image_t *         src_image,
4234                               pixman_image_t *         mask_image,
4235                               pixman_image_t *         dst_image,
4236                               int32_t                  src_x,
4237                               int32_t                  src_y,
4238                               int32_t                  mask_x,
4239                               int32_t                  mask_y,
4240                               int32_t                  dest_x,
4241                               int32_t                  dest_y,
4242                               int32_t                  width,
4243                               int32_t                  height)
4244 {
4245     uint32_t src, srca;
4246     uint16_t    *dst_line, *dst, d;
4247     uint8_t     *mask_line, *mask;
4248     int dst_stride, mask_stride;
4249     uint16_t w;
4250     uint32_t m;
4251     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
4252
4253     __m128i xmm_src, xmm_alpha;
4254     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4255     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4256
4257     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4258
4259     srca = src >> 24;
4260     if (src == 0)
4261         return;
4262
4263     PIXMAN_IMAGE_GET_LINE (
4264         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4265     PIXMAN_IMAGE_GET_LINE (
4266         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4267
4268     xmm_src = expand_pixel_32_1x128 (src);
4269     xmm_alpha = expand_alpha_1x128 (xmm_src);
4270     mmx_src = _mm_movepi64_pi64 (xmm_src);
4271     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
4272
4273     while (height--)
4274     {
4275         dst = dst_line;
4276         dst_line += dst_stride;
4277         mask = mask_line;
4278         mask_line += mask_stride;
4279         w = width;
4280
4281         /* call prefetch hint to optimize cache load*/
4282         cache_prefetch ((__m128i*)mask);
4283         cache_prefetch ((__m128i*)dst);
4284
4285         while (w && (unsigned long)dst & 15)
4286         {
4287             m = *mask++;
4288
4289             if (m)
4290             {
4291                 d = *dst;
4292                 mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
4293                 mmx_dest = expand565_16_1x64 (d);
4294
4295                 *dst = pack_565_32_16 (
4296                     pack_1x64_32 (
4297                         in_over_1x64 (
4298                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4299             }
4300
4301             w--;
4302             dst++;
4303         }
4304
4305         /* call prefetch hint to optimize cache load*/
4306         cache_prefetch ((__m128i*)mask);
4307         cache_prefetch ((__m128i*)dst);
4308
4309         while (w >= 8)
4310         {
4311             /* fill cache line with next memory */
4312             cache_prefetch_next ((__m128i*)mask);
4313             cache_prefetch_next ((__m128i*)dst);
4314
4315             xmm_dst = load_128_aligned ((__m128i*) dst);
4316             unpack_565_128_4x128 (xmm_dst,
4317                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4318
4319             m = *((uint32_t*)mask);
4320             mask += 4;
4321
4322             if (m)
4323             {
4324                 xmm_mask = unpack_32_1x128 (m);
4325                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4326
4327                 /* Unpacking */
4328                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4329
4330                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4331                                         &xmm_mask_lo, &xmm_mask_hi);
4332
4333                 in_over_2x128 (&xmm_src, &xmm_src,
4334                                &xmm_alpha, &xmm_alpha,
4335                                &xmm_mask_lo, &xmm_mask_hi,
4336                                &xmm_dst0, &xmm_dst1);
4337             }
4338
4339             m = *((uint32_t*)mask);
4340             mask += 4;
4341
4342             if (m)
4343             {
4344                 xmm_mask = unpack_32_1x128 (m);
4345                 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
4346
4347                 /* Unpacking */
4348                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4349
4350                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4351                                         &xmm_mask_lo, &xmm_mask_hi);
4352                 in_over_2x128 (&xmm_src, &xmm_src,
4353                                &xmm_alpha, &xmm_alpha,
4354                                &xmm_mask_lo, &xmm_mask_hi,
4355                                &xmm_dst2, &xmm_dst3);
4356             }
4357
4358             save_128_aligned (
4359                 (__m128i*)dst, pack_565_4x128_128 (
4360                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4361
4362             w -= 8;
4363             dst += 8;
4364         }
4365
4366         while (w)
4367         {
4368             m = *mask++;
4369
4370             if (m)
4371             {
4372                 d = *dst;
4373                 mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
4374                 mmx_dest = expand565_16_1x64 (d);
4375
4376                 *dst = pack_565_32_16 (
4377                     pack_1x64_32 (
4378                         in_over_1x64 (
4379                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4380             }
4381
4382             w--;
4383             dst++;
4384         }
4385     }
4386
4387     _mm_empty ();
4388 }
4389
4390 /* -----------------------------------------------------------------------
4391  * composite_over_pixbuf_0565
4392  */
4393
4394 static void
4395 sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
4396                                  pixman_op_t              op,
4397                                  pixman_image_t *         src_image,
4398                                  pixman_image_t *         mask_image,
4399                                  pixman_image_t *         dst_image,
4400                                  int32_t                  src_x,
4401                                  int32_t                  src_y,
4402                                  int32_t                  mask_x,
4403                                  int32_t                  mask_y,
4404                                  int32_t                  dest_x,
4405                                  int32_t                  dest_y,
4406                                  int32_t                  width,
4407                                  int32_t                  height)
4408 {
4409     uint16_t    *dst_line, *dst, d;
4410     uint32_t    *src_line, *src, s;
4411     int dst_stride, src_stride;
4412     uint16_t w;
4413     uint32_t opaque, zero;
4414
4415     __m64 ms;
4416     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4417     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4418
4419     PIXMAN_IMAGE_GET_LINE (
4420         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4421     PIXMAN_IMAGE_GET_LINE (
4422         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4423
4424 #if 0
4425     /* FIXME
4426      *
4427      * I copy the code from MMX one and keep the fixme.
4428      * If it's a problem there, probably is a problem here.
4429      */
4430     assert (src_image->drawable == mask_image->drawable);
4431 #endif
4432
4433     while (height--)
4434     {
4435         dst = dst_line;
4436         dst_line += dst_stride;
4437         src = src_line;
4438         src_line += src_stride;
4439         w = width;
4440
4441         /* call prefetch hint to optimize cache load*/
4442         cache_prefetch ((__m128i*)src);
4443         cache_prefetch ((__m128i*)dst);
4444
4445         while (w && (unsigned long)dst & 15)
4446         {
4447             s = *src++;
4448             d = *dst;
4449
4450             ms = unpack_32_1x64 (s);
4451
4452             *dst++ = pack_565_32_16 (
4453                 pack_1x64_32 (
4454                     over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
4455             w--;
4456         }
4457
4458         /* call prefetch hint to optimize cache load*/
4459         cache_prefetch ((__m128i*)src);
4460         cache_prefetch ((__m128i*)dst);
4461
4462         while (w >= 8)
4463         {
4464             /* fill cache line with next memory */
4465             cache_prefetch_next ((__m128i*)src);
4466             cache_prefetch_next ((__m128i*)dst);
4467
4468             /* First round */
4469             xmm_src = load_128_unaligned ((__m128i*)src);
4470             xmm_dst = load_128_aligned  ((__m128i*)dst);
4471
4472             opaque = is_opaque (xmm_src);
4473             zero = is_zero (xmm_src);
4474
4475             unpack_565_128_4x128 (xmm_dst,
4476                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4477             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4478
4479             /* preload next round*/
4480             xmm_src = load_128_unaligned ((__m128i*)(src + 4));
4481
4482             if (opaque)
4483             {
4484                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4485                                      &xmm_dst0, &xmm_dst1);
4486             }
4487             else if (!zero)
4488             {
4489                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4490                                         &xmm_dst0, &xmm_dst1);
4491             }
4492
4493             /* Second round */
4494             opaque = is_opaque (xmm_src);
4495             zero = is_zero (xmm_src);
4496
4497             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4498
4499             if (opaque)
4500             {
4501                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4502                                      &xmm_dst2, &xmm_dst3);
4503             }
4504             else if (!zero)
4505             {
4506                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4507                                         &xmm_dst2, &xmm_dst3);
4508             }
4509
4510             save_128_aligned (
4511                 (__m128i*)dst, pack_565_4x128_128 (
4512                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4513
4514             w -= 8;
4515             src += 8;
4516             dst += 8;
4517         }
4518
4519         while (w)
4520         {
4521             s = *src++;
4522             d = *dst;
4523
4524             ms = unpack_32_1x64 (s);
4525
4526             *dst++ = pack_565_32_16 (
4527                 pack_1x64_32 (
4528                     over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
4529             w--;
4530         }
4531     }
4532
4533     _mm_empty ();
4534 }
4535
4536 /* -------------------------------------------------------------------------
4537  * composite_over_pixbuf_8888
4538  */
4539
4540 static void
4541 sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
4542                                  pixman_op_t              op,
4543                                  pixman_image_t *         src_image,
4544                                  pixman_image_t *         mask_image,
4545                                  pixman_image_t *         dst_image,
4546                                  int32_t                  src_x,
4547                                  int32_t                  src_y,
4548                                  int32_t                  mask_x,
4549                                  int32_t                  mask_y,
4550                                  int32_t                  dest_x,
4551                                  int32_t                  dest_y,
4552                                  int32_t                  width,
4553                                  int32_t                  height)
4554 {
4555     uint32_t    *dst_line, *dst, d;
4556     uint32_t    *src_line, *src, s;
4557     int dst_stride, src_stride;
4558     uint16_t w;
4559     uint32_t opaque, zero;
4560
4561     __m128i xmm_src_lo, xmm_src_hi;
4562     __m128i xmm_dst_lo, xmm_dst_hi;
4563
4564     PIXMAN_IMAGE_GET_LINE (
4565         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4566     PIXMAN_IMAGE_GET_LINE (
4567         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4568
4569 #if 0
4570     /* FIXME
4571      *
4572      * I copy the code from MMX one and keep the fixme.
4573      * If it's a problem there, probably is a problem here.
4574      */
4575     assert (src_image->drawable == mask_image->drawable);
4576 #endif
4577
4578     while (height--)
4579     {
4580         dst = dst_line;
4581         dst_line += dst_stride;
4582         src = src_line;
4583         src_line += src_stride;
4584         w = width;
4585
4586         /* call prefetch hint to optimize cache load*/
4587         cache_prefetch ((__m128i*)src);
4588         cache_prefetch ((__m128i*)dst);
4589
4590         while (w && (unsigned long)dst & 15)
4591         {
4592             s = *src++;
4593             d = *dst;
4594
4595             *dst++ = pack_1x64_32 (
4596                 over_rev_non_pre_1x64 (
4597                     unpack_32_1x64 (s), unpack_32_1x64 (d)));
4598
4599             w--;
4600         }
4601
4602         /* call prefetch hint to optimize cache load*/
4603         cache_prefetch ((__m128i*)src);
4604         cache_prefetch ((__m128i*)dst);
4605
4606         while (w >= 4)
4607         {
4608             /* fill cache line with next memory */
4609             cache_prefetch_next ((__m128i*)src);
4610             cache_prefetch_next ((__m128i*)dst);
4611
4612             xmm_src_hi = load_128_unaligned ((__m128i*)src);
4613
4614             opaque = is_opaque (xmm_src_hi);
4615             zero = is_zero (xmm_src_hi);
4616
4617             unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
4618
4619             if (opaque)
4620             {
4621                 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4622                                      &xmm_dst_lo, &xmm_dst_hi);
4623
4624                 save_128_aligned (
4625                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4626             }
4627             else if (!zero)
4628             {
4629                 xmm_dst_hi = load_128_aligned  ((__m128i*)dst);
4630
4631                 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
4632
4633                 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4634                                         &xmm_dst_lo, &xmm_dst_hi);
4635
4636                 save_128_aligned (
4637                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4638             }
4639
4640             w -= 4;
4641             dst += 4;
4642             src += 4;
4643         }
4644
4645         while (w)
4646         {
4647             s = *src++;
4648             d = *dst;
4649
4650             *dst++ = pack_1x64_32 (
4651                 over_rev_non_pre_1x64 (
4652                     unpack_32_1x64 (s), unpack_32_1x64 (d)));
4653
4654             w--;
4655         }
4656     }
4657
4658     _mm_empty ();
4659 }
4660
4661 /* -------------------------------------------------------------------------------------------------
4662  * composite_over_n_8888_0565_ca
4663  */
4664
4665 static void
4666 sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
4667                                     pixman_op_t              op,
4668                                     pixman_image_t *         src_image,
4669                                     pixman_image_t *         mask_image,
4670                                     pixman_image_t *         dst_image,
4671                                     int32_t                  src_x,
4672                                     int32_t                  src_y,
4673                                     int32_t                  mask_x,
4674                                     int32_t                  mask_y,
4675                                     int32_t                  dest_x,
4676                                     int32_t                  dest_y,
4677                                     int32_t                  width,
4678                                     int32_t                  height)
4679 {
4680     uint32_t src;
4681     uint16_t    *dst_line, *dst, d;
4682     uint32_t    *mask_line, *mask, m;
4683     int dst_stride, mask_stride;
4684     int w;
4685     uint32_t pack_cmp;
4686
4687     __m128i xmm_src, xmm_alpha;
4688     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4689     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4690
4691     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
4692
4693     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4694
4695     if (src == 0)
4696         return;
4697
4698     PIXMAN_IMAGE_GET_LINE (
4699         dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4700     PIXMAN_IMAGE_GET_LINE (
4701         mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
4702
4703     xmm_src = expand_pixel_32_1x128 (src);
4704     xmm_alpha = expand_alpha_1x128 (xmm_src);
4705     mmx_src = _mm_movepi64_pi64 (xmm_src);
4706     mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
4707
4708     while (height--)
4709     {
4710         w = width;
4711         mask = mask_line;
4712         dst = dst_line;
4713         mask_line += mask_stride;
4714         dst_line += dst_stride;
4715
4716         /* call prefetch hint to optimize cache load*/
4717         cache_prefetch ((__m128i*)mask);
4718         cache_prefetch ((__m128i*)dst);
4719
4720         while (w && ((unsigned long)dst & 15))
4721         {
4722             m = *(uint32_t *) mask;
4723
4724             if (m)
4725             {
4726                 d = *dst;
4727                 mmx_mask = unpack_32_1x64 (m);
4728                 mmx_dest = expand565_16_1x64 (d);
4729
4730                 *dst = pack_565_32_16 (
4731                     pack_1x64_32 (
4732                         in_over_1x64 (
4733                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4734             }
4735
4736             w--;
4737             dst++;
4738             mask++;
4739         }
4740
4741         /* call prefetch hint to optimize cache load*/
4742         cache_prefetch ((__m128i*)mask);
4743         cache_prefetch ((__m128i*)dst);
4744
4745         while (w >= 8)
4746         {
4747             /* fill cache line with next memory */
4748             cache_prefetch_next ((__m128i*)mask);
4749             cache_prefetch_next ((__m128i*)dst);
4750
4751             /* First round */
4752             xmm_mask = load_128_unaligned ((__m128i*)mask);
4753             xmm_dst = load_128_aligned ((__m128i*)dst);
4754
4755             pack_cmp = _mm_movemask_epi8 (
4756                 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4757
4758             unpack_565_128_4x128 (xmm_dst,
4759                                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4760             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4761
4762             /* preload next round */
4763             xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
4764
4765             /* preload next round */
4766             if (pack_cmp != 0xffff)
4767             {
4768                 in_over_2x128 (&xmm_src, &xmm_src,
4769                                &xmm_alpha, &xmm_alpha,
4770                                &xmm_mask_lo, &xmm_mask_hi,
4771                                &xmm_dst0, &xmm_dst1);
4772             }
4773
4774             /* Second round */
4775             pack_cmp = _mm_movemask_epi8 (
4776                 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4777
4778             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4779
4780             if (pack_cmp != 0xffff)
4781             {
4782                 in_over_2x128 (&xmm_src, &xmm_src,
4783                                &xmm_alpha, &xmm_alpha,
4784                                &xmm_mask_lo, &xmm_mask_hi,
4785                                &xmm_dst2, &xmm_dst3);
4786             }
4787
4788             save_128_aligned (
4789                 (__m128i*)dst, pack_565_4x128_128 (
4790                     &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4791
4792             w -= 8;
4793             dst += 8;
4794             mask += 8;
4795         }
4796
4797         while (w)
4798         {
4799             m = *(uint32_t *) mask;
4800
4801             if (m)
4802             {
4803                 d = *dst;
4804                 mmx_mask = unpack_32_1x64 (m);
4805                 mmx_dest = expand565_16_1x64 (d);
4806
4807                 *dst = pack_565_32_16 (
4808                     pack_1x64_32 (
4809                         in_over_1x64 (
4810                             &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4811             }
4812
4813             w--;
4814             dst++;
4815             mask++;
4816         }
4817     }
4818
4819     _mm_empty ();
4820 }
4821
4822 /* -----------------------------------------------------------------------
4823  * composite_in_n_8_8
4824  */
4825
4826 static void
4827 sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
4828                          pixman_op_t              op,
4829                          pixman_image_t *         src_image,
4830                          pixman_image_t *         mask_image,
4831                          pixman_image_t *         dst_image,
4832                          int32_t                  src_x,
4833                          int32_t                  src_y,
4834                          int32_t                  mask_x,
4835                          int32_t                  mask_y,
4836                          int32_t                  dest_x,
4837                          int32_t                  dest_y,
4838                          int32_t                  width,
4839                          int32_t                  height)
4840 {
4841     uint8_t     *dst_line, *dst;
4842     uint8_t     *mask_line, *mask;
4843     int dst_stride, mask_stride;
4844     uint16_t w, d, m;
4845     uint32_t src;
4846     uint8_t sa;
4847
4848     __m128i xmm_alpha;
4849     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4850     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4851
4852     PIXMAN_IMAGE_GET_LINE (
4853         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4854     PIXMAN_IMAGE_GET_LINE (
4855         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4856
4857     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
4858
4859     sa = src >> 24;
4860     if (sa == 0)
4861         return;
4862
4863     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4864
4865     while (height--)
4866     {
4867         dst = dst_line;
4868         dst_line += dst_stride;
4869         mask = mask_line;
4870         mask_line += mask_stride;
4871         w = width;
4872
4873         /* call prefetch hint to optimize cache load*/
4874         cache_prefetch ((__m128i*)mask);
4875         cache_prefetch ((__m128i*)dst);
4876
4877         while (w && ((unsigned long)dst & 15))
4878         {
4879             m = (uint32_t) *mask++;
4880             d = (uint32_t) *dst;
4881
4882             *dst++ = (uint8_t) pack_1x64_32 (
4883                 pix_multiply_1x64 (
4884                     pix_multiply_1x64 (_mm_movepi64_pi64 (xmm_alpha),
4885                                        unpack_32_1x64 (m)),
4886                     unpack_32_1x64 (d)));
4887             w--;
4888         }
4889
4890         /* call prefetch hint to optimize cache load*/
4891         cache_prefetch ((__m128i*)mask);
4892         cache_prefetch ((__m128i*)dst);
4893
4894         while (w >= 16)
4895         {
4896             /* fill cache line with next memory */
4897             cache_prefetch_next ((__m128i*)mask);
4898             cache_prefetch_next ((__m128i*)dst);
4899
4900             xmm_mask = load_128_unaligned ((__m128i*)mask);
4901             xmm_dst = load_128_aligned ((__m128i*)dst);
4902
4903             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4904             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4905
4906             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4907                                 &xmm_mask_lo, &xmm_mask_hi,
4908                                 &xmm_mask_lo, &xmm_mask_hi);
4909
4910             pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
4911                                 &xmm_dst_lo, &xmm_dst_hi,
4912                                 &xmm_dst_lo, &xmm_dst_hi);
4913
4914             save_128_aligned (
4915                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4916
4917             mask += 16;
4918             dst += 16;
4919             w -= 16;
4920         }
4921
4922         while (w)
4923         {
4924             m = (uint32_t) *mask++;
4925             d = (uint32_t) *dst;
4926
4927             *dst++ = (uint8_t) pack_1x64_32 (
4928                 pix_multiply_1x64 (
4929                     pix_multiply_1x64 (
4930                         _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
4931                     unpack_32_1x64 (d)));
4932             w--;
4933         }
4934     }
4935
4936     _mm_empty ();
4937 }
4938
4939 /* ---------------------------------------------------------------------------
4940  * composite_in_8_8
4941  */
4942
4943 static void
4944 sse2_composite_in_8_8 (pixman_implementation_t *imp,
4945                        pixman_op_t              op,
4946                        pixman_image_t *         src_image,
4947                        pixman_image_t *         mask_image,
4948                        pixman_image_t *         dst_image,
4949                        int32_t                  src_x,
4950                        int32_t                  src_y,
4951                        int32_t                  mask_x,
4952                        int32_t                  mask_y,
4953                        int32_t                  dest_x,
4954                        int32_t                  dest_y,
4955                        int32_t                  width,
4956                        int32_t                  height)
4957 {
4958     uint8_t     *dst_line, *dst;
4959     uint8_t     *src_line, *src;
4960     int src_stride, dst_stride;
4961     uint16_t w;
4962     uint32_t s, d;
4963
4964     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4965     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4966
4967     PIXMAN_IMAGE_GET_LINE (
4968         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4969     PIXMAN_IMAGE_GET_LINE (
4970         src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4971
4972     while (height--)
4973     {
4974         dst = dst_line;
4975         dst_line += dst_stride;
4976         src = src_line;
4977         src_line += src_stride;
4978         w = width;
4979
4980         /* call prefetch hint to optimize cache load*/
4981         cache_prefetch ((__m128i*)src);
4982         cache_prefetch ((__m128i*)dst);
4983
4984         while (w && ((unsigned long)dst & 15))
4985         {
4986             s = (uint32_t) *src++;
4987             d = (uint32_t) *dst;
4988
4989             *dst++ = (uint8_t) pack_1x64_32 (
4990                 pix_multiply_1x64 (
4991                     unpack_32_1x64 (s), unpack_32_1x64 (d)));
4992             w--;
4993         }
4994
4995         /* call prefetch hint to optimize cache load*/
4996         cache_prefetch ((__m128i*)src);
4997         cache_prefetch ((__m128i*)dst);
4998
4999         while (w >= 16)
5000         {
5001             /* fill cache line with next memory */
5002             cache_prefetch_next ((__m128i*)src);
5003             cache_prefetch_next ((__m128i*)dst);
5004
5005             xmm_src = load_128_unaligned ((__m128i*)src);
5006             xmm_dst = load_128_aligned ((__m128i*)dst);
5007
5008             unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5009             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5010
5011             pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
5012                                 &xmm_dst_lo, &xmm_dst_hi,
5013                                 &xmm_dst_lo, &xmm_dst_hi);
5014
5015             save_128_aligned (
5016                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5017
5018             src += 16;
5019             dst += 16;
5020             w -= 16;
5021         }
5022
5023         while (w)
5024         {
5025             s = (uint32_t) *src++;
5026             d = (uint32_t) *dst;
5027
5028             *dst++ = (uint8_t) pack_1x64_32 (
5029                 pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
5030             w--;
5031         }
5032     }
5033
5034     _mm_empty ();
5035 }
5036
5037 /* -------------------------------------------------------------------------
5038  * composite_add_8888_8_8
5039  */
5040
5041 static void
5042 sse2_composite_add_8888_8_8 (pixman_implementation_t *imp,
5043                              pixman_op_t              op,
5044                              pixman_image_t *         src_image,
5045                              pixman_image_t *         mask_image,
5046                              pixman_image_t *         dst_image,
5047                              int32_t                  src_x,
5048                              int32_t                  src_y,
5049                              int32_t                  mask_x,
5050                              int32_t                  mask_y,
5051                              int32_t                  dest_x,
5052                              int32_t                  dest_y,
5053                              int32_t                  width,
5054                              int32_t                  height)
5055 {
5056     uint8_t     *dst_line, *dst;
5057     uint8_t     *mask_line, *mask;
5058     int dst_stride, mask_stride;
5059     uint16_t w;
5060     uint32_t src;
5061     uint8_t sa;
5062     uint32_t m, d;
5063
5064     __m128i xmm_alpha;
5065     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5066     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5067
5068     PIXMAN_IMAGE_GET_LINE (
5069         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
5070     PIXMAN_IMAGE_GET_LINE (
5071         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
5072
5073     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
5074
5075     sa = src >> 24;
5076     if (sa == 0)
5077         return;
5078
5079     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
5080
5081     while (height--)
5082     {
5083         dst = dst_line;
5084         dst_line += dst_stride;
5085         mask = mask_line;
5086         mask_line += mask_stride;
5087         w = width;
5088
5089         /* call prefetch hint to optimize cache load*/
5090         cache_prefetch ((__m128i*)mask);
5091         cache_prefetch ((__m128i*)dst);
5092
5093         while (w && ((unsigned long)dst & 15))
5094         {
5095             m = (uint32_t) *mask++;
5096             d = (uint32_t) *dst;
5097
5098             *dst++ = (uint8_t) pack_1x64_32 (
5099                 _mm_adds_pu16 (
5100                     pix_multiply_1x64 (
5101                         _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
5102                     unpack_32_1x64 (d)));
5103             w--;
5104         }
5105
5106         /* call prefetch hint to optimize cache load*/
5107         cache_prefetch ((__m128i*)mask);
5108         cache_prefetch ((__m128i*)dst);
5109
5110         while (w >= 16)
5111         {
5112             /* fill cache line with next memory */
5113             cache_prefetch_next ((__m128i*)mask);
5114             cache_prefetch_next ((__m128i*)dst);
5115
5116             xmm_mask = load_128_unaligned ((__m128i*)mask);
5117             xmm_dst = load_128_aligned ((__m128i*)dst);
5118
5119             unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5120             unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5121
5122             pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
5123                                 &xmm_mask_lo, &xmm_mask_hi,
5124                                 &xmm_mask_lo, &xmm_mask_hi);
5125
5126             xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
5127             xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
5128
5129             save_128_aligned (
5130                 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5131
5132             mask += 16;
5133             dst += 16;
5134             w -= 16;
5135         }
5136
5137         while (w)
5138         {
5139             m = (uint32_t) *mask++;
5140             d = (uint32_t) *dst;
5141
5142             *dst++ = (uint8_t) pack_1x64_32 (
5143                 _mm_adds_pu16 (
5144                     pix_multiply_1x64 (
5145                         _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
5146                     unpack_32_1x64 (d)));
5147
5148             w--;
5149         }
5150     }
5151
5152     _mm_empty ();
5153 }
5154
5155 /* ----------------------------------------------------------------------
5156  * composite_add_8000_8000
5157  */
5158
5159 static void
5160 sse2_composite_add_8000_8000 (pixman_implementation_t *imp,
5161                               pixman_op_t              op,
5162                               pixman_image_t *         src_image,
5163                               pixman_image_t *         mask_image,
5164                               pixman_image_t *         dst_image,
5165                               int32_t                  src_x,
5166                               int32_t                  src_y,
5167                               int32_t                  mask_x,
5168                               int32_t                  mask_y,
5169                               int32_t                  dest_x,
5170                               int32_t                  dest_y,
5171                               int32_t                  width,
5172                               int32_t                  height)
5173 {
5174     uint8_t     *dst_line, *dst;
5175     uint8_t     *src_line, *src;
5176     int dst_stride, src_stride;
5177     uint16_t w;
5178     uint16_t t;
5179
5180     PIXMAN_IMAGE_GET_LINE (
5181         src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
5182     PIXMAN_IMAGE_GET_LINE (
5183         dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
5184
5185     while (height--)
5186     {
5187         dst = dst_line;
5188         src = src_line;
5189
5190         /* call prefetch hint to optimize cache load*/
5191         cache_prefetch ((__m128i*)src);
5192         cache_prefetch ((__m128i*)dst);
5193
5194         dst_line += dst_stride;
5195         src_line += src_stride;
5196         w = width;
5197
5198         /* Small head */
5199         while (w && (unsigned long)dst & 3)
5200         {
5201             t = (*dst) + (*src++);
5202             *dst++ = t | (0 - (t >> 8));
5203             w--;
5204         }
5205
5206         core_combine_add_u_sse2 ((uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
5207
5208         /* Small tail */
5209         dst += w & 0xfffc;
5210         src += w & 0xfffc;
5211
5212         w &= 3;
5213
5214         while (w)
5215         {
5216             t = (*dst) + (*src++);
5217             *dst++ = t | (0 - (t >> 8));
5218             w--;
5219         }
5220     }
5221
5222     _mm_empty ();
5223 }
5224
5225 /* ---------------------------------------------------------------------
5226  * composite_add_8888_8888
5227  */
5228 static void
5229 sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
5230                               pixman_op_t              op,
5231                               pixman_image_t *         src_image,
5232                               pixman_image_t *         mask_image,
5233                               pixman_image_t *         dst_image,
5234                               int32_t                  src_x,
5235                               int32_t                  src_y,
5236                               int32_t                  mask_x,
5237                               int32_t                  mask_y,
5238                               int32_t                  dest_x,
5239                               int32_t                  dest_y,
5240                               int32_t                  width,
5241                               int32_t                  height)
5242 {
5243     uint32_t    *dst_line, *dst;
5244     uint32_t    *src_line, *src;
5245     int dst_stride, src_stride;
5246
5247     PIXMAN_IMAGE_GET_LINE (
5248         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5249     PIXMAN_IMAGE_GET_LINE (
5250         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5251
5252     while (height--)
5253     {
5254         dst = dst_line;
5255         dst_line += dst_stride;
5256         src = src_line;
5257         src_line += src_stride;
5258
5259         core_combine_add_u_sse2 (dst, src, NULL, width);
5260     }
5261
5262     _mm_empty ();
5263 }
5264
5265 /* -------------------------------------------------------------------------------------------------
5266  * sse2_composite_copy_area
5267  */
5268
5269 static pixman_bool_t
5270 pixman_blt_sse2 (uint32_t *src_bits,
5271                  uint32_t *dst_bits,
5272                  int       src_stride,
5273                  int       dst_stride,
5274                  int       src_bpp,
5275                  int       dst_bpp,
5276                  int       src_x,
5277                  int       src_y,
5278                  int       dst_x,
5279                  int       dst_y,
5280                  int       width,
5281                  int       height)
5282 {
5283     uint8_t *   src_bytes;
5284     uint8_t *   dst_bytes;
5285     int byte_width;
5286
5287     if (src_bpp != dst_bpp)
5288         return FALSE;
5289
5290     if (src_bpp == 16)
5291     {
5292         src_stride = src_stride * (int) sizeof (uint32_t) / 2;
5293         dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
5294         src_bytes =(uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
5295         dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
5296         byte_width = 2 * width;
5297         src_stride *= 2;
5298         dst_stride *= 2;
5299     }
5300     else if (src_bpp == 32)
5301     {
5302         src_stride = src_stride * (int) sizeof (uint32_t) / 4;
5303         dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
5304         src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
5305         dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
5306         byte_width = 4 * width;
5307         src_stride *= 4;
5308         dst_stride *= 4;
5309     }
5310     else
5311     {
5312         return FALSE;
5313     }
5314
5315     cache_prefetch ((__m128i*)src_bytes);
5316     cache_prefetch ((__m128i*)dst_bytes);
5317
5318     while (height--)
5319     {
5320         int w;
5321         uint8_t *s = src_bytes;
5322         uint8_t *d = dst_bytes;
5323         src_bytes += src_stride;
5324         dst_bytes += dst_stride;
5325         w = byte_width;
5326
5327         cache_prefetch_next ((__m128i*)s);
5328         cache_prefetch_next ((__m128i*)d);
5329
5330         while (w >= 2 && ((unsigned long)d & 3))
5331         {
5332             *(uint16_t *)d = *(uint16_t *)s;
5333             w -= 2;
5334             s += 2;
5335             d += 2;
5336         }
5337
5338         while (w >= 4 && ((unsigned long)d & 15))
5339         {
5340             *(uint32_t *)d = *(uint32_t *)s;
5341
5342             w -= 4;
5343             s += 4;
5344             d += 4;
5345         }
5346
5347         cache_prefetch_next ((__m128i*)s);
5348         cache_prefetch_next ((__m128i*)d);
5349
5350         while (w >= 64)
5351         {
5352             __m128i xmm0, xmm1, xmm2, xmm3;
5353
5354             /* 128 bytes ahead */
5355             cache_prefetch (((__m128i*)s) + 8);
5356             cache_prefetch (((__m128i*)d) + 8);
5357
5358             xmm0 = load_128_unaligned ((__m128i*)(s));
5359             xmm1 = load_128_unaligned ((__m128i*)(s + 16));
5360             xmm2 = load_128_unaligned ((__m128i*)(s + 32));
5361             xmm3 = load_128_unaligned ((__m128i*)(s + 48));
5362
5363             save_128_aligned ((__m128i*)(d),    xmm0);
5364             save_128_aligned ((__m128i*)(d + 16), xmm1);
5365             save_128_aligned ((__m128i*)(d + 32), xmm2);
5366             save_128_aligned ((__m128i*)(d + 48), xmm3);
5367
5368             s += 64;
5369             d += 64;
5370             w -= 64;
5371         }
5372
5373         cache_prefetch_next ((__m128i*)s);
5374         cache_prefetch_next ((__m128i*)d);
5375
5376         while (w >= 16)
5377         {
5378             save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
5379
5380             w -= 16;
5381             d += 16;
5382             s += 16;
5383         }
5384
5385         cache_prefetch_next ((__m128i*)s);
5386         cache_prefetch_next ((__m128i*)d);
5387
5388         while (w >= 4)
5389         {
5390             *(uint32_t *)d = *(uint32_t *)s;
5391
5392             w -= 4;
5393             s += 4;
5394             d += 4;
5395         }
5396
5397         if (w >= 2)
5398         {
5399             *(uint16_t *)d = *(uint16_t *)s;
5400             w -= 2;
5401             s += 2;
5402             d += 2;
5403         }
5404     }
5405
5406     _mm_empty ();
5407
5408     return TRUE;
5409 }
5410
5411 static void
5412 sse2_composite_copy_area (pixman_implementation_t *imp,
5413                           pixman_op_t              op,
5414                           pixman_image_t *         src_image,
5415                           pixman_image_t *         mask_image,
5416                           pixman_image_t *         dst_image,
5417                           int32_t                  src_x,
5418                           int32_t                  src_y,
5419                           int32_t                  mask_x,
5420                           int32_t                  mask_y,
5421                           int32_t                  dest_x,
5422                           int32_t                  dest_y,
5423                           int32_t                  width,
5424                           int32_t                  height)
5425 {
5426     pixman_blt_sse2 (src_image->bits.bits,
5427                      dst_image->bits.bits,
5428                      src_image->bits.rowstride,
5429                      dst_image->bits.rowstride,
5430                      PIXMAN_FORMAT_BPP (src_image->bits.format),
5431                      PIXMAN_FORMAT_BPP (dst_image->bits.format),
5432                      src_x, src_y, dest_x, dest_y, width, height);
5433 }
5434
5435 #if 0
5436 /* This code are buggy in MMX version, now the bug was translated to SSE2 version */
5437 void
5438 sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
5439                                  pixman_op_t              op,
5440                                  pixman_image_t *         src_image,
5441                                  pixman_image_t *         mask_image,
5442                                  pixman_image_t *         dst_image,
5443                                  int32_t                  src_x,
5444                                  int32_t                  src_y,
5445                                  int32_t                  mask_x,
5446                                  int32_t                  mask_y,
5447                                  int32_t                  dest_x,
5448                                  int32_t                  dest_y,
5449                                  int32_t                  width,
5450                                  int32_t                  height)
5451 {
5452     uint32_t    *src, *src_line, s;
5453     uint32_t    *dst, *dst_line, d;
5454     uint8_t         *mask, *mask_line;
5455     uint32_t m;
5456     int src_stride, mask_stride, dst_stride;
5457     uint16_t w;
5458
5459     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
5460     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5461     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5462
5463     PIXMAN_IMAGE_GET_LINE (
5464         dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5465     PIXMAN_IMAGE_GET_LINE (
5466         mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
5467     PIXMAN_IMAGE_GET_LINE (
5468         src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5469
5470     while (height--)
5471     {
5472         src = src_line;
5473         src_line += src_stride;
5474         dst = dst_line;
5475         dst_line += dst_stride;
5476         mask = mask_line;
5477         mask_line += mask_stride;
5478
5479         w = width;
5480
5481         /* call prefetch hint to optimize cache load*/
5482         cache_prefetch ((__m128i*)src);
5483         cache_prefetch ((__m128i*)dst);
5484         cache_prefetch ((__m128i*)mask);
5485
5486         while (w && (unsigned long)dst & 15)
5487         {
5488             s = 0xff000000 | *src++;
5489             m = (uint32_t) *mask++;
5490             d = *dst;
5491
5492             __m64 ms = unpack_32_1x64 (s);
5493
5494             if (m != 0xff)
5495             {
5496                 ms = in_over_1x64 (ms,
5497                                    mask_x00ff,
5498                                    expand_alpha_rev_1x64 (unpack_32_1x64 (m)),
5499                                    unpack_32_1x64 (d));
5500             }
5501
5502             *dst++ = pack_1x64_32 (ms);
5503             w--;
5504         }
5505
5506         /* call prefetch hint to optimize cache load*/
5507         cache_prefetch ((__m128i*)src);
5508         cache_prefetch ((__m128i*)dst);
5509         cache_prefetch ((__m128i*)mask);
5510
5511         while (w >= 4)
5512         {
5513             /* fill cache line with next memory */
5514             cache_prefetch_next ((__m128i*)src);
5515             cache_prefetch_next ((__m128i*)dst);
5516             cache_prefetch_next ((__m128i*)mask);
5517
5518             m = *(uint32_t*) mask;
5519             xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);
5520
5521             if (m == 0xffffffff)
5522             {
5523                 save_128_aligned ((__m128i*)dst, xmm_src);
5524             }
5525             else
5526             {
5527                 xmm_dst = load_128_aligned ((__m128i*)dst);
5528
5529                 xmm_mask = _mm_unpacklo_epi16 (
5530                     unpack_32_1x128 (m), _mm_setzero_si128 ());
5531
5532                 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5533                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5534                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5535
5536                 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
5537                                         &xmm_mask_lo, &xmm_mask_hi);
5538
5539                 in_over_2x128 (xmm_src_lo, xmm_src_hi,
5540                                mask_00ff, mask_00ff,
5541                                xmm_mask_lo, xmm_mask_hi,
5542                                &xmm_dst_lo, &xmm_dst_hi);
5543
5544                 save_128_aligned (
5545                     (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5546             }
5547
5548             src += 4;
5549             dst += 4;
5550             mask += 4;
5551             w -= 4;
5552         }
5553
5554         while (w)
5555         {
5556             m = (uint32_t) *mask++;
5557
5558             if (m)
5559             {
5560                 s = 0xff000000 | *src;
5561
5562                 if (m == 0xff)
5563                 {
5564                     *dst = s;
5565                 }
5566                 else
5567                 {
5568                     d = *dst;
5569
5570                     *dst = pack_1x64_32 (
5571                         in_over_1x64 (
5572                             unpack_32_1x64 (s),
5573                             mask_x00ff,
5574                             expand_alpha_rev_1x64 (unpack_32_1x64 (m)),
5575                             unpack_32_1x64 (d)));
5576                 }
5577
5578             }
5579
5580             src++;
5581             dst++;
5582             w--;
5583         }
5584     }
5585
5586     _mm_empty ();
5587 }
5588
5589 #endif
5590
5591 static const pixman_fast_path_t sse2_fast_paths[] =
5592 {
5593     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_r5g6b5,   sse2_composite_over_n_8_0565,       0 },
5594     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_b5g6r5,   sse2_composite_over_n_8_0565,       0 },
5595     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_null,     PIXMAN_a8r8g8b8, sse2_composite_over_n_8888,         0 },
5596     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_null,     PIXMAN_x8r8g8b8, sse2_composite_over_n_8888,         0 },
5597     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_null,     PIXMAN_r5g6b5,   sse2_composite_over_n_0565,         0 },
5598     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_a8r8g8b8, sse2_composite_over_8888_8888,      0 },
5599     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_x8r8g8b8, sse2_composite_over_8888_8888,      0 },
5600     { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_a8b8g8r8, sse2_composite_over_8888_8888,      0 },
5601     { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_x8b8g8r8, sse2_composite_over_8888_8888,      0 },
5602     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_r5g6b5,   sse2_composite_over_8888_0565,      0 },
5603     { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_b5g6r5,   sse2_composite_over_8888_0565,      0 },
5604     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8r8g8b8, sse2_composite_over_n_8_8888,       0 },
5605     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8r8g8b8, sse2_composite_over_n_8_8888,       0 },
5606     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8b8g8r8, sse2_composite_over_n_8_8888,       0 },
5607     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8b8g8r8, sse2_composite_over_n_8_8888,       0 },
5608 #if 0
5609     /* FIXME: This code are buggy in MMX version, now the bug was translated to SSE2 version */
5610     { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8,       PIXMAN_x8r8g8b8, sse2_composite_over_x888_8_8888,    0 },
5611     { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8,       PIXMAN_a8r8g8b8, sse2_composite_over_x888_8_8888,    0 },
5612     { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8,       PIXMAN_x8b8g8r8, sse2_composite_over_x888_8_8888,    0 },
5613     { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8,       PIXMAN_a8r8g8b8, sse2_composite_over_x888_8_8888,    0 },
5614 #endif
5615     { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8,       PIXMAN_a8r8g8b8, sse2_composite_over_x888_n_8888,    NEED_SOLID_MASK },
5616     { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8,       PIXMAN_x8r8g8b8, sse2_composite_over_x888_n_8888,    NEED_SOLID_MASK },
5617     { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8,       PIXMAN_a8b8g8r8, sse2_composite_over_x888_n_8888,    NEED_SOLID_MASK },
5618     { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8,       PIXMAN_x8b8g8r8, sse2_composite_over_x888_n_8888,    NEED_SOLID_MASK },
5619     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,       PIXMAN_a8r8g8b8, sse2_composite_over_8888_n_8888,    NEED_SOLID_MASK },
5620     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,       PIXMAN_x8r8g8b8, sse2_composite_over_8888_n_8888,    NEED_SOLID_MASK },
5621     { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8,       PIXMAN_a8b8g8r8, sse2_composite_over_8888_n_8888,    NEED_SOLID_MASK },
5622     { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8,       PIXMAN_x8b8g8r8, sse2_composite_over_8888_n_8888,    NEED_SOLID_MASK },
5623     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
5624     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
5625     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
5626     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
5627     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8r8g8b8, PIXMAN_r5g6b5,   sse2_composite_over_n_8888_0565_ca, NEED_COMPONENT_ALPHA },
5628     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8b8g8r8, PIXMAN_b5g6r5,   sse2_composite_over_n_8888_0565_ca, NEED_COMPONENT_ALPHA },
5629     { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
5630     { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_a8r8g8b8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
5631     { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
5632     { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_x8r8g8b8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
5633     { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_a8b8g8r8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
5634     { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
5635     { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_x8b8g8r8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
5636     { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
5637     { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_r5g6b5,   sse2_composite_over_pixbuf_0565,    NEED_PIXBUF },
5638     { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_r5g6b5,   sse2_composite_over_pixbuf_0565,    NEED_PIXBUF },
5639     { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_b5g6r5,   sse2_composite_over_pixbuf_0565,    NEED_PIXBUF },
5640     { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_b5g6r5,   sse2_composite_over_pixbuf_0565,    NEED_PIXBUF },
5641     { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_null,     PIXMAN_x8r8g8b8, sse2_composite_copy_area,           0 },
5642     { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_null,     PIXMAN_x8b8g8r8, sse2_composite_copy_area,           0 },
5643
5644     { PIXMAN_OP_ADD,  PIXMAN_solid,    PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_add_n_8888_8888_ca,  NEED_COMPONENT_ALPHA },
5645     { PIXMAN_OP_ADD,  PIXMAN_a8,       PIXMAN_null,     PIXMAN_a8,       sse2_composite_add_8000_8000,       0 },
5646     { PIXMAN_OP_ADD,  PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_a8r8g8b8, sse2_composite_add_8888_8888,       0 },
5647     { PIXMAN_OP_ADD,  PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_a8b8g8r8, sse2_composite_add_8888_8888,       0 },
5648     { PIXMAN_OP_ADD,  PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8,       sse2_composite_add_8888_8_8,        0 },
5649
5650     { PIXMAN_OP_SRC, PIXMAN_solid,     PIXMAN_a8,       PIXMAN_a8r8g8b8, sse2_composite_src_n_8_8888,        0 },
5651     { PIXMAN_OP_SRC, PIXMAN_solid,     PIXMAN_a8,       PIXMAN_x8r8g8b8, sse2_composite_src_n_8_8888,        0 },
5652     { PIXMAN_OP_SRC, PIXMAN_solid,     PIXMAN_a8,       PIXMAN_a8b8g8r8, sse2_composite_src_n_8_8888,        0 },
5653     { PIXMAN_OP_SRC, PIXMAN_solid,     PIXMAN_a8,       PIXMAN_x8b8g8r8, sse2_composite_src_n_8_8888,        0 },
5654     { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8,  PIXMAN_null,     PIXMAN_a8r8g8b8, sse2_composite_copy_area,           0 },
5655     { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8,  PIXMAN_null,     PIXMAN_a8b8g8r8, sse2_composite_copy_area,           0 },
5656     { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8,  PIXMAN_null,     PIXMAN_x8r8g8b8, sse2_composite_copy_area,           0 },
5657     { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8,  PIXMAN_null,     PIXMAN_x8b8g8r8, sse2_composite_copy_area,           0 },
5658     { PIXMAN_OP_SRC, PIXMAN_x8r8g8b8,  PIXMAN_null,     PIXMAN_x8r8g8b8, sse2_composite_copy_area,           0 },
5659     { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8,  PIXMAN_null,     PIXMAN_x8b8g8r8, sse2_composite_copy_area,           0 },
5660     { PIXMAN_OP_SRC, PIXMAN_r5g6b5,    PIXMAN_null,     PIXMAN_r5g6b5,   sse2_composite_copy_area,           0 },
5661     { PIXMAN_OP_SRC, PIXMAN_b5g6r5,    PIXMAN_null,     PIXMAN_b5g6r5,   sse2_composite_copy_area,           0 },
5662
5663     { PIXMAN_OP_IN,  PIXMAN_a8,        PIXMAN_null,     PIXMAN_a8,       sse2_composite_in_8_8,              0 },
5664     { PIXMAN_OP_IN,  PIXMAN_solid,     PIXMAN_a8,       PIXMAN_a8,       sse2_composite_in_n_8_8,            0 },
5665
5666     { PIXMAN_OP_NONE },
5667 };
5668
5669 /*
5670  * Work around GCC bug causing crashes in Mozilla with SSE2
5671  *
5672  * When using -msse, gcc generates movdqa instructions assuming that
5673  * the stack is 16 byte aligned. Unfortunately some applications, such
5674  * as Mozilla and Mono, end up aligning the stack to 4 bytes, which
5675  * causes the movdqa instructions to fail.
5676  *
5677  * The __force_align_arg_pointer__ makes gcc generate a prologue that
5678  * realigns the stack pointer to 16 bytes.
5679  *
5680  * On x86-64 this is not necessary because the standard ABI already
5681  * calls for a 16 byte aligned stack.
5682  *
5683  * See https://bugs.freedesktop.org/show_bug.cgi?id=15693
5684  */
5685 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5686 __attribute__((__force_align_arg_pointer__))
5687 #endif
5688 static void
5689 sse2_composite (pixman_implementation_t *imp,
5690                 pixman_op_t              op,
5691                 pixman_image_t *         src,
5692                 pixman_image_t *         mask,
5693                 pixman_image_t *         dest,
5694                 int32_t                  src_x,
5695                 int32_t                  src_y,
5696                 int32_t                  mask_x,
5697                 int32_t                  mask_y,
5698                 int32_t                  dest_x,
5699                 int32_t                  dest_y,
5700                 int32_t                  width,
5701                 int32_t                  height)
5702 {
5703     if (_pixman_run_fast_path (sse2_fast_paths, imp,
5704                                op, src, mask, dest,
5705                                src_x, src_y,
5706                                mask_x, mask_y,
5707                                dest_x, dest_y,
5708                                width, height))
5709     {
5710         return;
5711     }
5712
5713     _pixman_implementation_composite (imp->delegate, op,
5714                                       src, mask, dest,
5715                                       src_x, src_y,
5716                                       mask_x, mask_y,
5717                                       dest_x, dest_y,
5718                                       width, height);
5719 }
5720
5721 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5722 __attribute__((__force_align_arg_pointer__))
5723 #endif
5724 static pixman_bool_t
5725 sse2_blt (pixman_implementation_t *imp,
5726           uint32_t *               src_bits,
5727           uint32_t *               dst_bits,
5728           int                      src_stride,
5729           int                      dst_stride,
5730           int                      src_bpp,
5731           int                      dst_bpp,
5732           int                      src_x,
5733           int                      src_y,
5734           int                      dst_x,
5735           int                      dst_y,
5736           int                      width,
5737           int                      height)
5738 {
5739     if (!pixman_blt_sse2 (
5740             src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
5741             src_x, src_y, dst_x, dst_y, width, height))
5742
5743     {
5744         return _pixman_implementation_blt (
5745             imp->delegate,
5746             src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
5747             src_x, src_y, dst_x, dst_y, width, height);
5748     }
5749
5750     return TRUE;
5751 }
5752
5753 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5754 __attribute__((__force_align_arg_pointer__))
5755 #endif
5756 static pixman_bool_t
5757 sse2_fill (pixman_implementation_t *imp,
5758            uint32_t *               bits,
5759            int                      stride,
5760            int                      bpp,
5761            int                      x,
5762            int                      y,
5763            int                      width,
5764            int                      height,
5765            uint32_t xor)
5766 {
5767     if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
5768     {
5769         return _pixman_implementation_fill (
5770             imp->delegate, bits, stride, bpp, x, y, width, height, xor);
5771     }
5772
5773     return TRUE;
5774 }
5775
5776 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5777 __attribute__((__force_align_arg_pointer__))
5778 #endif
5779 pixman_implementation_t *
5780 _pixman_implementation_create_sse2 (void)
5781 {
5782     pixman_implementation_t *mmx = _pixman_implementation_create_mmx ();
5783     pixman_implementation_t *imp = _pixman_implementation_create (mmx);
5784
5785     /* SSE2 constants */
5786     mask_565_r  = create_mask_2x32_128 (0x00f80000, 0x00f80000);
5787     mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
5788     mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
5789     mask_565_b  = create_mask_2x32_128 (0x0000001f, 0x0000001f);
5790     mask_red   = create_mask_2x32_128 (0x00f80000, 0x00f80000);
5791     mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
5792     mask_blue  = create_mask_2x32_128 (0x000000f8, 0x000000f8);
5793     mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
5794     mask_565_fix_g = create_mask_2x32_128  (0x0000c000, 0x0000c000);
5795     mask_0080 = create_mask_16_128 (0x0080);
5796     mask_00ff = create_mask_16_128 (0x00ff);
5797     mask_0101 = create_mask_16_128 (0x0101);
5798     mask_ffff = create_mask_16_128 (0xffff);
5799     mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
5800     mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
5801
5802     /* MMX constants */
5803     mask_x565_rgb = create_mask_2x32_64 (0x000001f0, 0x003f001f);
5804     mask_x565_unpack = create_mask_2x32_64 (0x00000084, 0x04100840);
5805
5806     mask_x0080 = create_mask_16_64 (0x0080);
5807     mask_x00ff = create_mask_16_64 (0x00ff);
5808     mask_x0101 = create_mask_16_64 (0x0101);
5809     mask_x_alpha = create_mask_2x32_64 (0x00ff0000, 0x00000000);
5810
5811     _mm_empty ();
5812
5813     /* Set up function pointers */
5814
5815     /* SSE code patch for fbcompose.c */
5816     imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
5817     imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
5818     imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
5819     imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
5820     imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
5821     imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
5822     imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
5823     imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
5824     imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
5825     imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
5826
5827     imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
5828
5829     imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
5830     imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
5831     imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
5832     imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
5833     imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
5834     imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
5835     imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
5836     imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
5837     imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
5838     imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
5839     imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
5840
5841     imp->composite = sse2_composite;
5842     imp->blt = sse2_blt;
5843     imp->fill = sse2_fill;
5844
5845     return imp;
5846 }
5847
5848 #endif /* USE_SSE2 */